"""This module contains simple functions for linear regressions."""
import numpy as np
import transform
def sum_of_squared_residuals(x, y, beta):
"""
Calculate the sum of squared residuals for observed :math:`y` values,
given the predictor matrix :math:`x` and the provided parameters :math:`\\beta`.
Parameters
----------
x : np.array
Corresponds to a matrix :math:`x` which contains all :math:`x_i` values
(including :math:`x_0=1`).
y : np.array
Corresponds to vector :math:`y` which refers to the observed values.
beta : np.array
Corresponds to vector :math:`\\beta` which contains the parameters to
calculate the predicted values.
Returns
-------
ssr : numpy.array
The sum of squared residuals, returned as a single-element array.
References
----------
.. [1] https://en.wikipedia.org/wiki/Residual_sum_of_squares
Notes
-----
.. math:: SSR(\\beta) = \\displaystyle\\sum_{i=1}^{n}(x_i\\beta-y_i)^2
.. math:: SSR(\\beta) = (x\\beta-y)^T (x\\beta-y)
Examples
--------
>>> # The predictor matrix
>>> x = np.array([[1, 11, 104],
...               [1, 15, 99],
...               [1, 22, 89],
...               [1, 27, 88]])
>>> # The observed values vector
>>> y = np.array([[12],
...               [15],
...               [19],
...               [22]])
>>> # The parameters vector
>>> beta_zero = np.zeros((3,1))
>>> print(sum_of_squared_residuals(x, y, beta_zero))
[ 1214.]
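With ``beta_zero`` every prediction is zero, so the result is simply
:math:`12^2 + 15^2 + 19^2 + 22^2 = 1214`.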
"""
diff_score = np.dot(x, beta) - y
ssr = np.dot(diff_score.transpose(), diff_score)
return ssr.ravel()
def gradient_descent(x, y, beta_init=None, gamma=0.01, max_iter=200,
threshold=0.01, scaling=True, regularize=False):
"""
Numerically estimates the unknown parameters :math:`\\beta_i` for a linear
regression model, where :math:`x` refers to the predictor matrix and
:math:`y` to the observed values vector.
The first derivative of the sum of the squared residuals is used to
calculate the gradient for each parameter. In every iteration, the parameters
are moved in the direction of the negative gradient, scaled by the step size
:math:`\\gamma`. This is repeated until the maximum number of iterations is
reached or the difference in the sum of squared residuals between two
consecutive iterations falls below a given threshold.
Parameters
----------
x : np.array
Corresponds to a matrix :math:`x` which contains all :math:`x_i` values
(including :math:`x_0=1`).
y : np.array
Corresponds to vector :math:`y` which refers to the observed values.
beta_init : np.array, optional
Initial :math:`\\beta_i` values may be provided. Otherwise they are set
to zero.
gamma : float, optional
The step size :math:`\\gamma` of the gradient descent. Determines how
much parameters change per iteration.
max_iter : int, optional
Sets the maximum number of iterations.
threshold : float, optional
Defines the convergence threshold. If the difference in the sum of
squared residuals between two consecutive iterations falls below this
value, the gradient descent has converged and the function stops.
scaling : boolean, optional
By default, the predictors are z-transformed. This improves the gradient
descent performance because all predictors are then on the same scale.
regularize : float, optional
When a non-zero value is passed, it is used as the regularization
strength :math:`\\lambda` in the estimation of :math:`\\beta`.
Regularization can prevent overfitting when :math:`x` contains a
large number of higher order predictors. Increasing :math:`\\lambda`
shrinks the :math:`\\beta_i` values, which makes the fitted function
smoother.
Returns
-------
beta : numpy.array
The estimated parameters after the final iteration. If ``scaling`` is
enabled, the values refer to the standardized predictors.
Notes
-----
.. math:: f(\\beta)=\\frac{1}{2n}\\
\\displaystyle\\sum_{i=1}^{n}(x_i\\beta-y_i)^2 = \\
\\frac{1}{2n}(x\\beta-y)^T (x\\beta-y)=\\frac{1}{2n}SSR(\\beta)
.. math:: \\text{Update step: } \\beta \\leftarrow \\beta-\\
\\frac{\\gamma}{n}\\displaystyle\\sum_{i=1}^{n}\\
(x_i\\beta-y_i) x_i = \\beta-\\frac{\\gamma}{n}x^T(x\\beta-y)
.. math:: \\text{Regularized update step: } \\beta \\leftarrow \\
\\beta(1-\\gamma \\frac{\\lambda}{n} \\
\\beta_{reg})-\\frac{\\gamma}{n}x^T(x\\beta-y) \\text{ where } \\
\\beta_{reg} = \\begin{bmatrix} 0 \\\\ 1 \\\\ \\vdots \\\\ 1_m \\
\\end{bmatrix}
References
----------
.. [4] https://en.wikipedia.org/wiki/Gradient_descent
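Examples
--------
A minimal usage sketch (the data are arbitrary and only the shape of the
result is asserted, since the exact estimates depend on ``gamma``,
``max_iter`` and ``threshold``):

>>> x = np.array([[1, 11],
...               [1, 15],
...               [1, 22],
...               [1, 27]])
>>> y = np.array([[12],
...               [15],
...               [19],
...               [22]])
>>> beta = gradient_descent(x, y)
>>> beta.shape
(2, 1)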
"""
n, m = x.shape
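# With scaling enabled, z-transform the predictors and re-append the intercept column of ones.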
if scaling:
x = transform.standardize(x[:,1:])
x = np.append(np.ones((x.shape[0], 1)), x, axis=1)
if beta_init is None:
beta_init = np.zeros((m, 1))
reg_term = np.ones((m,1))
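# When a lambda value is given, each update multiplicatively shrinks every parameter except the intercept.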
if regularize:
beta_reg = np.ones((m,1))
beta_reg[0,0] = 0
reg_term = 1-(gamma*(regularize/float(n)) * beta_reg)
beta = beta_init
current_cost = None
n_iter = 0
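# Repeat gradient steps until the change in SSR drops below the threshold or max_iter is reached.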
while True:
right_term = (gamma / float(n))
right_term = right_term * np.dot(np.transpose(x), np.dot(x, beta) - y)
left_term = beta * reg_term
beta = left_term - right_term
cost = sum_of_squared_residuals(x, y, beta)
if current_cost is None:
current_cost = cost
elif np.abs(cost - current_cost) < threshold:
break
else:
current_cost = cost
if n_iter == max_iter:
break
else:
n_iter += 1
return beta
def ordinary_least_squares(x, y, regularize=False):
"""
Analytically calculate the unknown parameters :math:`\\beta` for a linear
regression model, where :math:`x` refers to the predictor matrix and
:math:`y` to the observed values vector.
Parameters
----------
x : np.array
Corresponds to a matrix :math:`x` which contains all :math:`x_i` values
(including :math:`x_0=1`).
y : np.array
Corresponds to vector :math:`y` which refers to the observed values.
regularize : float, optional
When a non-zero value is passed, it is used as the regularization
strength :math:`\\lambda` in the estimation of :math:`\\beta`.
Regularization can prevent overfitting when :math:`x` contains a
large number of higher order predictors. Increasing :math:`\\lambda`
shrinks the :math:`\\beta_i` values, which makes the fitted function
smoother.
Returns
-------
beta : numpy.array
The analytically estimated parameter vector.
Notes
-----
.. math:: \\hat{\\beta} = (x^Tx)^{-1}x^Ty
.. math:: \\text{Regularization with m predictors: } \\hat{\\beta} = \\
(x^Tx + \\lambda \\
\\begin{bmatrix} 0 & 0 & 0 & 0 \\\\ 0 & 1 & 0 & 0 \\\\\\
0 & 0 & \\ddots & \\vdots \\\\ 0 & 0 & \\cdots & 1_{m+1} \\
\\end{bmatrix})\\
^{-1}x^Ty
References
----------
.. [2] https://en.wikipedia.org/wiki/Ordinary_least_squares
.. [3] https://en.wikipedia.org/wiki/Regularization_%28mathematics%29
Examples
--------
>>> # The predictor matrix
>>> x = np.array([[1, 11, 104],
...               [1, 15, 99],
...               [1, 22, 89],
...               [1, 27, 88]])
>>> # The observed values vector
>>> y = np.array([[12],
...               [15],
...               [19],
...               [22]])
>>> # The parameters vector
>>> beta_zero = np.zeros((3,1))
>>> print(sum_of_squared_residuals(x, y, beta_zero))
[ 1214.]
>>> beta_min = ordinary_least_squares(x, y)
>>> print(sum_of_squared_residuals(x, y, beta_min))
[ 0.14455509]
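A regularized fit is requested by passing a non-zero ``regularize`` value,
which is used as the lambda term; this is only a usage sketch, so the exact
estimates are not asserted:

>>> beta_reg = ordinary_least_squares(x, y, regularize=10)
>>> beta_reg.shape
(3, 1)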
"""
x_t = x.transpose()
x_sum = np.dot(x_t, x)
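# Regularization adds lambda to the diagonal of x^T x for every entry except the intercept.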
if regularize:
lambda_identity = np.identity(x.shape[1])
lambda_identity[0,0] = 0
lambda_regularization = regularize * lambda_identity
inverse = np.linalg.pinv(x_sum + lambda_regularization)
else:
inverse = np.linalg.pinv(x_sum)
beta = np.dot(inverse, np.dot(x_t, y))
return beta
if __name__ == "__main__":
"""
x = np.array([[1, 11, 104],
[1, 15, 99],
[1, 22, 89],
[1, 27, 88]])
x2 = np.array([[1, 11],
[1, 15],
[1, 22],
[1, 27]])
y = np.array([[12],
[15],
[19],
[22]])
beta_zero = np.zeros((2,1))
print(sum_of_squared_residuals(x2, y, beta_zero))
beta_min = ordinary_least_squares(x2, y)
print(sum_of_squared_residuals(x2, y, beta_min))
beta_grad = gradient_descent(x2, y, scaling=False, gamma=0.001)
print(sum_of_squared_residuals(x2, y, beta_grad))
"""
from sklearn import datasets, linear_model
diabetes = datasets.load_diabetes()
# Build the design matrix with an intercept column and a column vector of targets.
x = np.append(np.ones((diabetes.data.shape[0], 1)), diabetes.data, axis=1)
y = np.reshape(diabetes.target, (-1, 1))
# Fit a reference model with scikit-learn and evaluate its residuals
# (intercept and coefficients combined into one parameter vector).
regr = linear_model.LinearRegression()
regr.fit(diabetes.data, diabetes.target)
beta_sklearn = np.reshape(np.append(regr.intercept_, regr.coef_), (-1, 1))
print(sum_of_squared_residuals(x, y, beta_sklearn))
# Analytical estimate via the normal equation.
ols = ordinary_least_squares(x, y)
print(sum_of_squared_residuals(x, y, ols))
# Numerical estimate; with scaling enabled the returned parameters refer to the
# standardized predictors, so the residuals are evaluated on the same scale.
gd = gradient_descent(x, y)
x_scaled = np.append(np.ones((x.shape[0], 1)), transform.standardize(x[:, 1:]), axis=1)
print(sum_of_squared_residuals(x_scaled, y, gd))