Model validation: A simple example

Types of model validation

  • "R2 is not enough"
  • Looking at the residuals
  • Cross-validation when you can

R2 and graphical methods
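
As a reminder, the textbook definition: for fitted values $\hat{y}$, $R^2 = SS_{reg}/SS_{tot} = 1 - SS_{res}/SS_{tot}$, where $SS_{tot} = \sum_i (y_i - \bar{y})^2$, $SS_{reg} = \sum_i (\hat{y}_i - \bar{y})^2$ and $SS_{res} = \sum_i (y_i - \hat{y}_i)^2$. The code below computes $SS_{reg}$ by projecting the data onto an orthonormal basis (obtained from the SVD) of the demeaned design matrix.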

In [1]:
from __future__ import division, print_function
import numpy as np
import scipy as sp
import scipy.linalg as lin
import matplotlib.pyplot as plt
In [2]:
# functions to compute R2 (and related projections) from Y and X

def projR(to_proj, X_):
    """ 
    projects the 2d array `to_proj` onto the residual space of `_X`
    
    The svd of _X is computed to have a orthonormal basis of the
    space of the columns of _X
    """
    u_, _, _ = lin.svd(X_, full_matrices=False)
    return (to_proj - u_.dot(u_.T.dot(to_proj)))

## try this function with the mean:
# ones_n1 = np.ones((n,1))
# y = np.random.normal(3,1,size=(n,1))
# print("this should be close to 3: ", y.mean())
# print("this should be close to 0: ", projR(y, ones_n1).mean())


def R2(Y, X, rm_intercept=False):
    """
    Compute the coefficient of determination R2 
    Y: numpy array shape (n,1) : the data
    X: numpy array shape (n,p) : the design matrix
    """
    (n,p) = X.shape
    SST = (Y**2).sum()               # total sum of square
    C = (Y.sum())**2 / n             # sum of square of the mean
    ones_n1 = np.ones((n,1))
    Xdemeaned = projR(X, ones_n1)
    # compute the svd to get an orthonormal basis 
    u_, _, _ = lin.svd(Xdemeaned, full_matrices=False)
    
    # compute SSReg noticing that Y^t u u^t Y = \sum (u^t Y)^2
    SSReg = ((u_.T.dot(Y))**2).sum()
    if rm_intercept:
        return SSReg / (SST - C)
    else: 
        return SSReg / SST


def SSR(Y, X):
    "compute the sum of squares of the regression of Y on X"
    (n_, p_) = X.shape
    if n_ < p_:
        raise ValueError("n must be >= p")
    u_, _, _ = lin.svd(X, full_matrices=False)
    return ((u_.T.dot(Y))**2).sum()

def Proj(X):
    """ This function returns the projector on the 
        space of the columns of `X`
    """
    (n_, p_) = X.shape
    if n_ < p_:
        raise ValueError("n must be >= to p")
    u_, _, _ = lin.svd(X, full_matrices=False)
    return u_.dot(u_.T)
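
A quick sanity check of `R2` (a sketch, not part of the original run): for a straight-line fit with an intercept, calling `R2` with only the non-constant column and `rm_intercept=True` should match the textbook value $1 - SS_{res}/SS_{tot}$ from an explicit least-squares fit. The names (`n_chk`, `x_chk`, ...) are introduced only for this illustration.

In [ ]:
# sanity check (illustration only): compare R2 against
# 1 - SS_res/SS_tot from an explicit OLS fit with intercept
n_chk = 20
x_chk = np.arange(n_chk)
X_chk = np.hstack((np.ones((n_chk, 1)), x_chk.reshape((n_chk, 1))))
y_chk = 2. * x_chk + np.random.normal(size=(n_chk,))
bh_chk = lin.pinv(X_chk).dot(y_chk)
ss_res = ((y_chk - X_chk.dot(bh_chk))**2).sum()
ss_tot = ((y_chk - y_chk.mean())**2).sum()
# pass only the x column: R2 demeans the design internally
print(R2(y_chk, x_chk.reshape((n_chk, 1)), rm_intercept=True))
print(1 - ss_res / ss_tot)   # should agree numerically
# Proj(X_chk) applied to y gives the fitted values
print(np.allclose(Proj(X_chk).dot(y_chk), X_chk.dot(bh_chk)))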
In [3]:
# define a true model

def quadratic_model(x, param=[1,-8,16]):
    # default parameters give (x - 4)**2
    return param[0]*x**2 + param[1]*x + param[2]

model = quadratic_model
In [65]:
n = 16 # number of points
noise_level = 1./8
x = np.arange(n)
true_y = model(x)
noiseSD = (true_y.max() - true_y.min())*noise_level
y = true_y + np.random.normal(0, noiseSD, size=(n,)) 
plt.plot(x,y,x,true_y)
Out[65]: [plot: noisy data y and noise-free true_y against x]

Now, we would like to know whether the data are linear or quadratic in x.

In [66]:
# fit a linear model y = XB + E
X = np.hstack((np.ones((n,1)),x.reshape((n,1))))
print(X)
[[  1.   0.]
 [  1.   1.]
 [  1.   2.]
 [  1.   3.]
 [  1.   4.]
 [  1.   5.]
 [  1.   6.]
 [  1.   7.]
 [  1.   8.]
 [  1.   9.]
 [  1.  10.]
 [  1.  11.]
 [  1.  12.]
 [  1.  13.]
 [  1.  14.]
 [  1.  15.]]
In [67]:
# fit the data
beta_h = lin.pinv(X).dot(y)
resid = y - X.dot(beta_h)
plt.plot(resid)
print(R2(y, X))
0.789569189143
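
Even though this R2 looks respectable, the residual plot gives the model away: instead of random scatter, the residuals of the straight-line fit follow a systematic U-shaped trend, which is the quadratic structure the linear model cannot capture.
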
In [68]:
X2 = np.hstack((X, (x*x).reshape((n,1))))
In [69]:
X2
Out[69]:
array([[   1.,    0.,    0.],
       [   1.,    1.,    1.],
       [   1.,    2.,    4.],
       [   1.,    3.,    9.],
       [   1.,    4.,   16.],
       [   1.,    5.,   25.],
       [   1.,    6.,   36.],
       [   1.,    7.,   49.],
       [   1.,    8.,   64.],
       [   1.,    9.,   81.],
       [   1.,   10.,  100.],
       [   1.,   11.,  121.],
       [   1.,   12.,  144.],
       [   1.,   13.,  169.],
       [   1.,   14.,  196.],
       [   1.,   15.,  225.]])
In [70]:
# fit the data
beta_h = lin.pinv(X2).dot(y)
resid = y - X2.dot(beta_h)
plt.plot(resid)
print(R2(y, X2))
0.895987535281
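
This time the residuals look like noise and the R2 is higher. But a higher R2 by itself proves little: R2 never decreases when columns are added to the design matrix, even columns of pure noise. A quick illustration (not part of the original run; `Xjunk` is a hypothetical design):

In [ ]:
# adding random regressors can only push R2 up
Xjunk = np.hstack((X2, np.random.normal(size=(n, 3))))
print(R2(y, X2), R2(y, Xjunk))   # second value at least as large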

Cross-validation

  1. pick a cross-validation scheme (here, leave-one-out)
  2. make test and train datasets
  3. fit the model on the train data
  4. measure how well this model does on the left-out data (squared prediction error)
  5. repeat across (test, train) splits and accumulate the results
In [71]:
def make_test_train(y, lmodels, scheme='leave_one_out'):
    """
    Parameters
    ----------
    y : numpy array
        the data
    lmodels : list of numpy arrays
        each element is a design matrix
    Returns
    -------
    
    """    
    if scheme=='leave_one_out':
        for i, y_test in enumerate(y):
            y_train = np.hstack((y[:i],y[i+1:]))
            m_loo = []
            for mX in lmodels:
                m_loo.append(np.vstack((mX[:i,:],mX[i+1:,:])))
            yield i, y_test, y_train, m_loo
In [72]:
n = len(y)
x = np.arange(n)
X1 = np.hstack((np.ones((n,1)),x.reshape((n,1))))
X2 = np.hstack((X1, (x*x).reshape((n,1))))
models_list = [X1, X2]

# add over-fitting models with 2 to 7 extra pure-noise regressors
for k in range(2,8):
    models_list.append(np.hstack((X2, np.random.normal(size=(n,k)))))
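
To see what the generator defined above yields, we can peek at the first split (illustration only, not part of the original run):

In [ ]:
# first leave-one-out split: observation 0 is held out
i, y_test, y_train, m_loo = next(make_test_train(y, models_list))
print(i, y_test)        # index and value of the held-out point
print(y_train.shape)    # (n-1,)
print(m_loo[0].shape)   # (n-1, 2): X1 without row i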
In [73]:
y_split = make_test_train(y, models_list)

predicted_error = np.zeros(len(models_list),dtype='float')

for i, y_test, y_train, models_train in y_split:
    for midx, md in enumerate(models_train):
        # fit the model on train
        bh = lin.pinv(md).dot(y_train)
        # compute the difference between y_test and predicted
        predicted_error[midx] += (y_test - models_list[midx][i,:].dot(bh))**2

print(predicted_error/n)
plt.plot(predicted_error/n)  
[ 1216.81644631   419.52053563   520.91121718   551.83232086   851.68596693
   800.49557814   843.35431674  1096.09708688]
Out[73]: [plot: mean squared prediction error for each model]
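
The quadratic model (index 1) reaches the lowest estimated prediction error; the under-fitting straight line (index 0) and the over-fitting models padded with noise regressors all do worse, which is exactly the distinction R2 could not make. The `scheme` argument of `make_test_train` suggests other splitting strategies; here is a minimal K-fold sketch (a hypothetical extension, assuming the same kind of yielded values as the leave-one-out branch):

In [ ]:
# K-fold splitting: each fold is held out once (sketch only)
def make_test_train_kfold(y, lmodels, K=4):
    n_ = len(y)
    folds = np.array_split(np.arange(n_), K)
    for test_idx in folds:
        train_idx = np.setdiff1d(np.arange(n_), test_idx)
        m_k = [mX[train_idx, :] for mX in lmodels]
        yield test_idx, y[test_idx], y[train_idx], m_k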
