Model validation: A simple example

Types of model validation

  • "R2 is not enough"
  • Looking at the residuals
  • Cross-validation when you can

R2 and graphical methods
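
As a reminder, the textbook definition: for fitted values $\hat{y}$, $R^2 = SS_{reg}/SS_{tot} = 1 - SS_{res}/SS_{tot}$, where $SS_{tot} = \sum_i (y_i - \bar{y})^2$, $SS_{reg} = \sum_i (\hat{y}_i - \bar{y})^2$ and $SS_{res} = \sum_i (y_i - \hat{y}_i)^2$. The code below computes $SS_{reg}$ by projecting the data onto an orthonormal basis (obtained from the SVD) of the demeaned design matrix.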

In [1]:
from __future__ import division, print_function
import numpy as np
import scipy as sp
import scipy.linalg as lin
import matplotlib.pyplot as plt
In [2]:
# functions to compute R2 (and related projections) from Y and X

def projR(to_proj, X_):
    """ 
    projects the 2d array `to_proj` onto the residual space of `_X`
    
    The svd of _X is computed to have a orthonormal basis of the
    space of the columns of _X
    """
    u_, _, _ = lin.svd(X_, full_matrices=False)
    return (to_proj - u_.dot(u_.T.dot(to_proj)))

## try this function with the mean:
# ones_n1 = np.ones((n,1))
# y = np.random.normal(3,1,size=(n,1))
# print("this should be close to 3: ", y.mean())
# print("this should be close to 0: ", projR(y, ones_n1).mean())


def R2(Y, X, rm_intercept=False):
    """
    Compute the coefficient of determination R2 
    Y: numpy array shape (n,1) : the data
    X: numpy array shape (n,p) : the design matrix
    """
    (n,p) = X.shape
    SST = (Y**2).sum()               # total sum of square
    C = (Y.sum())**2 / n             # sum of square of the mean
    ones_n1 = np.ones((n,1))
    Xdemeaned = projR(X, ones_n1)
    # compute the svd to get an orthonormal basis 
    u_, _, _ = lin.svd(Xdemeaned, full_matrices=False)
    
    # compute SSReg noticing that Y^t u u^t Y = \sum (u^t Y)^2
    SSReg = ((u_.T.dot(Y))**2).sum()
    if rm_intercept:
        return SSReg / (SST - C)
    else: 
        return SSReg / SST


def SSR(Y, X):
    "compute the sum of squares of the regression of Y on X"
    (n_, p_) = X.shape
    if n_ < p_:
        raise ValueError("n must be >= p")
    u_, _, _ = lin.svd(X, full_matrices=False)
    return ((u_.T.dot(Y))**2).sum()

def Proj(X):
    """ This function returns the projector on the 
        space of the columns of `X`
    """
    (n_, p_) = X.shape
    if n_ < p_:
        raise ValueError("n must be >= to p")
    u_, _, _ = lin.svd(X, full_matrices=False)
    return u_.dot(u_.T)
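
A quick sanity check of `R2` (a sketch, not part of the original run): for a straight-line fit with an intercept, calling `R2` with only the non-constant column and `rm_intercept=True` should match the textbook value $1 - SS_{res}/SS_{tot}$ from an explicit least-squares fit. The names (`n_chk`, `x_chk`, ...) are introduced only for this illustration.

In [ ]:
# sanity check (illustration only): compare R2 against
# 1 - SS_res/SS_tot from an explicit OLS fit with intercept
n_chk = 20
x_chk = np.arange(n_chk)
X_chk = np.hstack((np.ones((n_chk, 1)), x_chk.reshape((n_chk, 1))))
y_chk = 2. * x_chk + np.random.normal(size=(n_chk,))
bh_chk = lin.pinv(X_chk).dot(y_chk)
ss_res = ((y_chk - X_chk.dot(bh_chk))**2).sum()
ss_tot = ((y_chk - y_chk.mean())**2).sum()
# pass only the x column: R2 demeans the design internally
print(R2(y_chk, x_chk.reshape((n_chk, 1)), rm_intercept=True))
print(1 - ss_res / ss_tot)   # should agree numerically
# Proj(X_chk) applied to y gives the fitted values
print(np.allclose(Proj(X_chk).dot(y_chk), X_chk.dot(bh_chk)))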
In [3]:
# define a true model

def quadratic_model(x, param=[1,-8,16]):
    # default parameters give (x - 4)**2
    return param[0]*x**2 + param[1]*x + param[2]

model = quadratic_model
In [65]:
n = 16 # number of points
noise_level = 1./8
x = np.arange(n)
true_y = model(x)
noiseSD = (true_y.max() - true_y.min())*noise_level
y = true_y + np.random.normal(0, noiseSD, size=(n,)) 
plt.plot(x,y,x,true_y)
Out[65]: [plot: noisy data y and noise-free true_y against x]

Now, we would like to know whether the data are linear or quadratic in x.

In [66]:
# fit a linear model y = XB + E
X = np.hstack((np.ones((n,1)),x.reshape((n,1))))
print(X)
[[  1.   0.]
 [  1.   1.]
 [  1.   2.]
 [  1.   3.]
 [  1.   4.]
 [  1.   5.]
 [  1.   6.]
 [  1.   7.]
 [  1.   8.]
 [  1.   9.]
 [  1.  10.]
 [  1.  11.]
 [  1.  12.]
 [  1.  13.]
 [  1.  14.]
 [  1.  15.]]
In [67]:
# fit the data
beta_h = lin.pinv(X).dot(y)
resid = y - X.dot(beta_h)
plt.plot(resid)
print(R2(y, X))
0.789569189143
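
Even though this R2 looks respectable, the residual plot gives the model away: instead of random scatter, the residuals of the straight-line fit follow a systematic U-shaped trend, which is the quadratic structure the linear model cannot capture.
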
In [68]:
X2 = np.hstack((X, (x*x).reshape((n,1))))
In [69]:
X2
Out[69]:
array([[   1.,    0.,    0.],
       [   1.,    1.,    1.],
       [   1.,    2.,    4.],
       [   1.,    3.,    9.],
       [   1.,    4.,   16.],
       [   1.,    5.,   25.],
       [   1.,    6.,   36.],
       [   1.,    7.,   49.],
       [   1.,    8.,   64.],
       [   1.,    9.,   81.],
       [   1.,   10.,  100.],
       [   1.,   11.,  121.],
       [   1.,   12.,  144.],
       [   1.,   13.,  169.],
       [   1.,   14.,  196.],
       [   1.,   15.,  225.]])
In [70]:
# fit the data
beta_h = lin.pinv(X2).dot(y)
resid = y - X2.dot(beta_h)
plt.plot(resid)
print(R2(y, X2))
0.895987535281
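
This time the residuals look like noise and the R2 is higher. But a higher R2 by itself proves little: R2 never decreases when columns are added to the design matrix, even columns of pure noise. A quick illustration (not part of the original run; `Xjunk` is a hypothetical design):

In [ ]:
# adding random regressors can only push R2 up
Xjunk = np.hstack((X2, np.random.normal(size=(n, 3))))
print(R2(y, X2), R2(y, Xjunk))   # second value at least as large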

Cross-validation

  1. pick a cross-validation scheme (here, leave-one-out)
  2. make test and train datasets
  3. fit the model on the train data
  4. measure how well this model does on the left-out data (squared prediction error)
  5. repeat across (test, train) splits and accumulate the results
In [71]:
def make_test_train(y, lmodels, scheme='leave_one_out'):
    """
    Parameters
    ----------
    y : numpy array
        the data
    lmodels : list of numpy arrays
        each element is a design matrix
    Returns
    -------
    
    """    
    if scheme=='leave_one_out':
        for i, y_test in enumerate(y):
            y_train = np.hstack((y[:i],y[i+1:]))
            m_loo = []
            for mX in lmodels:
                m_loo.append(np.vstack((mX[:i,:],mX[i+1:,:])))
            yield i, y_test, y_train, m_loo
In [72]:
n = len(y)
x = np.arange(n)
X1 = np.hstack((np.ones((n,1)),x.reshape((n,1))))
X2 = np.hstack((X1, (x*x).reshape((n,1))))
models_list = [X1, X2]

# add over-fitting models with 2 to 7 extra pure-noise regressors
for k in range(2,8):
    models_list.append(np.hstack((X2, np.random.normal(size=(n,k)))))
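
To see what the generator defined above yields, we can peek at the first split (illustration only, not part of the original run):

In [ ]:
# first leave-one-out split: observation 0 is held out
i, y_test, y_train, m_loo = next(make_test_train(y, models_list))
print(i, y_test)        # index and value of the held-out point
print(y_train.shape)    # (n-1,)
print(m_loo[0].shape)   # (n-1, 2): X1 without row i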
In [73]:
y_split = make_test_train(y, models_list)

predicted_error = np.zeros(len(models_list),dtype='float')

for i, y_test, y_train, models_train in y_split:
    for midx, md in enumerate(models_train):
        # fit the model on train
        bh = lin.pinv(md).dot(y_train)
        # compute the difference between y_test and predicted
        predicted_error[midx] += (y_test - models_list[midx][i,:].dot(bh))**2

print(predicted_error/n)
plt.plot(predicted_error/n)  
[ 1216.81644631   419.52053563   520.91121718   551.83232086   851.68596693
   800.49557814   843.35431674  1096.09708688]
Out[73]: [plot: mean squared prediction error for each model]
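
The quadratic model (index 1) reaches the lowest estimated prediction error; the under-fitting straight line (index 0) and the over-fitting models padded with noise regressors all do worse, which is exactly the distinction R2 could not make. The `scheme` argument of `make_test_train` suggests other splitting strategies; here is a minimal K-fold sketch (a hypothetical extension, assuming the same kind of yielded values as the leave-one-out branch):

In [ ]:
# K-fold splitting: each fold is held out once (sketch only)
def make_test_train_kfold(y, lmodels, K=4):
    n_ = len(y)
    folds = np.array_split(np.arange(n_), K)
    for test_idx in folds:
        train_idx = np.setdiff1d(np.arange(n_), test_idx)
        m_k = [mX[train_idx, :] for mX in lmodels]
        yield test_idx, y[test_idx], y[train_idx], m_k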
