SL3: Analysis of Prostate Cancer dataset – variable subset selection
# data analysis and wrangling
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import random as rnd
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline
# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import itertools # itertools functions create iterators for efficient looping
# itertools.combinations(p,r) creates r-length tuples, in sorted order, no repeated elements
# ex. combinations('ABCD', 2) = [AB AC AD BC BD CD]
#from IPython.display import Image # to visualize images
#from tabulate import tabulate # to create tables
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
/kaggle/input/prostate-data/tab4.png
/kaggle/input/prostate-data/tab4-no-cap.png
/kaggle/input/prostate-data/tab3.png
/kaggle/input/prostate-data/prostate.data
/kaggle/input/prostate-data/tab.png
/kaggle/input/prostate-data/tab2.png
1. Open your kernel SL_EX2_ProstateCancer_Surname in Kaggle
2. Generate a copy called SL_EX3_SubsetSelection_Surname using the Fork button
Data acquisition #
# Load the Prostate Cancer dataset
data = pd.read_csv('../input/prostate-data/prostate.data',sep='\t')
# Save "train" and lpsa" columns into Pandas Series variables
train = data['train']
lpsa = data['lpsa']
# Drop "train" and lpsa" variable from data
data = data.drop(columns=['Unnamed: 0','lpsa','train'],axis=1)
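A quick sanity check (a small addition, not part of the original assignment) shows what is left after the drop: the candidate predictors in data, and the train flags that will drive the split below.
# Optional check of the loaded objects
print(data.shape)                # (samples, candidate predictors)
print(data.columns.tolist())     # the 8 candidate predictors
print(train.value_counts())      # 'T' = training rows, 'F' = test rows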
Data pre-processing #
## X VARIABLE:
# Split the data into train and test sets, using the original "train" flag
dataTrain = data.loc[train == 'T']  # rows flagged as training samples
dataTest = data.loc[train == 'F']   # rows flagged as test samples
# Rename these two variables as "predictorsTrain" and "predictorsTest"
predictorsTrain = dataTrain
predictorsTest = dataTest
## Y VARIABLE:
# Split the "lpsa" in train and test sets
lpsaTrain = lpsa.loc[train == 'T']
lpsaTest = lpsa.loc[train == 'F']
Standardization
\[\dfrac{\text{predictors} - \text{predictorsMeans}}{\text{predictorsStd}}\]
# Standardize "predictorsTrain"
predictorsTrainMeans = predictorsTrain.mean()
predictorsTrainStds = predictorsTrain.std()
predictorsTrain_std = (predictorsTrain - predictorsTrainMeans)/predictorsTrainStds # standardized predictorsTrain
# Standardize "predictorsTest" using the mean and std of the TRAINING set, so that no information leaks from the test set
predictorsTest_std = (predictorsTest - predictorsTrainMeans)/predictorsTrainStds # standardized predictorsTest
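As a quick check (an addition to the original notebook), the standardized training predictors should now have column-wise mean ≈ 0 and standard deviation ≈ 1; the test set is only approximately standardized, since it reuses the training statistics.
# Sanity check: training columns should have mean ~0 and std ~1 after standardization
print(predictorsTrain_std.mean().round(3))
print(predictorsTrain_std.std().round(3))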
Split into Training and Test sets
## TRAINING SET
X_train = predictorsTrain_std
Y_train = lpsaTrain
## TEST SET
X_test = predictorsTest_std
Y_test = lpsaTest
Useful functions for the analysis of the Prostate Cancer dataset #
For Linear Regression
# Useful functions to compute RSS, R_squared and Z-scores.
def LinReg(X_train, Y_train, X_test, Y_test):
    # Create the linear model, fitting also the intercept (non-zero)
    model = LinearRegression(fit_intercept = True)
    # Train the model on the training set
    model.fit(X_train, Y_train)
    # Stats on the training set
    Y_train_pred = model.predict(X_train)
    RSS_train = mean_squared_error(Y_train, Y_train_pred) * len(Y_train)
    R2_train = model.score(X_train, Y_train)
    # Stats on the test set
    Y_test_pred = model.predict(X_test)
    RSS_test = mean_squared_error(Y_test, Y_test_pred) * len(Y_test)
    R2_test = model.score(X_test, Y_test)
    return RSS_train, RSS_test, R2_train

def Zscore(X_train, Y_train):
    # Fit the model with statsmodels, which exposes the t-statistics directly
    model = sm.OLS(Y_train, sm.add_constant(X_train)).fit()
    Zscores = model.tvalues[1:]  # we don't want the constant
    min_Zscore = min(abs(Zscores))
    idx_min_Zscore = abs(Zscores).idxmin()  # closest to zero, i.e. the least significant variable
    return Zscores, min_Zscore, idx_min_Zscore
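As a quick usage example (an addition, assuming the X_train, Y_train, X_test, Y_test defined above), the helpers can be checked on the full model with all eight standardized predictors: LinReg returns the training RSS, the test RSS and the training \(R^2\), while Zscore reports which coefficient is the least significant.
# Quick check of the helper functions on the full model (all 8 standardized predictors)
RSS_tr_full, RSS_te_full, R2_tr_full = LinReg(X_train, Y_train, X_test, Y_test)
Zs_full, minZ_full, least_signif = Zscore(X_train, Y_train)
print(f"Full model: RSS train = {RSS_tr_full:.3f}, RSS test = {RSS_te_full:.3f}, R^2 train = {R2_tr_full:.3f}")
print(f"Least significant predictor by |Z-score|: {least_signif}")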
To print some info
## Print information about one iteration of forward subset selection with subset size ncomb
# ncomb = number of the iteration
# features = remaining_features, selected_features (used to run the iteration and keep track of the selection)
# params = all_RSS, all_R_squared, all_combs
# results = best_feature, best_RSS (results of the selection)
# detailed = parameter to print more detail about each single combination
def get_info_forwardS(ncomb,features,params,results,detailed):
    sepComb = "==="*30
    sepIter = "---"*30
    remaining, selected = features
    bestFeat, bestRSS = results
    # NB: the lines continued with a backslash are inside the f-strings,
    # so they are left unindented to keep the printed output flush left
    print(f"{sepComb}\nIter n.{ncomb}:\n\
Choose {ncomb}-length combinations of the remaining variables\n\n\
Remaining features: {remaining}\n\
Features selected: {selected}")
    if detailed == 1:
        RSS, R_squared, Combs = params
        for niter in range(0,len(Combs)):
            var0 = Combs[niter]
            var1 = RSS[niter]
            var2 = R_squared[niter]
            print(f"\nComb n.{niter+1}: {var0}\n\
{sepIter}\n\
RSS test: {var1}\n\
R_squared: {var2}\
")
    print(f"\nSelected variables: {bestFeat}\n\
min RSS: {bestRSS}\n\
{sepComb}\n")
    return
## Print information about one iteration of backward subset selection with subset size ncomb
# ncomb = number of the iteration
# features = [remaining_features,dropped_features_list] (used to run the iteration and keep track of the selection)
# params = [all_RSS,all_R_squared,all_combs]
# results = [dropped_feature,best_RSS]
# detailed = parameter to print more detail about each single combination
def get_info_backwardS(ncomb,features,params,results,detailed):
    sepComb = "==="*30
    sepIter = "---"*30
    remaining, dropped = features
    droppedFeat, bestRSS = results
    print(f"{sepComb}\nIter n.{8 - ncomb}:\n\n\
At the beginning we have:\n\
Remaining features: {remaining}\n\
Dropped features: {dropped}\n\n\
Now we compare the model selecting {ncomb} variables")
    if detailed == 1:
        RSS, R_squared, Combs = params
        for niter in range(0,len(Combs)):
            var0 = Combs[niter]
            var1 = RSS[niter]
            var2 = R_squared[niter]
            print(f"\n\nComb n.{niter+1}: {var0}\n\
{sepIter}\n\
candidate dropped feature: {list(set(remaining)-set(var0))}\n\
RSS test: {var1}\
")
    print(f"\n\nAt the end we have:\n\
min RSS: {bestRSS}\n\
We drop: {droppedFeat}\n\
{sepComb}\n")
    return
## Print information about one iteration of backward subset selection by Z-score with subset size ncomb
# ncomb = number of the iteration
# features = [remaining_features,dropped_features_list] (used to run the iteration and keep track of the selection)
# params = [all_RSS,all_R_squared,all_combs]
# results = [dropped_feature,best_RSS]
# detailed = parameter to print more detail about each single combination (unused here)
def get_info_backwardS_Zscore(ncomb,features,params,results,detailed):
    sepComb = "==="*30
    sepIter = "---"*30
    remaining, dropped = features
    droppedFeat, bestRSS = results
    print(f"{sepComb}\nIter n.{8 - ncomb}:\n\n\
At the beginning we have:\n\
Remaining features: {remaining}\n\
Dropped features: {dropped}\n")
    # Zscores is read from the notebook scope: it is computed by the Zscore helper in the calling cell
    print("\nThe Z-scores are:\n",Zscores)
    print(f"\n\nAt the end we have:\n\
min RSS: {bestRSS}\n\
We drop: {droppedFeat}\n\
{sepComb}\n")
    return
Best Subset Selection #
3. Starting from the `ols` models built in the previous steps, perform best-subset selection.
- Generate one model for each combination of the 8 variables available
- For each model compute the RSS on training and test set, the number of variables and the \(R^2\) of the model
- Save these numbers in suitable data structures
## range
variables = data.columns.tolist() # excluding 'const'

## Initialize the list where we temporarily store data
RSS_train_list, RSS_test_list, R_squared_list = [], [], []
numb_features, features_list = [], []

for k in range(1,len(variables) + 1):
    # niter = 0
    # print("---"*30,f"\nStart by choosing {k} variables\n")
    # Looping over all possible combinations of k variables
    for combo in itertools.combinations(variables,k):
        # niter = niter+1
        # Compute all the statistics we need
        RSS_train, RSS_test, Rsquared_train = LinReg(X_train[list(combo)], Y_train, X_test[list(combo)], Y_test)
        # rnd = 4
        # print(f"{niter}. Variables: {list(combo)}\n\
        #   RSS train: {RSS_train.round(rnd)}\n\
        #   RSS test: {RSS_test.round(rnd)}\n\
        #   R^2 train: {Rsquared_train.round(rnd)}\n")
        # Save the statistics
        RSS_train_list.append(RSS_train)
        RSS_test_list.append(RSS_test)
        R_squared_list.append(Rsquared_train)
        # Save features and number of features
        features_list.append(combo)
        numb_features.append(len(combo))
    # print(f"\nUsing {k} variables we have computed {niter} models")
    # print("---"*30,"\n")

#Store in DataFrame
df_BestS = pd.DataFrame({'numb_features': numb_features,\
                         'RSS_train': RSS_train_list,\
                         'RSS_test': RSS_test_list,\
                         'R_squared': R_squared_list,\
                         'features': features_list})
df_BestS
|   | numb_features | RSS_train | RSS_test | R_squared | features |
---|---|---|---|---|---|
0 | 1 | 44.528583 | 14.392162 | 0.537516 | (lcavol,) |
1 | 1 | 73.613540 | 30.402846 | 0.235434 | (lweight,) |
2 | 1 | 91.292039 | 33.846748 | 0.051821 | (age,) |
3 | 1 | 89.624912 | 35.298771 | 0.069136 | (lbph,) |
4 | 1 | 66.422403 | 20.632078 | 0.310122 | (svi,) |
... | ... | ... | ... | ... | ... |
250 | 7 | 31.570706 | 14.702112 | 0.672100 | (lcavol, lweight, age, svi, lcp, gleason, pgg45) |
251 | 7 | 30.414990 | 17.034552 | 0.684103 | (lcavol, lweight, lbph, svi, lcp, gleason, pgg45) |
252 | 7 | 33.265433 | 16.754443 | 0.654498 | (lcavol, age, lbph, svi, lcp, gleason, pgg45) |
253 | 7 | 44.036622 | 22.633329 | 0.542626 | (lweight, age, lbph, svi, lcp, gleason, pgg45) |
254 | 8 | 29.426384 | 15.638220 | 0.694371 | (lcavol, lweight, age, lbph, svi, lcp, gleason... |
255 rows × 5 columns
Find the best subset (of variables) for each number of features
Our dataframe df_BestS now has one row for each of the \(2^8 - 1 = 255\) fitted models (every non-empty combination of the 8 predictors), which is unwieldy. We therefore extract, for each number of variables, the best model according to a chosen criterion.
We consider three criteria:
- minimum RSS on the training set
- minimum RSS on the test set
- maximum \(R^2\)
# Create new dataframes, keeping for each subset size only the best subset of variables (based on RSS or R^2):
# groupby(...).transform(...) broadcasts the per-group optimum, so we can filter with a boolean mask
df_BestS_RSS_train= df_BestS[df_BestS.groupby('numb_features')['RSS_train'].transform(min) == df_BestS['RSS_train']]
df_BestS_RSS_test = df_BestS[df_BestS.groupby('numb_features')['RSS_test'].transform(min) == df_BestS['RSS_test']]
df_BestS_R_squared = df_BestS[df_BestS.groupby('numb_features')['R_squared'].transform(max) == df_BestS['R_squared']]
df_BestS_RSS_train
|   | numb_features | RSS_train | RSS_test | R_squared | features |
---|---|---|---|---|---|
0 | 1 | 44.528583 | 14.392162 | 0.537516 | (lcavol,) |
8 | 2 | 37.091846 | 14.774470 | 0.614756 | (lcavol, lweight) |
38 | 3 | 34.907749 | 12.015924 | 0.637441 | (lcavol, lweight, svi) |
97 | 4 | 32.814995 | 13.689964 | 0.659176 | (lcavol, lweight, lbph, svi) |
174 | 5 | 32.069447 | 14.577726 | 0.666920 | (lcavol, lweight, lbph, svi, pgg45) |
229 | 6 | 30.539778 | 16.457800 | 0.682807 | (lcavol, lweight, lbph, svi, lcp, pgg45) |
247 | 7 | 29.437300 | 15.495405 | 0.694258 | (lcavol, lweight, age, lbph, svi, lcp, pgg45) |
254 | 8 | 29.426384 | 15.638220 | 0.694371 | (lcavol, lweight, age, lbph, svi, lcp, gleason... |
df_BestS_RSS_test
|   | numb_features | RSS_train | RSS_test | R_squared | features |
---|---|---|---|---|---|
0 | 1 | 44.528583 | 14.392162 | 0.537516 | (lcavol,) |
11 | 2 | 42.312584 | 11.583584 | 0.560532 | (lcavol, svi) |
52 | 3 | 42.267034 | 11.484038 | 0.561005 | (lcavol, svi, gleason) |
112 | 4 | 42.223638 | 11.612573 | 0.561456 | (lcavol, age, svi, gleason) |
167 | 5 | 34.170209 | 11.497692 | 0.645101 | (lcavol, lweight, age, svi, gleason) |
226 | 6 | 33.642783 | 12.009380 | 0.650579 | (lcavol, lweight, age, svi, gleason, pgg45) |
246 | 7 | 30.958630 | 13.492898 | 0.678457 | (lcavol, lweight, age, lbph, svi, lcp, gleason) |
254 | 8 | 29.426384 | 15.638220 | 0.694371 | (lcavol, lweight, age, lbph, svi, lcp, gleason... |
# Same rows as df_BestS_RSS_train: maximizing the training R^2 is equivalent to minimizing the training RSS,
# since R^2 = 1 - RSS/TSS and TSS is fixed on the training set
df_BestS_R_squared
|   | numb_features | RSS_train | RSS_test | R_squared | features |
---|---|---|---|---|---|
0 | 1 | 44.528583 | 14.392162 | 0.537516 | (lcavol,) |
8 | 2 | 37.091846 | 14.774470 | 0.614756 | (lcavol, lweight) |
38 | 3 | 34.907749 | 12.015924 | 0.637441 | (lcavol, lweight, svi) |
97 | 4 | 32.814995 | 13.689964 | 0.659176 | (lcavol, lweight, lbph, svi) |
174 | 5 | 32.069447 | 14.577726 | 0.666920 | (lcavol, lweight, lbph, svi, pgg45) |
229 | 6 | 30.539778 | 16.457800 | 0.682807 | (lcavol, lweight, lbph, svi, lcp, pgg45) |
247 | 7 | 29.437300 | 15.495405 | 0.694258 | (lcavol, lweight, age, lbph, svi, lcp, pgg45) |
254 | 8 | 29.426384 | 15.638220 | 0.694371 | (lcavol, lweight, age, lbph, svi, lcp, gleason... |
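As an aside (an equivalent formulation, not used in the rest of the notebook), the same per-size filtering can also be written with groupby().idxmin() / idxmax(), which returns directly the row index of the optimum within each group:
# Equivalent way to pick the best subset for each subset size
best_by_RSS_train = df_BestS.loc[df_BestS.groupby('numb_features')['RSS_train'].idxmin()]
best_by_RSS_test  = df_BestS.loc[df_BestS.groupby('numb_features')['RSS_test'].idxmin()]
best_by_R2        = df_BestS.loc[df_BestS.groupby('numb_features')['R_squared'].idxmax()]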
Plots for Best Selection #
4. + 5. + 6. Generate some charts:
- x-axis: the subset size
- y-axis:
- RSS for training set of all the models generated at step 3
- \(R^2\) of all the models generated at step 3
- RSS for test set of all the models generated at step 3
# Initialize the figure
width = 6
height = 6
nfig = 3
fig = plt.figure(figsize = (width*nfig,height))
# 1. RSS Training set plot
tmp_df1 = df_BestS; # scatter plot
tmp_df2 = df_BestS_RSS_train; # plot the line of best values
ax1 = fig.add_subplot(1, nfig, 1)
ax1.scatter(tmp_df1.numb_features,tmp_df1.RSS_train, alpha = .2, color = 'darkblue');
ax1.set_xlabel('Subset Size k',fontsize=14);
ax1.set_ylabel('RSS',fontsize=14);
ax1.set_title('RSS on training set',fontsize=18);
ax1.plot(tmp_df2.numb_features,tmp_df2.RSS_train,color = 'r', label = 'Best subset'); # line of best values
ax1.grid(color='grey', linestyle='-', linewidth=0.5);
ax1.legend();
# 2. RSS Test set plot
tmp_df1 = df_BestS; # scatter plot
tmp_df2 = df_BestS_RSS_test; # plot the line of best values
ax2 = fig.add_subplot(1, nfig, 2);
ax2.scatter(tmp_df1.numb_features,tmp_df1.RSS_test, alpha = .2, color = 'darkblue');
ax2.set_xlabel('Subset Size k',fontsize=14);
ax2.set_ylabel('RSS',fontsize=14);
ax2.set_title('RSS on test set',fontsize=18);
ax2.plot(tmp_df2.numb_features,tmp_df2.RSS_test,color = 'r', label = 'Best subset'); # line of best values
ax2.grid(color='grey', linestyle='-', linewidth=0.5);
ax2.legend();
# 3. R^2 plot
tmp_df1 = df_BestS; # scatter plot
tmp_df2 = df_BestS_R_squared; # plot the line of best values
ax3 = fig.add_subplot(1, nfig, 3);
ax3.scatter(tmp_df1.numb_features,tmp_df1.R_squared, alpha = .2, color = 'darkblue');
ax3.set_xlabel('Subset Size k',fontsize=14);
ax3.set_ylabel('$R^2$',fontsize=14);
ax3.set_ylim(bottom=-0.1,top=1.1)
ax3.set_title('$R^2$ on training set',fontsize=18);
ax3.plot(tmp_df2.numb_features,tmp_df2.R_squared,color = 'r', label = 'Best subset'); # line of best values
ax3.grid(color='grey', linestyle='-', linewidth=0.5);
ax3.legend();
fig.suptitle('Best Subset Selection',fontsize=25, y=0.98);
fig.subplots_adjust(top=0.8)
plt.show();
Estimating test error by adjusting training error #
In general, the training-set MSE underestimates the test MSE. We fit the model by least squares, i.e. we estimate the regression coefficients precisely so that the training RSS is minimized; as a consequence the training RSS necessarily decreases as we add more variables to the model, while the test RSS may not.
For this reason the training RSS (and therefore also \(R^2\)) cannot be used directly to select the best model: we need to adjust it to obtain an estimate of the test error.
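The adjusted criteria used below, written in the form actually implemented in Attempt n.2 (up to constants they coincide with the usual definitions), are

\[\mathrm{AIC} = \frac{1}{n\hat{\sigma}^2}\left(\mathrm{RSS} + 2\,d\,\hat{\sigma}^2\right), \qquad \mathrm{BIC} = \frac{1}{n\hat{\sigma}^2}\left(\mathrm{RSS} + \log(n)\,d\,\hat{\sigma}^2\right), \qquad R^2_{adj} = 1 - \frac{\mathrm{RSS}/(n-d-1)}{\mathrm{TSS}/(n-1)}\]

where \(d\) is the number of predictors in the model and \(\hat{\sigma}^2\) is estimated from the full model with all \(p\) predictors, \(\hat{\sigma}^2 = \mathrm{RSS}_{full}/(n-p-1)\). A model with a lower AIC or BIC, or a higher adjusted \(R^2\), is preferred.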
Attempt n.1: using the statsmodels library
AIC_list, BIC_list, R_squared_adj_list = [], [], []  # sklearn gives no AIC/BIC/Cp estimate out of the box, so we rely on statsmodels OLS
best_features = df_BestS_RSS_train['features']
for features in best_features:
    regr = sm.OLS(Y_train, sm.add_constant(X_train[list(features)])).fit()
    AIC_list.append(regr.aic)
    BIC_list.append(regr.bic)
    R_squared_adj_list.append(regr.rsquared_adj)
#Store in DataFrame
df1 = pd.DataFrame({'numb_features': df_BestS_RSS_train['numb_features'],\
                    'RSS_train': df_BestS_RSS_train['RSS_train'],\
                    'features': df_BestS_RSS_train['features'],\
                    'AIC': AIC_list,\
                    'BIC': BIC_list,\
                    'R_squared_adj': R_squared_adj_list})
df1
|   | numb_features | RSS_train | features | AIC | BIC | R_squared_adj |
---|---|---|---|---|---|---|
0 | 1 | 44.528583 | (lcavol,) | 166.764154 | 171.173540 | 0.530401 |
8 | 2 | 37.091846 | (lcavol, lweight) | 156.520967 | 163.135045 | 0.602717 |
38 | 3 | 34.907749 | (lcavol, lweight, svi) | 154.454850 | 163.273620 | 0.620176 |
97 | 4 | 32.814995 | (lcavol, lweight, lbph, svi) | 152.312691 | 163.336154 | 0.637188 |
174 | 5 | 32.069447 | (lcavol, lweight, lbph, svi, pgg45) | 152.772911 | 166.001067 | 0.639618 |
229 | 6 | 30.539778 | (lcavol, lweight, lbph, svi, lcp, pgg45) | 151.498370 | 166.931219 | 0.651088 |
247 | 7 | 29.437300 | (lcavol, lweight, age, lbph, svi, lcp, pgg45) | 151.034951 | 168.672492 | 0.657983 |
254 | 8 | 29.426384 | (lcavol, lweight, age, lbph, svi, lcp, gleason... | 153.010102 | 172.852336 | 0.652215 |
# Initialize the figure
width = 6
height = 6
nfig = 3
fig = plt.figure(figsize = (width*nfig,height))
# 1. AIC
ax1 = fig.add_subplot(1, nfig, 1)
ax1.scatter(df1['numb_features'],df1['AIC'], alpha = .5, color = 'darkblue');
ax1.plot(df1['numb_features'],df1['AIC'],color = 'r', alpha = .5); # line of best values
ax1.plot(df1['AIC'].argmin()+1, df1['AIC'].min(), marker='x', markersize=20, color = 'b'); # best val
ax1.set_xlabel('Number of predictors',fontsize=14);
ax1.set_ylabel('AIC',fontsize=14);
ax1.grid(color='grey', linestyle='-', linewidth=0.5);
# 2. BIC
ax2 = fig.add_subplot(1, nfig, 2)
ax2.scatter(df1['numb_features'],df1['BIC'], alpha = .5, color = 'darkblue');
ax2.plot(df1['numb_features'],df1['BIC'],color = 'r', alpha = .5); # line of best values
ax2.plot(df1['BIC'].argmin()+1, df1['BIC'].min(), marker='x', markersize=20, color = 'b'); # best val
ax2.set_xlabel('Number of predictors',fontsize=14);
ax2.set_ylabel('BIC',fontsize=14);
ax2.grid(color='grey', linestyle='-', linewidth=0.5);
# 3. R2_adj
ax3 = fig.add_subplot(1, nfig, 3)
ax3.scatter(df1['numb_features'],df1['R_squared_adj'], alpha = .5, color = 'darkblue');
ax3.plot(df1['numb_features'],df1['R_squared_adj'],color = 'r', alpha = .5); # line of best values
ax3.plot(df1['R_squared_adj'].argmax()+1, df1['R_squared_adj'].max(), marker='x', markersize=20, color = 'b'); # best val
ax3.set_xlabel('Number of predictors',fontsize=14);
ax3.set_ylabel(r'$R^2$ adj',fontsize=14);
ax3.grid(color='grey', linestyle='-', linewidth=0.5);
fig.suptitle('Best Subset Selection:\n Estimating test error by adjusting training error',fontsize=25, y=0.98);
fig.subplots_adjust(top=0.8)
fig.subplots_adjust(wspace=0.275) # the amount of width reserved for blank space between subplots
plt.show();
Attempt n.2: estimating AIC, BIC and adjusted \(R^2\) on our own
#Initializing useful variables
n = len(Y_train)
d = df_BestS_RSS_train['numb_features']
p = 8
RSS = df_BestS_RSS_train['RSS_train']
Rsquared = df_BestS_RSS_train['R_squared']
# Estimate sigma^2 via the RSE of the full multiple linear regression with all p predictors
RSE = np.sqrt(min(RSS)/(n - p -1)) # min(RSS) is the RSS of the full model with p predictors
hat_sigma_squared = RSE**2
#Computing
AIC = (1/(n*hat_sigma_squared)) * (RSS + 2 * d * hat_sigma_squared )
BIC = (1/(n*hat_sigma_squared)) * (RSS + np.log(n) * d * hat_sigma_squared )
ratio = 1 - Rsquared # RSS/TSS
R_squared_adj = 1 - ratio*(n-1)/(n-d-1)
#Store in DataFrame
df2 = pd.DataFrame({'numb_features': df_BestS_RSS_train['numb_features'],\
                    'RSS_train': df_BestS_RSS_train['RSS_train'],\
                    'features': df_BestS_RSS_train['features'],\
                    'AIC': AIC,\
                    'BIC': BIC,\
                    'R_squared_adj': R_squared_adj})
df2
|   | numb_features | RSS_train | features | AIC | BIC | R_squared_adj |
---|---|---|---|---|---|---|
0 | 1 | 44.528583 | (lcavol,) | 1.339802 | 1.372708 | 0.530401 |
8 | 2 | 37.091846 | (lcavol, lweight) | 1.150877 | 1.216689 | 0.602717 |
38 | 3 | 34.907749 | (lcavol, lweight, svi) | 1.116476 | 1.215193 | 0.620176 |
97 | 4 | 32.814995 | (lcavol, lweight, lbph, svi) | 1.084761 | 1.216385 | 0.637188 |
174 | 5 | 32.069447 | (lcavol, lweight, lbph, svi, pgg45) | 1.092680 | 1.257209 | 0.639618 |
229 | 6 | 30.539778 | (lcavol, lweight, lbph, svi, lcp, pgg45) | 1.077530 | 1.274965 | 0.651088 |
247 | 7 | 29.437300 | (lcavol, lweight, age, lbph, svi, lcp, pgg45) | 1.074948 | 1.305289 | 0.657983 |
254 | 8 | 29.426384 | (lcavol, lweight, age, lbph, svi, lcp, gleason... | 1.104478 | 1.367724 | 0.652215 |
# Initialize the figure
width = 6
height = 6
nfig = 3
fig = plt.figure(figsize = (width*nfig,height))
# 1. AIC
ax1 = fig.add_subplot(1, nfig, 1)
ax1.scatter(df2['numb_features'],df2['AIC'], alpha = .5, color = 'darkblue');
ax1.plot(df2['numb_features'],df2['AIC'],color = 'r', alpha = .5); # line of best values
ax1.plot(df2['AIC'].argmin()+1, df2['AIC'].min(), marker='x', markersize=20, color = 'b'); # best val
ax1.set_xlabel('Number of predictors',fontsize=14);
ax1.set_ylabel('AIC',fontsize=14);
ax1.grid(color='grey', linestyle='-', linewidth=0.5);
# 2. BIC
ax2 = fig.add_subplot(1, nfig, 2)
ax2.scatter(df2['numb_features'],df2['BIC'], alpha = .5, color = 'darkblue');
ax2.plot(df2['numb_features'],df2['BIC'],color = 'r', alpha = .5); # line of best values
ax2.plot(df2['BIC'].argmin()+1, df2['BIC'].min(), marker='x', markersize=20, color = 'b'); # best val
ax2.set_xlabel('Number of predictors',fontsize=14);
ax2.set_ylabel('BIC',fontsize=14);
ax2.grid(color='grey', linestyle='-', linewidth=0.5);
# 3. R2_adj
ax3 = fig.add_subplot(1, nfig, 3)
ax3.scatter(df2['numb_features'],df2['R_squared_adj'], alpha = .5, color = 'darkblue');
ax3.plot(df2['numb_features'],df2['R_squared_adj'],color = 'r', alpha = .5); # line of best values
ax3.plot(df2['R_squared_adj'].argmax()+1, df2['R_squared_adj'].max(), marker='x', markersize=20, color = 'b'); # best val
ax3.set_xlabel('Number of predictors',fontsize=14);
ax3.set_ylabel(r'$R^2$ adj',fontsize=14);
ax3.grid(color='grey', linestyle='-', linewidth=0.5);
fig.suptitle('Best Subset Selection:\n Estimating test error by adjusting training error',fontsize=25, y=0.98);
fig.subplots_adjust(top=0.8)
fig.subplots_adjust(wspace=0.275) # the amount of width reserved for blank space between subplots
plt.show();
Forward selection #
For computational reasons, best subset selection cannot be applied when the number of predictors \(p\) is large, since it requires fitting \(2^p\) models.
Forward stepwise selection begins with a model containing no predictors and then adds predictors one at a time. At each step, the variable that gives the greatest additional improvement to the fit is added to the model. This requires fitting only \(1 + p(p+1)/2\) models (37 for \(p = 8\), against the 255 non-empty subsets evaluated above).
7. Perform forward selection
- Start from the empty model
- Add at each step the variable that minimizes the RSS computed on test set (other performance measures can be used)
# to print some info:
flag = 1
detailed = 1

## range
variables = data.columns.tolist()
remaining_features, selected_features = variables.copy(), []

## Initialize the list where we temporarily store data
RSS_test_list, min_RSS_test_list, R_squared_list = [], [], []
numb_features, features_list = [], []

# Loop over the number of variables
for k in range(1,len(variables)+1):
    # store some info for each k
    all_RSS, all_R_squared, all_combs = [], [], []
    best_RSS = np.inf # initialize the best RSS as +inf
    # choose one variable among the remaining features
    for var in remaining_features:
        tmpComb = selected_features + [var] # combination of variables
        # Compute all the statistics we need
        _, RSS_test, R_squared = LinReg(X_train[tmpComb], Y_train, X_test[tmpComb], Y_test) # we don't want RSS on training set
        # save temporary stats
        all_RSS.append(RSS_test)
        all_R_squared.append(R_squared)
        all_combs.append(tmpComb)
        # update if we reach a better RSS
        if RSS_test < best_RSS:
            best_RSS = RSS_test
            best_R_squared = R_squared
            best_feature = var
    # Print some information, before updating the features
    if flag == 1:
        features = [remaining_features,selected_features]
        params = [all_RSS,all_R_squared,all_combs]
        results = [best_feature,best_RSS]
        get_info_forwardS(k,features,params,results,detailed)
    # Save the statistics
    RSS_test_list.append(all_RSS)
    min_RSS_test_list.append(best_RSS)
    R_squared_list.append(best_R_squared)
    # Update variables for next loop
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)
    # Save features and number of features
    features_list.append(selected_features.copy())
    numb_features.append(len(selected_features))

# Store in DataFrame
df_ForwardS = pd.DataFrame({'numb_features': numb_features,\
                            'RSS_test' : RSS_test_list,\
                            'min_RSS_test': min_RSS_test_list,\
                            'R_squared': R_squared_list,\
                            'features': features_list})
==========================================================================================
Iter n.1:
Choose 1-length combinations of the remaining variables
Remaining features: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
Features selected: []
Comb n.1: ['lcavol']
------------------------------------------------------------------------------------------
RSS test: 14.392161587304827
R_squared: 0.5375164690552883
Comb n.2: ['lweight']
------------------------------------------------------------------------------------------
RSS test: 30.402845602615997
R_squared: 0.23543378299009432
Comb n.3: ['age']
------------------------------------------------------------------------------------------
RSS test: 33.846748424133
R_squared: 0.05182105437299367
Comb n.4: ['lbph']
------------------------------------------------------------------------------------------
RSS test: 35.29877101280492
R_squared: 0.06913619684911343
Comb n.5: ['svi']
------------------------------------------------------------------------------------------
RSS test: 20.632078139876853
R_squared: 0.3101224985902339
Comb n.6: ['lcp']
------------------------------------------------------------------------------------------
RSS test: 16.34576112489144
R_squared: 0.23931977441332264
Comb n.7: ['gleason']
------------------------------------------------------------------------------------------
RSS test: 25.529830407597565
R_squared: 0.11725680432657692
Comb n.8: ['pgg45']
------------------------------------------------------------------------------------------
RSS test: 28.61697167270323
R_squared: 0.20074696985742568
Selected variables: lcavol
min RSS: 14.392161587304827
==========================================================================================
==========================================================================================
Iter n.2:
Choose 2-length combinations of the remaining variables
Remaining features: ['lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
Features selected: ['lcavol']
Comb n.1: ['lcavol', 'lweight']
------------------------------------------------------------------------------------------
RSS test: 14.77447043041511
R_squared: 0.614756035022443
Comb n.2: ['lcavol', 'age']
------------------------------------------------------------------------------------------
RSS test: 14.454316401468965
R_squared: 0.53785859616379
Comb n.3: ['lcavol', 'lbph']
------------------------------------------------------------------------------------------
RSS test: 16.237234684619743
R_squared: 0.5846312407995875
Comb n.4: ['lcavol', 'svi']
------------------------------------------------------------------------------------------
RSS test: 11.58358360007023
R_squared: 0.5605323092792945
Comb n.5: ['lcavol', 'lcp']
------------------------------------------------------------------------------------------
RSS test: 15.038931408454383
R_squared: 0.5381501805446671
Comb n.6: ['lcavol', 'gleason']
------------------------------------------------------------------------------------------
RSS test: 14.140851461764317
R_squared: 0.5386018755651162
Comb n.7: ['lcavol', 'pgg45']
------------------------------------------------------------------------------------------
RSS test: 13.827662568490943
R_squared: 0.5489982126930548
Selected variables: svi
min RSS: 11.58358360007023
==========================================================================================
==========================================================================================
Iter n.3:
Choose 3-length combinations of the remaining variables
Remaining features: ['lweight', 'age', 'lbph', 'lcp', 'gleason', 'pgg45']
Features selected: ['lcavol', 'svi']
Comb n.1: ['lcavol', 'svi', 'lweight']
------------------------------------------------------------------------------------------
RSS test: 12.015924403078804
R_squared: 0.6374405385171893
Comb n.2: ['lcavol', 'svi', 'age']
------------------------------------------------------------------------------------------
RSS test: 11.71416952499475
R_squared: 0.5612383365966047
Comb n.3: ['lcavol', 'svi', 'lbph']
------------------------------------------------------------------------------------------
RSS test: 14.353733928669755
R_squared: 0.6264131675413883
Comb n.4: ['lcavol', 'svi', 'lcp']
------------------------------------------------------------------------------------------
RSS test: 13.285116624863331
R_squared: 0.5714253936028364
Comb n.5: ['lcavol', 'svi', 'gleason']
------------------------------------------------------------------------------------------
RSS test: 11.484037587414818
R_squared: 0.5610054092768177
Comb n.6: ['lcavol', 'svi', 'pgg45']
------------------------------------------------------------------------------------------
RSS test: 11.632246428034497
R_squared: 0.5651377944544718
Selected variables: gleason
min RSS: 11.484037587414818
==========================================================================================
==========================================================================================
Iter n.4:
Choose 4-length combinations of the remaining variables
Remaining features: ['lweight', 'age', 'lbph', 'lcp', 'pgg45']
Features selected: ['lcavol', 'svi', 'gleason']
Comb n.1: ['lcavol', 'svi', 'gleason', 'lweight']
------------------------------------------------------------------------------------------
RSS test: 11.95602065641986
R_squared: 0.6405674126734184
Comb n.2: ['lcavol', 'svi', 'gleason', 'age']
------------------------------------------------------------------------------------------
RSS test: 11.612572746859495
R_squared: 0.5614561297716232
Comb n.3: ['lcavol', 'svi', 'gleason', 'lbph']
------------------------------------------------------------------------------------------
RSS test: 14.355178421938225
R_squared: 0.6266591566646609
Comb n.4: ['lcavol', 'svi', 'gleason', 'lcp']
------------------------------------------------------------------------------------------
RSS test: 13.185630584098526
R_squared: 0.5741698339447385
Comb n.5: ['lcavol', 'svi', 'gleason', 'pgg45']
------------------------------------------------------------------------------------------
RSS test: 11.919697961636555
R_squared: 0.5664871741058846
Selected variables: age
min RSS: 11.612572746859495
==========================================================================================
==========================================================================================
Iter n.5:
Choose 5-length combinations of the remaining variables
Remaining features: ['lweight', 'lbph', 'lcp', 'pgg45']
Features selected: ['lcavol', 'svi', 'gleason', 'age']
Comb n.1: ['lcavol', 'svi', 'gleason', 'age', 'lweight']
------------------------------------------------------------------------------------------
RSS test: 11.497691985782547
R_squared: 0.64510079253458
Comb n.2: ['lcavol', 'svi', 'gleason', 'age', 'lbph']
------------------------------------------------------------------------------------------
RSS test: 13.947298386201728
R_squared: 0.6294946547801643
Comb n.3: ['lcavol', 'svi', 'gleason', 'age', 'lcp']
------------------------------------------------------------------------------------------
RSS test: 13.234367776364218
R_squared: 0.574264491209014
Comb n.4: ['lcavol', 'svi', 'gleason', 'age', 'pgg45']
------------------------------------------------------------------------------------------
RSS test: 12.141763999596828
R_squared: 0.5670143538267589
Selected variables: lweight
min RSS: 11.497691985782547
==========================================================================================
==========================================================================================
Iter n.6:
Choose 6-length combinations of the remaining variables
Remaining features: ['lbph', 'lcp', 'pgg45']
Features selected: ['lcavol', 'svi', 'gleason', 'age', 'lweight']
Comb n.1: ['lcavol', 'svi', 'gleason', 'age', 'lweight', 'lbph']
------------------------------------------------------------------------------------------
RSS test: 12.98960324749752
R_squared: 0.6696630706385877
Comb n.2: ['lcavol', 'svi', 'gleason', 'age', 'lweight', 'lcp']
------------------------------------------------------------------------------------------
RSS test: 12.690514046312407
R_squared: 0.6563534839394491
Comb n.3: ['lcavol', 'svi', 'gleason', 'age', 'lweight', 'pgg45']
------------------------------------------------------------------------------------------
RSS test: 12.009380388381148
R_squared: 0.6505787510646934
Selected variables: pgg45
min RSS: 12.009380388381148
==========================================================================================
==========================================================================================
Iter n.7:
Choose 7-length combinations of the remaining variables
Remaining features: ['lbph', 'lcp']
Features selected: ['lcavol', 'svi', 'gleason', 'age', 'lweight', 'pgg45']
Comb n.1: ['lcavol', 'svi', 'gleason', 'age', 'lweight', 'pgg45', 'lbph']
------------------------------------------------------------------------------------------
RSS test: 13.834231490831279
R_squared: 0.6760051914465672
Comb n.2: ['lcavol', 'svi', 'gleason', 'age', 'lweight', 'pgg45', 'lcp']
------------------------------------------------------------------------------------------
RSS test: 14.702111705571856
R_squared: 0.6720997902395782
Selected variables: lbph
min RSS: 13.834231490831279
==========================================================================================
==========================================================================================
Iter n.8:
Choose 8-length combinations of the remaining variables
Remaining features: ['lcp']
Features selected: ['lcavol', 'svi', 'gleason', 'age', 'lweight', 'pgg45', 'lbph']
Comb n.1: ['lcavol', 'svi', 'gleason', 'age', 'lweight', 'pgg45', 'lbph', 'lcp']
------------------------------------------------------------------------------------------
RSS test: 15.638220165228002
R_squared: 0.6943711796768238
Selected variables: lcp
min RSS: 15.638220165228002
==========================================================================================
# look at the result
df_ForwardS
|   | numb_features | RSS_test | min_RSS_test | R_squared | features |
---|---|---|---|---|---|
0 | 1 | [14.392161587304827, 30.402845602615997, 33.84... | 14.392162 | 0.537516 | [lcavol] |
1 | 2 | [14.77447043041511, 14.454316401468965, 16.237... | 11.583584 | 0.560532 | [lcavol, svi] |
2 | 3 | [12.015924403078804, 11.71416952499475, 14.353... | 11.484038 | 0.561005 | [lcavol, svi, gleason] |
3 | 4 | [11.95602065641986, 11.612572746859495, 14.355... | 11.612573 | 0.561456 | [lcavol, svi, gleason, age] |
4 | 5 | [11.497691985782547, 13.947298386201728, 13.23... | 11.497692 | 0.645101 | [lcavol, svi, gleason, age, lweight] |
5 | 6 | [12.98960324749752, 12.690514046312407, 12.009... | 12.009380 | 0.650579 | [lcavol, svi, gleason, age, lweight, pgg45] |
6 | 7 | [13.834231490831279, 14.702111705571856] | 13.834231 | 0.676005 | [lcavol, svi, gleason, age, lweight, pgg45, lbph] |
7 | 8 | [15.638220165228002] | 15.638220 | 0.694371 | [lcavol, svi, gleason, age, lweight, pgg45, lb... |
Plot for Forward Selection #
8. Generate a chart having:
- x-axis: the subset size
- y-axis: the RSS for the test set of the models generated at step 7
# Initialize the figure
width = 6
height = 6
nfig = 1
fig = plt.figure(figsize = (width*nfig,height))
# 1. RSS Test set plot
tmp_df = df_ForwardS;
ax = fig.add_subplot(1, nfig, 1)
for i in range(0,len(tmp_df.RSS_test)):
    ax.scatter([tmp_df.numb_features[i]]*(len(tmp_df.RSS_test[i])),tmp_df.RSS_test[i], alpha = .2, color = 'darkblue');
ax.set_xlabel('Subset Size k',fontsize=14);
ax.set_ylabel('RSS',fontsize=14);
ax.set_title('RSS on test set',fontsize=18);
ax.plot(tmp_df.numb_features,tmp_df.min_RSS_test,color = 'r', label = 'Best subset'); # line of best values
ax.grid(color='grey', linestyle='-', linewidth=0.5);
ax.legend();
fig.suptitle('Forward Subset Selection',fontsize=25, y=0.98);
fig.subplots_adjust(top=0.8)
plt.show()
Backward selection #
An alternative to best subset selection is Backward Stepwise Selection.
Backward stepwise selection begins with the full model containing all the predictors and then removes predictors from the model, one at a time. At each step, the least useful variable (the one whose removal worsens the fit the least) is removed from the model.
Backward selection requires that the number of samples \(n\) is larger than the number of variables \(p\), so that the full model can be fitted by least squares. Forward selection, by contrast, can be used even when \(n < p\).
9. Perform backward selection
- Start from the full model
- Remove at each step the variable whose removal yields the model with the smallest RSS on the test set (other performance measures can be used)
# a flag to print some info, values {0,1}
flag = 1 # for short info
detailed = 1

## range
variables = data.columns.tolist() # excluding 'const'
remaining_features, dropped_features_list = variables.copy(), []

## Initialize the list where we temporarily store data
RSS_test_list, min_RSS_test_list, R_squared_list = [], [], []
numb_features, features_list = [], []

# run over the number of variables
for k in range(len(variables),0,-1):
    # initialization
    best_RSS = np.inf
    all_RSS, all_R_squared, all_combs = [], [], []
    for combo in itertools.combinations(remaining_features,k):
        # Compute the stats we need
        tmpComb = list(combo)
        _, RSS_test, R_squared = LinReg(X_train[tmpComb], Y_train, X_test[tmpComb], Y_test) # we don't want RSS on training set
        # store all the RSS
        all_RSS.append(RSS_test)
        all_R_squared.append(R_squared)
        all_combs.append(tmpComb)
        if RSS_test < best_RSS:
            best_RSS = RSS_test
            best_R_squared = R_squared
            dropped_list = list(set(remaining_features)-set(tmpComb))
    # Print some information, before updating the features
    if flag == 1:
        features = [remaining_features,dropped_features_list]
        params = [all_RSS,all_R_squared,all_combs]
        if dropped_list: # only if dropped_feature is not an empty list
            dropped_feature = dropped_list[0]
            results = [dropped_feature,best_RSS]
        else:
            results = [[],best_RSS]
        get_info_backwardS(k,features,params,results,detailed)
    # Updating variables for next loop
    if dropped_list: # only if dropped_feature is not an empty list
        dropped_feature = dropped_list[0]
        remaining_features.remove(dropped_feature)
        dropped_features_list.append(dropped_feature)
    else:
        dropped_features_list.append([]) # at the initial iteration we drop nothing!
    # Save stats
    min_RSS_test_list.append(best_RSS)
    RSS_test_list.append(all_RSS.copy())
    R_squared_list.append(best_R_squared)
    # Save features and number of features
    numb_features.append(len(remaining_features))
    features_list.append(remaining_features.copy())

# Store in DataFrame
df_BackwardS = pd.DataFrame({'numb_features': numb_features,\
                             'RSS_test' : RSS_test_list,\
                             'min_RSS_test': min_RSS_test_list,\
                             'R_squared': R_squared_list,\
                             'dropped_feature': dropped_features_list,\
                             'features': features_list})
==========================================================================================
Iter n.0:
At the beginning we have:
Remaining features: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
Dropped features: []
Now we compare the model selecting 8 variables
Comb n.1: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
------------------------------------------------------------------------------------------
candidate dropped feature: []
RSS test: 15.638220165228002
At the end we have:
min RSS: 15.638220165228002
We drop: []
==========================================================================================
==========================================================================================
Iter n.1:
At the beginning we have:
Remaining features: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
Dropped features: [[]]
Now we compare the model selecting 7 variables
Comb n.1: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['pgg45']
RSS test: 13.492898446056923
Comb n.2: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'pgg45']
------------------------------------------------------------------------------------------
candidate dropped feature: ['gleason']
RSS test: 15.495404626758
Comb n.3: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'gleason', 'pgg45']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lcp']
RSS test: 13.83423149083128
Comb n.4: ['lcavol', 'lweight', 'age', 'lbph', 'lcp', 'gleason', 'pgg45']
------------------------------------------------------------------------------------------
candidate dropped feature: ['svi']
RSS test: 17.516627850269806
Comb n.5: ['lcavol', 'lweight', 'age', 'svi', 'lcp', 'gleason', 'pgg45']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lbph']
RSS test: 14.702111705571852
Comb n.6: ['lcavol', 'lweight', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
------------------------------------------------------------------------------------------
candidate dropped feature: ['age']
RSS test: 17.03455209459629
Comb n.7: ['lcavol', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lweight']
RSS test: 16.754443499511755
Comb n.8: ['lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lcavol']
RSS test: 22.63332934025175
At the end we have:
min RSS: 13.492898446056923
We drop: pgg45
==========================================================================================
==========================================================================================
Iter n.2:
At the beginning we have:
Remaining features: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason']
Dropped features: [[], 'pgg45']
Now we compare the model selecting 6 variables
Comb n.1: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp']
------------------------------------------------------------------------------------------
candidate dropped feature: ['gleason']
RSS test: 13.43573406046562
Comb n.2: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lcp']
RSS test: 12.98960324749752
Comb n.3: ['lcavol', 'lweight', 'age', 'lbph', 'lcp', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['svi']
RSS test: 15.244637591703853
Comb n.4: ['lcavol', 'lweight', 'age', 'svi', 'lcp', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lbph']
RSS test: 12.690514046312394
Comb n.5: ['lcavol', 'lweight', 'lbph', 'svi', 'lcp', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['age']
RSS test: 14.300531209782973
Comb n.6: ['lcavol', 'age', 'lbph', 'svi', 'lcp', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lweight']
RSS test: 14.364169507348384
Comb n.7: ['lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lcavol']
RSS test: 20.7394303261079
At the end we have:
min RSS: 12.690514046312394
We drop: lbph
==========================================================================================
==========================================================================================
Iter n.3:
At the beginning we have:
Remaining features: ['lcavol', 'lweight', 'age', 'svi', 'lcp', 'gleason']
Dropped features: [[], 'pgg45', 'lbph']
Now we compare the model selecting 5 variables
Comb n.1: ['lcavol', 'lweight', 'age', 'svi', 'lcp']
------------------------------------------------------------------------------------------
candidate dropped feature: ['gleason']
RSS test: 12.70021855056121
Comb n.2: ['lcavol', 'lweight', 'age', 'svi', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lcp']
RSS test: 11.497691985782552
Comb n.3: ['lcavol', 'lweight', 'age', 'lcp', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['svi']
RSS test: 14.720875322787679
Comb n.4: ['lcavol', 'lweight', 'svi', 'lcp', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['age']
RSS test: 13.183227370610318
Comb n.5: ['lcavol', 'age', 'svi', 'lcp', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lweight']
RSS test: 13.23436777636421
Comb n.6: ['lweight', 'age', 'svi', 'lcp', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lcavol']
RSS test: 16.98150806110247
At the end we have:
min RSS: 11.497691985782552
We drop: lcp
==========================================================================================
==========================================================================================
Iter n.4:
At the beginning we have:
Remaining features: ['lcavol', 'lweight', 'age', 'svi', 'gleason']
Dropped features: [[], 'pgg45', 'lbph', 'lcp']
Now we compare the model selecting 4 variables
Comb n.1: ['lcavol', 'lweight', 'age', 'svi']
------------------------------------------------------------------------------------------
candidate dropped feature: ['gleason']
RSS test: 11.6363032699816
Comb n.2: ['lcavol', 'lweight', 'age', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['svi']
RSS test: 14.030539349222018
Comb n.3: ['lcavol', 'lweight', 'svi', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['age']
RSS test: 11.956020656419863
Comb n.4: ['lcavol', 'age', 'svi', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lweight']
RSS test: 11.612572746859495
Comb n.5: ['lweight', 'age', 'svi', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lcavol']
RSS test: 18.093182817451503
At the end we have:
min RSS: 11.612572746859495
We drop: lweight
==========================================================================================
==========================================================================================
Iter n.5:
At the beginning we have:
Remaining features: ['lcavol', 'age', 'svi', 'gleason']
Dropped features: [[], 'pgg45', 'lbph', 'lcp', 'lweight']
Now we compare the model selecting 3 variables
Comb n.1: ['lcavol', 'age', 'svi']
------------------------------------------------------------------------------------------
candidate dropped feature: ['gleason']
RSS test: 11.71416952499475
Comb n.2: ['lcavol', 'age', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['svi']
RSS test: 14.18793001209106
Comb n.3: ['lcavol', 'svi', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['age']
RSS test: 11.484037587414818
Comb n.4: ['age', 'svi', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lcavol']
RSS test: 19.78197534246865
At the end we have:
min RSS: 11.484037587414818
We drop: age
==========================================================================================
==========================================================================================
Iter n.6:
At the beginning we have:
Remaining features: ['lcavol', 'svi', 'gleason']
Dropped features: [[], 'pgg45', 'lbph', 'lcp', 'lweight', 'age']
Now we compare the model selecting 2 variables
Comb n.1: ['lcavol', 'svi']
------------------------------------------------------------------------------------------
candidate dropped feature: ['gleason']
RSS test: 11.58358360007023
Comb n.2: ['lcavol', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['svi']
RSS test: 14.140851461764317
Comb n.3: ['svi', 'gleason']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lcavol']
RSS test: 18.710131293690928
At the end we have:
min RSS: 11.58358360007023
We drop: gleason
==========================================================================================
==========================================================================================
Iter n.7:
At the beginning we have:
Remaining features: ['lcavol', 'svi']
Dropped features: [[], 'pgg45', 'lbph', 'lcp', 'lweight', 'age', 'gleason']
Now we compare the model selecting 1 variables
Comb n.1: ['lcavol']
------------------------------------------------------------------------------------------
candidate dropped feature: ['svi']
RSS test: 14.392161587304827
Comb n.2: ['svi']
------------------------------------------------------------------------------------------
candidate dropped feature: ['lcavol']
RSS test: 20.632078139876853
At the end we have:
min RSS: 14.392161587304827
We drop: svi
==========================================================================================
# look at the result
df_BackwardS
|   | numb_features | RSS_test | min_RSS_test | R_squared | dropped_feature | features |
---|---|---|---|---|---|---|
0 | 8 | [15.638220165228002] | 15.638220 | 0.694371 | [] | [lcavol, lweight, age, lbph, svi, lcp, gleason... |
1 | 7 | [13.492898446056923, 15.495404626758, 13.83423... | 13.492898 | 0.678457 | pgg45 | [lcavol, lweight, age, lbph, svi, lcp, gleason] |
2 | 6 | [13.43573406046562, 12.98960324749752, 15.2446... | 12.690514 | 0.656353 | lbph | [lcavol, lweight, age, svi, lcp, gleason] |
3 | 5 | [12.70021855056121, 11.497691985782552, 14.720... | 11.497692 | 0.645101 | lcp | [lcavol, lweight, age, svi, gleason] |
4 | 4 | [11.6363032699816, 14.030539349222018, 11.9560... | 11.612573 | 0.561456 | lweight | [lcavol, age, svi, gleason] |
5 | 3 | [11.71416952499475, 14.18793001209106, 11.4840... | 11.484038 | 0.561005 | age | [lcavol, svi, gleason] |
6 | 2 | [11.58358360007023, 14.140851461764317, 18.710... | 11.583584 | 0.560532 | gleason | [lcavol, svi] |
7 | 1 | [14.392161587304827, 20.632078139876853] | 14.392162 | 0.537516 | svi | [lcavol] |
Plot for Backward Selection #
# Initialize the figure
width = 6
height = 6
nfig = 1
fig = plt.figure(figsize = (width*nfig,height))
# 1. RSS Test set plot
tmp_df = df_BackwardS;
ax = fig.add_subplot(1, nfig, 1)
for i in range(0,len(tmp_df.RSS_test)):
    ax.scatter([tmp_df.numb_features[i]]*(len(tmp_df.RSS_test[i])),tmp_df.RSS_test[i], alpha = .2, color = 'darkblue');
ax.set_xlabel('Subset Size k',fontsize=14);
ax.set_ylabel('RSS',fontsize=14);
ax.set_title('RSS on test set',fontsize=18);
ax.plot(tmp_df.numb_features,tmp_df.min_RSS_test,color = 'r', label = 'Best subset'); # line of best values
ax.grid(color='grey', linestyle='-', linewidth=0.5);
ax.legend();
fig.suptitle('Backward Subset Selection',fontsize=25, y=0.98);
fig.subplots_adjust(top=0.8)
plt.show()
Comparison subset selection methods #
Comparison between Best, Forward and Backward Selection results on the test set
# Initialize the figure
width = 6
height = 6
nfig = 3
fig = plt.figure(figsize = (width*nfig,height))
# 1. BEST SUBSET SELECTION
tmp_df1 = df_BestS; # scatter plot
tmp_df2 = df_BestS_RSS_test; # plot the line of best values
ax = fig.add_subplot(1, nfig, 1)
ax.scatter(tmp_df1.numb_features,tmp_df1.RSS_test, alpha = .2, color = 'darkblue');
ax.set_xlabel('Subset Size k',fontsize=14);
ax.set_ylabel('RSS',fontsize=14);
ax.set_title('Best Selection',fontsize=18);
ax.plot(tmp_df2.numb_features,tmp_df2.RSS_test,color = 'r', label = 'Best subset'); # line of best values
ax.grid(color='grey', linestyle='-', linewidth=0.5);
ax.legend();
# 2. FORWARD SUBSET SELECTION
tmp_df = df_ForwardS;
ax = fig.add_subplot(1, nfig, 2)
for i in range(0,len(tmp_df.RSS_test)):
    ax.scatter([tmp_df.numb_features[i]]*(len(tmp_df.RSS_test[i])),tmp_df.RSS_test[i], alpha = .2, color = 'darkblue');
ax.set_xlabel('Subset Size k',fontsize=14);
ax.set_ylabel('RSS',fontsize=14);
ax.set_title('Forward Selection',fontsize=18);
ax.plot(tmp_df.numb_features,tmp_df.min_RSS_test,color = 'r', label = 'Best subset'); # line of best values
ax.grid(color='grey', linestyle='-', linewidth=0.5);
ax.legend();
# 3. BACKWARD SUBSET SELECTION
tmp_df = df_BackwardS;
ax = fig.add_subplot(1, nfig, 3)
for i in range(0,len(tmp_df.RSS_test)):
    ax.scatter([tmp_df.numb_features[i]]*(len(tmp_df.RSS_test[i])),tmp_df.RSS_test[i], alpha = .2, color = 'darkblue');
ax.set_xlabel('Subset Size k',fontsize=14);
ax.set_ylabel('RSS',fontsize=14);
ax.set_title('Backward Selection',fontsize=18);
ax.plot(tmp_df.numb_features,tmp_df.min_RSS_test,color = 'r', label = 'Best subset'); # line of best values
ax.grid(color='grey', linestyle='-', linewidth=0.5);
ax.legend();
fig.suptitle('Comparison Subset Selection',fontsize=25, y=0.98);
fig.subplots_adjust(top=0.8)
plt.show()
# Initialize the figure
width = 6
height = 6
nfig = 1
fig = plt.figure(figsize = (width*nfig,height))
ax = fig.add_subplot(1, nfig, 1)
# Best Selection
ax.plot(df_BestS_RSS_test.numb_features,df_BestS_RSS_test.RSS_test,color = 'r', label = 'Best subset'); # line of best values
# Forward Selection
ax.plot(df_ForwardS.numb_features,df_ForwardS.min_RSS_test,color = 'b', label = 'Best subset'); # line of best values
# Backward Selection
ax.plot(df_BackwardS.numb_features,df_BackwardS.min_RSS_test,color = 'g', label = 'Best subset'); # line of best values
ax.grid(color='grey', linestyle='-', linewidth=0.5);
ax.set_xlabel('Subset Size k',fontsize=14);
ax.set_ylabel('RSS',fontsize=14);
ax.set_title('Comparison minimum RSS',fontsize=18);
ax.legend(['best','forward','backward'])
plt.show();
Best method from Subset Selection
Below we report, for each of Best, Forward and Backward Subset Selection, the subset of variables achieving the lowest RSS on the test set.
df_range = [df_BestS_RSS_test, df_ForwardS, df_BackwardS]
columns_range = ['RSS_test','min_RSS_test','min_RSS_test']
methods = ['Best Selection', 'Forward Selection', 'Backward Selection']
for df, col, meth in zip(df_range,columns_range,methods):
    idx = df[col].idxmin()
    print(f"\nFor {meth} the best method has:\n\
n. features: {df['numb_features'][idx]}\n\
features: {df['features'][idx]}\n\
RSS test: {df[col][idx]}\n")
For Best Selection the best method has:
n. features: 3
features: ('lcavol', 'svi', 'gleason')
RSS test: 11.484037587414818
For Forward Selection the best method has:
n. features: 3
features: ['lcavol', 'svi', 'gleason']
RSS test: 11.484037587414818
For Backward Selection the best method has:
n. features: 3
features: ['lcavol', 'svi', 'gleason']
RSS test: 11.484037587414818
Backward selection with Z-score #
11. Perform backward selection using the Z-score as the statistic for selecting the predictor to drop
* Start from the full model
* Remove at each step the variable with the smallest Z-score in absolute value (which library is more suitable for this purpose? See the short illustration below)
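statsmodels is the natural choice here, because a fitted OLS model exposes the t-statistics of the coefficients directly; this is exactly what the Zscore helper defined earlier relies on. A minimal illustration (assuming the X_train and Y_train defined above):
# Z-scores (t-statistics) of the full model; the constant is excluded before taking the minimum
ols_full = sm.OLS(Y_train, sm.add_constant(X_train)).fit()
tvals = ols_full.tvalues.drop('const')
print(tvals)
print("Candidate to drop:", tvals.abs().idxmin())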
# a flag to print some info, values {0,1}
flag = 1 # for short info
detailed = 1

## range
variables = data.columns.tolist() # excluding 'const'
remaining_features = variables.copy()
tmpComb = remaining_features.copy()
dropped_features_list = []

## Initialize the list where we temporarily store data
RSS_test_list, R_squared_list = [], []
numb_features, features_list = [], []

# Loop over the number of variables
for k in range(len(variables),0,-1):
    # Compute the stats we need
    Zscores, minZ, idx_minZ = Zscore(X_train[tmpComb],Y_train)
    _, RSS_test, R_squared = LinReg(X_train[tmpComb], Y_train, X_test[tmpComb], Y_test)
    # Save stats
    RSS_test_list.append(RSS_test)
    R_squared_list.append(R_squared)
    # Print some information, before updating the features
    if flag == 1:
        features = [remaining_features,dropped_features_list]
        params = [RSS_test,R_squared_list,tmpComb]
        results = [idx_minZ,RSS_test] # the variable with the smallest |Z-score| is the one we are about to drop
        get_info_backwardS_Zscore(k,features,params,results,detailed)
    # Save features and number of features
    numb_features.append(k)
    features_list.append(tmpComb.copy())
    # update combinations: drop the variable with the smallest |Z-score|
    tmpComb.remove(idx_minZ)
    dropped_list = list(set(remaining_features)-set(tmpComb))
    # Updating variables for next loop
    if dropped_list: # only if dropped_feature is not an empty list
        dropped_feature = dropped_list[0]
        remaining_features.remove(dropped_feature)
        dropped_features_list.append(dropped_feature)
    else:
        dropped_features_list.append([]) # nothing dropped at this iteration

# Store in DataFrame
df_BackwardS_minZ = pd.DataFrame({'numb_features': numb_features,\
                                  'RSS_test' : RSS_test_list,\
                                  'R_squared': R_squared_list,\
                                  'dropped_feature': dropped_features_list,\
                                  'features': features_list})
==========================================================================================
Iter n.0:
At the beginning we have:
Remaining features: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
Dropped features: []
The Z-scores are:
lcavol 5.366290
lweight 2.750789
age -1.395909
lbph 2.055846
svi 2.469255
lcp -1.866913
gleason -0.146681
pgg45 1.737840
dtype: float64
At the end we have:
min RSS: 15.638220165228002
We drop: gleason
==========================================================================================
==========================================================================================
Iter n.1:
At the beginning we have:
Remaining features: ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'pgg45']
Dropped features: ['gleason']
The Z-scores are:
lcavol 5.462426
lweight 2.833132
age -1.486490
lbph 2.068796
svi 2.519204
lcp -1.877253
pgg45 2.182013
dtype: float64
At the end we have:
min RSS: 15.495404626758
We drop: age
==========================================================================================
==========================================================================================
Iter n.2:
At the beginning we have:
Remaining features: ['lcavol', 'lweight', 'lbph', 'svi', 'lcp', 'pgg45']
Dropped features: ['gleason', 'age']
The Z-scores are:
lcavol 5.243670
lweight 2.589758
lbph 1.815545
svi 2.544601
lcp -1.733570
pgg45 1.871654
dtype: float64
At the end we have:
min RSS: 16.457800398803407
We drop: lcp
==========================================================================================
==========================================================================================
Iter n.3:
At the beginning we have:
Remaining features: ['lcavol', 'lweight', 'lbph', 'svi', 'pgg45']
Dropped features: ['gleason', 'age', 'lcp']
The Z-scores are:
lcavol 4.899985
lweight 2.551974
lbph 1.952740
svi 2.039752
pgg45 1.190849
dtype: float64
At the end we have:
min RSS: 14.577726321419373
We drop: pgg45
==========================================================================================
==========================================================================================
Iter n.4:
At the beginning we have:
Remaining features: ['lcavol', 'lweight', 'lbph', 'svi']
Dropped features: ['gleason', 'age', 'lcp', 'pgg45']
The Z-scores are:
lcavol 5.461353
lweight 2.441316
lbph 1.988469
svi 2.458931
dtype: float64
At the end we have:
min RSS: 13.689963661204882
We drop: lbph
==========================================================================================
==========================================================================================
Iter n.5:
At the beginning we have:
Remaining features: ['lcavol', 'lweight', 'svi']
Dropped features: ['gleason', 'age', 'lcp', 'pgg45', 'lbph']
The Z-scores are:
lcavol 5.507417
lweight 3.655671
svi 1.985388
dtype: float64
At the end we have:
min RSS: 12.015924403078804
We drop: svi
==========================================================================================
==========================================================================================
Iter n.6:
At the beginning we have:
Remaining features: ['lcavol', 'lweight']
Dropped features: ['gleason', 'age', 'lcp', 'pgg45', 'lbph', 'svi']
The Z-scores are:
lcavol 7.938277
lweight 3.582135
dtype: float64
At the end we have:
min RSS: 14.77447043041511
We drop: lweight
==========================================================================================
==========================================================================================
Iter n.7:
At the beginning we have:
Remaining features: ['lcavol']
Dropped features: ['gleason', 'age', 'lcp', 'pgg45', 'lbph', 'svi', 'lweight']
The Z-scores are:
lcavol 8.691694
dtype: float64
At the end we have:
min RSS: 14.392161587304827
We drop: lcavol
==========================================================================================
df_BackwardS_minZ
| | numb_features | RSS_test | R_squared | dropped_feature | features |
|---|---|---|---|---|---|
| 0 | 8 | 15.638220 | 0.537516 | gleason | [lcavol, lweight, age, lbph, svi, lcp, gleason... |
| 1 | 7 | 15.495405 | 0.537516 | age | [lcavol, lweight, age, lbph, svi, lcp, pgg45] |
| 2 | 6 | 16.457800 | 0.537516 | lcp | [lcavol, lweight, lbph, svi, lcp, pgg45] |
| 3 | 5 | 14.577726 | 0.537516 | pgg45 | [lcavol, lweight, lbph, svi, pgg45] |
| 4 | 4 | 13.689964 | 0.537516 | lbph | [lcavol, lweight, lbph, svi] |
| 5 | 3 | 12.015924 | 0.537516 | svi | [lcavol, lweight, svi] |
| 6 | 2 | 14.774470 | 0.537516 | lweight | [lcavol, lweight] |
| 7 | 1 | 14.392162 | 0.537516 | lcavol | [lcavol] |
12. Generate a chart having
* x-axis: the subset size
* y-axis: the RSS for the test set of the models generated at step 11
* Compare it with the chart generated at point 10 (see the overlay sketch after the plot below)
# Initialize the figure
width = 6
height = 6
nfig = 1
fig = plt.figure(figsize = (width*nfig,height))
# 1. RSS Test set plot
tmp_df = df_BackwardS_minZ;
ax = fig.add_subplot(1, nfig, 1)
ax.scatter(tmp_df.numb_features,tmp_df.RSS_test, alpha = .2, color = 'darkblue');
ax.set_xlabel('Subset Size k',fontsize=14);
ax.set_ylabel('RSS',fontsize=14);
ax.set_title('RSS on test set',fontsize=18);
ax.plot(tmp_df.numb_features,tmp_df.RSS_test,color = 'r', label = 'Backward (min |Z|)'); # path of models from Z-score-driven backward selection
ax.grid(color='grey', linestyle='-', linewidth=0.5);
ax.legend();
fig.suptitle('Backward Subset Selection',fontsize=25, y=0.98);
fig.subplots_adjust(top=0.85)
plt.show()
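To make the comparison with point 10 explicit, the two backward-selection paths can be overlaid on a single chart. This is only a sketch, assuming df_BackwardS (from the RSS-driven backward selection of point 10) and df_BackwardS_minZ are both still in scope:
# Overlay of the two backward-selection paths
fig, ax = plt.subplots(figsize=(6, 6))
# RSS-driven backward selection (point 10): best test RSS at each subset size
ax.plot(df_BackwardS.numb_features, df_BackwardS.min_RSS_test,
        color='g', marker='o', label='Backward (min RSS)')
# Z-score-driven backward selection (point 11): one model per subset size
ax.plot(df_BackwardS_minZ.numb_features, df_BackwardS_minZ.RSS_test,
        color='r', marker='o', label='Backward (min |Z|)')
ax.set_xlabel('Subset Size k', fontsize=14)
ax.set_ylabel('RSS', fontsize=14)
ax.set_title('Backward selection: RSS vs Z-score criterion', fontsize=18)
ax.grid(color='grey', linestyle='-', linewidth=0.5)
ax.legend()
plt.show()
As the comparison in the next section confirms, the two criteria end up with different 3-variable subsets.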
Comparison with Z-score #
We now compare the previous results with the backward selection driven by the Z-score: at each step the variable with the smallest Z-score in absolute value is dropped, and the best model along that path is still the one with the lowest RSS on the test set.
df_range = [df_BestS_RSS_test, df_ForwardS, df_BackwardS, df_BackwardS_minZ]
columns_range = ['RSS_test','min_RSS_test','min_RSS_test','RSS_test']
methods = ['Best Selection - RSS test', 'Forward Selection - RSS test', 'Backward Selection - RSS test', 'Backward Selection - Z score']
for df, col, meth in zip(df_range, columns_range, methods):
    idx = df[col].idxmin()
    print(f"\nFor {meth}, the best method has:\n"
          f"n. features: {df['numb_features'][idx]}\n"
          f"features: {df['features'][idx]}\n"
          f"RSS test: {df[col][idx]}\n")
For Best Selection - RSS test, the best method has:
n. features: 3
features: ('lcavol', 'svi', 'gleason')
RSS test: 11.484037587414818
For Forward Selection - RSS test, the best method has:
n. features: 3
features: ['lcavol', 'svi', 'gleason']
RSS test: 11.484037587414818
For Backward Selection - RSS test, the best method has:
n. features: 3
features: ['lcavol', 'svi', 'gleason']
RSS test: 11.484037587414818
For Backward Selection - Z score, the best method has:
n. features: 3
features: ['lcavol', 'lweight', 'svi']
RSS test: 12.015924403078804