from google.colab import data_table
# Enable an interactive DataFrame display
data_table.enable_dataframe_formatter()
Lasso Linear Regression
Online Shopping
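Lasso fits a linear model with an L1 penalty that shrinks coefficients and sets many of them exactly to zero. In scikit-learn's parameterization (the objective LassoCV minimizes for each candidate alpha):

$$\min_{\beta}\; \frac{1}{2n}\,\lVert y - X\beta \rVert_2^2 \;+\; \alpha \lVert \beta \rVert_1$$

Larger alpha means a heavier penalty and more coefficients forced to zero; the cross-validation below picks alpha.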
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale # zero mean & one s.d.
from sklearn.linear_model import LassoCV, lasso_path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
df = pd.read_csv("https://bcdanl.github.io/data/browser-online-shopping.zip")
df
Warning: Total number of columns (1001) exceeds max_columns (20). Falling back to pandas display.
 | spend | atdmt.com | yahoo.com | whenu.com | weatherbug.com | msn.com | google.com | aol.com | questionmarket.com | googlesyndication.com-o02 | ... | ugo.com | cox.com | spicymint.com | real.com-o01 | targetnet.com | effectivebrand.com | dallascowboys.com | leadgenetwork.com | in.us | vistaprint.com
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 424 | 4.052026 | 11.855928 | 0.000000 | 0.000000 | 0.250125 | 6.528264 | 0.150075 | 1.350675 | 3.401701 | ... | 0.0 | 0.025013 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
1 | 2335 | 4.448743 | 8.446164 | 0.000000 | 0.000000 | 0.644745 | 0.451322 | 0.128949 | 0.967118 | 1.225016 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
2 | 279 | 7.678815 | 0.487234 | 0.000000 | 31.904112 | 13.213798 | 0.954980 | 0.000000 | 2.124342 | 2.514130 | ... | 0.0 | 0.000000 | 0.000000 | 0.019489 | 0.019489 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
3 | 829 | 13.547802 | 18.509289 | 0.045310 | 0.045310 | 0.294517 | 1.110104 | 0.067966 | 3.194382 | 3.149071 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.067966 |
4 | 221 | 2.879581 | 10.558464 | 0.000000 | 0.000000 | 3.606748 | 1.396161 | 0.000000 | 0.727167 | 0.988947 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9995 | 102 | 6.040454 | 26.350790 | 0.000000 | 0.000000 | 0.000000 | 0.055417 | 4.710446 | 2.909393 | 0.554170 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
9996 | 5096 | 8.044292 | 3.012539 | 9.102752 | 0.032568 | 1.612115 | 1.840091 | 9.037616 | 3.289367 | 1.009608 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
9997 | 883 | 7.053942 | 1.659751 | 0.000000 | 0.000000 | 9.543568 | 0.414938 | 13.692946 | 2.074689 | 2.074689 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
9998 | 256 | 10.408043 | 0.473093 | 0.000000 | 0.000000 | 18.568894 | 0.887049 | 3.370787 | 3.548196 | 0.236546 | ... | 0.0 | 0.532229 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
9999 | 6 | 2.937832 | 11.789234 | 0.018954 | 7.827900 | 0.113723 | 2.710387 | 0.151630 | 0.777104 | 0.398029 | ... | 0.0 | 0.000000 | 0.037908 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
10000 rows × 1001 columns
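Each row is one user; besides spend, the remaining 1,000 columns record browsing activity by web domain (the exact units are not documented here, so treat them as relative activity measures). With p = 1,000 predictors against n = 10,000 observations, unpenalized least squares would overfit, which is what motivates the lasso. A quick sketch, assuming df is loaded as above, to gauge how sparse the predictor matrix is:

# Share of exactly-zero entries across the 1,000 predictor columns (rough sparsity check)
(df.drop('spend', axis=1) == 0).mean().mean()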
X = df.drop('spend', axis = 1)
y = df['spend']
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_np = X_train.values
X_test_np = X_test.values
y_train_np = y_train.values
y_test_np = y_test.values
# LassoCV with a range of alpha values
lasso_cv = LassoCV(n_alphas = 100,    # default is 100
                   alphas = None,     # alphas=None automatically generates 100 candidate alpha values
                   cv = 5,
                   random_state = 42,
                   max_iter = 100000)
lasso_cv.fit(X_train.values, np.log(y_train.values))
print("LassoCV - Best alpha:", lasso_cv.alpha_)
# Create a DataFrame of the coefficients:
coef_lasso = pd.DataFrame({
    'predictor': list(X_train.columns),
    'coefficient': list(lasso_cv.coef_),
    'exp_coefficient': np.exp( list(lasso_cv.coef_) )
})
# Evaluate
# Note: lasso_cv was fit on np.log(y_train), so these predictions are on the log
# scale while y_test is raw spend; the MSE below mixes scales (see the sketch further down).
y_pred_lasso = lasso_cv.predict(X_test.values)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print("LassoCV - MSE:", mse_lasso)
LassoCV - Best alpha: 0.004098154620620373
LassoCV - MSE: 88099012.81854005
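The next cell repeats the same fit using the precomputed NumPy arrays; since X_train_np is just X_train.values, the selected alpha, coefficients, and test MSE come out identical.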
# LassoCV again, this time with the precomputed NumPy arrays
lasso_cv = LassoCV(n_alphas = 100,    # default is 100
                   alphas = None,     # alphas=None automatically generates 100 candidate alpha values
                   cv = 5,
                   random_state = 42,
                   max_iter = 100000)
lasso_cv.fit(X_train_np, np.log(y_train_np))
print("LassoCV - Best alpha:", lasso_cv.alpha_)
# Create a DataFrame of the coefficients:
coef_lasso = pd.DataFrame({
    'predictor': list(X_train.columns),
    'coefficient': list(lasso_cv.coef_),
    'exp_coefficient': np.exp( list(lasso_cv.coef_) )
})
# Evaluate
y_pred_lasso = lasso_cv.predict(X_test_np)
mse_lasso = mean_squared_error(y_test_np, y_pred_lasso)
print("LassoCV - MSE:", mse_lasso)
LassoCV - Best alpha: 0.004098154620620373
LassoCV - MSE: 88099012.81854005
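Both fits used np.log(y_train), so lasso_cv.predict() returns predictions on the log scale, while the MSE above compares them against raw spend; the enormous value reflects that mismatch of scales rather than model quality. A minimal sketch of a scale-consistent evaluation, reusing the objects defined above:

# Scale-consistent evaluation (a sketch; lasso_cv was fit on np.log(y_train_np))
log_mse = mean_squared_error(np.log(y_test_np), lasso_cv.predict(X_test_np))     # both sides on the log scale
dollar_mse = mean_squared_error(y_test_np, np.exp(lasso_cv.predict(X_test_np)))  # predictions back in dollars
print("LassoCV - log-scale MSE:", log_mse)
print("LassoCV - dollar-scale MSE:", dollar_mse)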
coef_lasso = coef_lasso.query('coefficient != 0')
coef_lasso.shape[0]
307
coef_lasso.sort_values('coefficient', ascending = False)
 | predictor | coefficient | exp_coefficient
---|---|---|---
895 | bizrate.com-o01 | 1.388699 | 4.009628 |
770 | staples.com | 0.757120 | 2.132127 |
690 | travelhook.net | 0.688146 | 1.990022 |
843 | united.com | 0.610771 | 1.841850 |
506 | victoriassecret.com | 0.584030 | 1.793251 |
... | ... | ... | ... |
279 | new.net | -0.181475 | 0.834039 |
374 | coolsavings.com | -0.187875 | 0.828718 |
851 | rsc03.net | -0.231599 | 0.793264 |
443 | checkm8.com | -0.240327 | 0.786371 |
481 | macromedia.com | -0.243739 | 0.783693 |
307 rows × 3 columns
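Because the outcome is log(spend), each exp_coefficient is the multiplicative change in expected spending associated with a one-unit increase in that predictor, holding the others fixed: the bizrate.com-o01 coefficient of 1.389 maps to exp(1.389) ≈ 4.01 (roughly a fourfold increase), while macromedia.com's -0.244 maps to about 0.78 (a 22% decrease).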
# Compute the mean and standard deviation of the CV errors for each alpha.
mean_cv_errors = np.mean(lasso_cv.mse_path_, axis=1)
std_cv_errors = np.std(lasso_cv.mse_path_, axis=1)

plt.figure(figsize=(8, 6))
plt.errorbar(lasso_cv.alphas_, mean_cv_errors, yerr=std_cv_errors, marker='o', linestyle='-', capsize=5)
plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Mean CV Error (MSE)')
plt.title('Cross-Validation Error vs. Alpha')
#plt.gca().invert_xaxis() # Optionally invert the x-axis so lower alphas (less regularization) appear to the right.
plt.axvline(x=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
plt.legend()
plt.show()
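LassoCV.alpha_ is the alpha that minimizes the mean CV error. A common alternative is the one-standard-error rule: choose the largest (most regularized) alpha whose mean CV error is within one standard error of the minimum. A minimal sketch using the quantities just computed (this is not what LassoCV itself reports):

# One-standard-error rule for choosing alpha (a sketch)
n_folds = lasso_cv.mse_path_.shape[1]                  # mse_path_ has shape (n_alphas, n_folds)
se_cv_errors = std_cv_errors / np.sqrt(n_folds)
best_idx = np.argmin(mean_cv_errors)
threshold = mean_cv_errors[best_idx] + se_cv_errors[best_idx]
alpha_1se = lasso_cv.alphas_[mean_cv_errors <= threshold].max()  # largest alpha within 1 SE
print("1-SE rule alpha:", alpha_1se)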
# Compute the coefficient path over the alpha grid that LassoCV used
alphas, coefs, _ = lasso_path(X_train, np.log(y_train),
                              alphas=lasso_cv.alphas_,
                              max_iter=100000)

# Count nonzero coefficients for each alpha (coefs shape: (n_features, n_alphas))
nonzero_counts = np.sum(coefs != 0, axis=0)

# Plot the number of nonzero coefficients versus alpha
plt.figure(figsize=(8,6))
plt.plot(alphas, nonzero_counts, marker='o', linestyle='-')
plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Number of nonzero coefficients')
plt.title('Nonzero Coefficients vs. Alpha')
#plt.gca().invert_xaxis() # Lower alphas (less regularization) on the right
plt.axvline(x=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
plt.legend()
plt.show()
# Compute the lasso path. Note: we use np.log(y_train) because that is what we fit LassoCV on.
alphas, coefs, _ = lasso_path(X_train.values, np.log(y_train.values), alphas=lasso_cv.alphas_, max_iter=100000)

plt.figure(figsize=(8, 6))
# Iterate over each predictor and plot its coefficient path.
for i, col in enumerate(X_train.columns):
    plt.plot(alphas, coefs[i, :], label=col)

plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Coefficient value')
plt.title('Lasso Coefficient Paths')
#plt.gca().invert_xaxis() # Lower alphas (weaker regularization) to the right.
#plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') # Too many predictors for a readable legend
plt.axvline(x=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
#plt.legend()
plt.show()
# RidgeCV with a list of alpha values and 3-fold cross-validation
from sklearn.linear_model import RidgeCV, Ridge

alpha_max = 10
alpha_min_ratio = 1e-4
alpha_min = alpha_max * alpha_min_ratio

# Define candidate alpha values
# (np.logspace takes base-10 exponents, so np.log here stretches the grid; see the note below)
alphas = np.logspace(np.log(alpha_max), np.log(alpha_min), num=5)
alphas
array([2.00717432e+02, 1.00000000e+00, 4.98212830e-03, 2.48216024e-05,
1.23664407e-07])
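Note that np.logspace interprets its endpoints as base-10 exponents, so passing natural logs stretches the grid: it runs from 10^ln(10) ≈ 200.7 down to 10^ln(0.001) ≈ 1.24e-07, as the output above shows, rather than from 10 down to 0.001. If the intent is a grid spanning exactly [alpha_min, alpha_max], np.log10 gives it, as in this sketch:

# Evenly spaced decades from alpha_max down to alpha_min (base-10 exponents)
alphas_log10 = np.logspace(np.log10(alpha_max), np.log10(alpha_min), num=5)
alphas_log10  # array([1.e+01, 1.e+00, 1.e-01, 1.e-02, 1.e-03])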
ridge_cv = RidgeCV(alphas=alphas, cv=3, scoring='neg_mean_squared_error')
ridge_cv.fit(X_train_np, np.log(y_train_np))
print("RidgeCV - Best alpha:", ridge_cv.alpha_)
# Create a DataFrame of the coefficients:
coef_ridge = pd.DataFrame({
    'predictor': list(X_train.columns),
    'coefficient': list(ridge_cv.coef_),
    'exp_coefficient': np.exp( list(ridge_cv.coef_) )
})
# Evaluate
# (Same scale caveat as with lasso: ridge_cv was fit on log spend, so predictions are on the log scale.)
y_pred_ridge = ridge_cv.predict(X_test_np)
mse_ridge = mean_squared_error(y_test_np, y_pred_ridge)
print("RidgeCV - MSE:", mse_ridge)
RidgeCV - Best alpha: 200.71743249053017
RidgeCV - MSE: 88099136.93994579
coef_ridge
 | predictor | coefficient | exp_coefficient
---|---|---|---
0 | atdmt.com | -0.004977 | 0.995035 |
1 | yahoo.com | -0.017194 | 0.982953 |
2 | whenu.com | -0.008336 | 0.991699 |
3 | weatherbug.com | -0.005399 | 0.994616 |
4 | msn.com | -0.004900 | 0.995112 |
... | ... | ... | ... |
995 | effectivebrand.com | -0.098469 | 0.906223 |
996 | dallascowboys.com | -0.039989 | 0.960800 |
997 | leadgenetwork.com | 0.009274 | 1.009317 |
998 | in.us | -0.033527 | 0.967029 |
999 | vistaprint.com | 0.066156 | 1.068394 |
1000 rows × 3 columns
coef_ridge.query('coefficient != 0').shape[0]
1000
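Unlike lasso, ridge shrinks coefficients toward zero but never sets them exactly to zero: all 1,000 predictors keep nonzero coefficients here, versus the 307 that survived the lasso fit above.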
ridge = Ridge()
coefs = []

# Refit ridge across the alpha grid
for a in alphas:
    ridge.set_params(alpha=a)
    ridge.fit(X_train_np, y_train_np)   # fit on raw y_train_np here, unlike the log-scale fits above
    coefs.append(ridge.coef_)

ax = plt.gca()

ax.plot(alphas, coefs)
ax.set_xscale('log')
#ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization');