from google.colab import data_table
# Enable an interactive DataFrame display
data_table.enable_dataframe_formatter()
Lasso Linear Regression
Online Shopping
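Lasso fits a linear model with an L1 penalty that shrinks coefficients and sets many of them exactly to zero. In scikit-learn's parameterization (the objective LassoCV minimizes for each candidate alpha):

$$\min_{\beta}\; \frac{1}{2n}\,\lVert y - X\beta \rVert_2^2 \;+\; \alpha \lVert \beta \rVert_1$$

Larger alpha means a heavier penalty and more coefficients forced to zero; the cross-validation below picks alpha.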
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale # zero mean & one s.d.
from sklearn.linear_model import LassoCV, lasso_path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
df = pd.read_csv("https://bcdanl.github.io/data/browser-online-shopping.zip")
df
Warning: Total number of columns (1001) exceeds max_columns (20). Falling back to pandas display.
 | spend | atdmt.com | yahoo.com | whenu.com | weatherbug.com | msn.com | google.com | aol.com | questionmarket.com | googlesyndication.com-o02 | ... | ugo.com | cox.com | spicymint.com | real.com-o01 | targetnet.com | effectivebrand.com | dallascowboys.com | leadgenetwork.com | in.us | vistaprint.com
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 424 | 4.052026 | 11.855928 | 0.000000 | 0.000000 | 0.250125 | 6.528264 | 0.150075 | 1.350675 | 3.401701 | ... | 0.0 | 0.025013 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
1 | 2335 | 4.448743 | 8.446164 | 0.000000 | 0.000000 | 0.644745 | 0.451322 | 0.128949 | 0.967118 | 1.225016 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
2 | 279 | 7.678815 | 0.487234 | 0.000000 | 31.904112 | 13.213798 | 0.954980 | 0.000000 | 2.124342 | 2.514130 | ... | 0.0 | 0.000000 | 0.000000 | 0.019489 | 0.019489 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
3 | 829 | 13.547802 | 18.509289 | 0.045310 | 0.045310 | 0.294517 | 1.110104 | 0.067966 | 3.194382 | 3.149071 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.067966 |
4 | 221 | 2.879581 | 10.558464 | 0.000000 | 0.000000 | 3.606748 | 1.396161 | 0.000000 | 0.727167 | 0.988947 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9995 | 102 | 6.040454 | 26.350790 | 0.000000 | 0.000000 | 0.000000 | 0.055417 | 4.710446 | 2.909393 | 0.554170 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
9996 | 5096 | 8.044292 | 3.012539 | 9.102752 | 0.032568 | 1.612115 | 1.840091 | 9.037616 | 3.289367 | 1.009608 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
9997 | 883 | 7.053942 | 1.659751 | 0.000000 | 0.000000 | 9.543568 | 0.414938 | 13.692946 | 2.074689 | 2.074689 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
9998 | 256 | 10.408043 | 0.473093 | 0.000000 | 0.000000 | 18.568894 | 0.887049 | 3.370787 | 3.548196 | 0.236546 | ... | 0.0 | 0.532229 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
9999 | 6 | 2.937832 | 11.789234 | 0.018954 | 7.827900 | 0.113723 | 2.710387 | 0.151630 | 0.777104 | 0.398029 | ... | 0.0 | 0.000000 | 0.037908 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
10000 rows × 1001 columns
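Each row is one user; besides spend, the remaining 1,000 columns record browsing activity by web domain (the exact units are not documented here, so treat them as relative activity measures). With p = 1,000 predictors against n = 10,000 observations, unpenalized least squares would overfit, which is what motivates the lasso. A quick sketch, assuming df is loaded as above, to gauge how sparse the predictor matrix is:

# Share of exactly-zero entries across the 1,000 predictor columns (rough sparsity check)
(df.drop('spend', axis=1) == 0).mean().mean()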
X = df.drop('spend', axis = 1)
y = df['spend']
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_np = X_train.values
X_test_np = X_test.values
y_train_np = y_train.values
y_test_np = y_test.values
# LassoCV with a range of alpha values
lasso_cv = LassoCV(n_alphas = 100,    # default is 100
                   alphas = None,     # alphas=None automatically generates 100 candidate alpha values
                   cv = 5,
                   random_state = 42,
                   max_iter = 100000)
lasso_cv.fit(X_train.values, np.log(y_train.values))
print("LassoCV - Best alpha:", lasso_cv.alpha_)
# Create a DataFrame of the coefficients:
coef_lasso = pd.DataFrame({
    'predictor': list(X_train.columns),
    'coefficient': list(lasso_cv.coef_),
    'exp_coefficient': np.exp( list(lasso_cv.coef_) )
})
# Evaluate
# Note: lasso_cv was fit on np.log(y_train), so these predictions are on the log
# scale while y_test is raw spend; the MSE below mixes scales (see the sketch further down).
y_pred_lasso = lasso_cv.predict(X_test.values)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print("LassoCV - MSE:", mse_lasso)
LassoCV - Best alpha: 0.004098154620620373
LassoCV - MSE: 88099012.81854005
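The next cell repeats the same fit using the precomputed NumPy arrays; since X_train_np is just X_train.values, the selected alpha, coefficients, and test MSE come out identical.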
# LassoCV again, this time with the precomputed NumPy arrays
lasso_cv = LassoCV(n_alphas = 100,    # default is 100
                   alphas = None,     # alphas=None automatically generates 100 candidate alpha values
                   cv = 5,
                   random_state = 42,
                   max_iter = 100000)
lasso_cv.fit(X_train_np, np.log(y_train_np))
print("LassoCV - Best alpha:", lasso_cv.alpha_)
# Create a DataFrame of the coefficients:
coef_lasso = pd.DataFrame({
    'predictor': list(X_train.columns),
    'coefficient': list(lasso_cv.coef_),
    'exp_coefficient': np.exp( list(lasso_cv.coef_) )
})
# Evaluate
y_pred_lasso = lasso_cv.predict(X_test_np)
mse_lasso = mean_squared_error(y_test_np, y_pred_lasso)
print("LassoCV - MSE:", mse_lasso)
LassoCV - Best alpha: 0.004098154620620373
LassoCV - MSE: 88099012.81854005
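Both fits used np.log(y_train), so lasso_cv.predict() returns predictions on the log scale, while the MSE above compares them against raw spend; the enormous value reflects that mismatch of scales rather than model quality. A minimal sketch of a scale-consistent evaluation, reusing the objects defined above:

# Scale-consistent evaluation (a sketch; lasso_cv was fit on np.log(y_train_np))
log_mse = mean_squared_error(np.log(y_test_np), lasso_cv.predict(X_test_np))     # both sides on the log scale
dollar_mse = mean_squared_error(y_test_np, np.exp(lasso_cv.predict(X_test_np)))  # predictions back in dollars
print("LassoCV - log-scale MSE:", log_mse)
print("LassoCV - dollar-scale MSE:", dollar_mse)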
coef_lasso = coef_lasso.query('coefficient != 0')
coef_lasso.shape[0]
307
coef_lasso.sort_values('coefficient', ascending = False)
 | predictor | coefficient | exp_coefficient
---|---|---|---
895 | bizrate.com-o01 | 1.388699 | 4.009628 |
770 | staples.com | 0.757120 | 2.132127 |
690 | travelhook.net | 0.688146 | 1.990022 |
843 | united.com | 0.610771 | 1.841850 |
506 | victoriassecret.com | 0.584030 | 1.793251 |
... | ... | ... | ... |
279 | new.net | -0.181475 | 0.834039 |
374 | coolsavings.com | -0.187875 | 0.828718 |
851 | rsc03.net | -0.231599 | 0.793264 |
443 | checkm8.com | -0.240327 | 0.786371 |
481 | macromedia.com | -0.243739 | 0.783693 |
307 rows × 3 columns
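Because the outcome is log(spend), each exp_coefficient is the multiplicative change in expected spending associated with a one-unit increase in that predictor, holding the others fixed: the bizrate.com-o01 coefficient of 1.389 maps to exp(1.389) ≈ 4.01 (roughly a fourfold increase), while macromedia.com's -0.244 maps to about 0.78 (a 22% decrease).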
# Compute the mean and standard deviation of the CV errors for each alpha.
mean_cv_errors = np.mean(lasso_cv.mse_path_, axis=1)
std_cv_errors = np.std(lasso_cv.mse_path_, axis=1)

plt.figure(figsize=(8, 6))
plt.errorbar(lasso_cv.alphas_, mean_cv_errors, yerr=std_cv_errors, marker='o', linestyle='-', capsize=5)
plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Mean CV Error (MSE)')
plt.title('Cross-Validation Error vs. Alpha')
#plt.gca().invert_xaxis() # Optionally invert the x-axis so lower alphas (less regularization) appear to the right.
plt.axvline(x=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
plt.legend()
plt.show()
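LassoCV.alpha_ is the alpha that minimizes the mean CV error. A common alternative is the one-standard-error rule: choose the largest (most regularized) alpha whose mean CV error is within one standard error of the minimum. A minimal sketch using the quantities just computed (this is not what LassoCV itself reports):

# One-standard-error rule for choosing alpha (a sketch)
n_folds = lasso_cv.mse_path_.shape[1]                  # mse_path_ has shape (n_alphas, n_folds)
se_cv_errors = std_cv_errors / np.sqrt(n_folds)
best_idx = np.argmin(mean_cv_errors)
threshold = mean_cv_errors[best_idx] + se_cv_errors[best_idx]
alpha_1se = lasso_cv.alphas_[mean_cv_errors <= threshold].max()  # largest alpha within 1 SE
print("1-SE rule alpha:", alpha_1se)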
# Compute the coefficient path over the alpha grid that LassoCV used
alphas, coefs, _ = lasso_path(X_train, np.log(y_train),
                              alphas=lasso_cv.alphas_,
                              max_iter=100000)

# Count nonzero coefficients for each alpha (coefs shape: (n_features, n_alphas))
nonzero_counts = np.sum(coefs != 0, axis=0)

# Plot the number of nonzero coefficients versus alpha
plt.figure(figsize=(8,6))
plt.plot(alphas, nonzero_counts, marker='o', linestyle='-')
plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Number of nonzero coefficients')
plt.title('Nonzero Coefficients vs. Alpha')
#plt.gca().invert_xaxis() # Lower alphas (less regularization) on the right
plt.axvline(x=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
plt.legend()
plt.show()
# Compute the lasso path. Note: we use np.log(y_train) because that is what we fit LassoCV on.
alphas, coefs, _ = lasso_path(X_train.values, np.log(y_train.values), alphas=lasso_cv.alphas_, max_iter=100000)

plt.figure(figsize=(8, 6))
# Iterate over each predictor and plot its coefficient path.
for i, col in enumerate(X_train.columns):
    plt.plot(alphas, coefs[i, :], label=col)

plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Coefficient value')
plt.title('Lasso Coefficient Paths')
#plt.gca().invert_xaxis() # Lower alphas (weaker regularization) to the right.
#plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') # Too many predictors for a readable legend
plt.axvline(x=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
#plt.legend()
plt.show()
# RidgeCV with a list of alpha values and 3-fold cross-validation
from sklearn.linear_model import RidgeCV, Ridge

alpha_max = 10
alpha_min_ratio = 1e-4
alpha_min = alpha_max * alpha_min_ratio

# Define candidate alpha values
# (np.logspace takes base-10 exponents, so np.log here stretches the grid; see the note below)
alphas = np.logspace(np.log(alpha_max), np.log(alpha_min), num=5)
alphas
array([2.00717432e+02, 1.00000000e+00, 4.98212830e-03, 2.48216024e-05,
1.23664407e-07])
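Note that np.logspace interprets its endpoints as base-10 exponents, so passing natural logs stretches the grid: it runs from 10^ln(10) ≈ 200.7 down to 10^ln(0.001) ≈ 1.24e-07, as the output above shows, rather than from 10 down to 0.001. If the intent is a grid spanning exactly [alpha_min, alpha_max], np.log10 gives it, as in this sketch:

# Evenly spaced decades from alpha_max down to alpha_min (base-10 exponents)
alphas_log10 = np.logspace(np.log10(alpha_max), np.log10(alpha_min), num=5)
alphas_log10  # array([1.e+01, 1.e+00, 1.e-01, 1.e-02, 1.e-03])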
ridge_cv = RidgeCV(alphas=alphas, cv=3, scoring='neg_mean_squared_error')
ridge_cv.fit(X_train_np, np.log(y_train_np))
print("RidgeCV - Best alpha:", ridge_cv.alpha_)
# Create a DataFrame of the coefficients:
coef_ridge = pd.DataFrame({
    'predictor': list(X_train.columns),
    'coefficient': list(ridge_cv.coef_),
    'exp_coefficient': np.exp( list(ridge_cv.coef_) )
})
# Evaluate
# (Same scale caveat as with lasso: ridge_cv was fit on log spend, so predictions are on the log scale.)
y_pred_ridge = ridge_cv.predict(X_test_np)
mse_ridge = mean_squared_error(y_test_np, y_pred_ridge)
print("RidgeCV - MSE:", mse_ridge)
RidgeCV - Best alpha: 200.71743249053017
RidgeCV - MSE: 88099136.93994579
coef_ridge
 | predictor | coefficient | exp_coefficient
---|---|---|---
0 | atdmt.com | -0.004977 | 0.995035 |
1 | yahoo.com | -0.017194 | 0.982953 |
2 | whenu.com | -0.008336 | 0.991699 |
3 | weatherbug.com | -0.005399 | 0.994616 |
4 | msn.com | -0.004900 | 0.995112 |
... | ... | ... | ... |
995 | effectivebrand.com | -0.098469 | 0.906223 |
996 | dallascowboys.com | -0.039989 | 0.960800 |
997 | leadgenetwork.com | 0.009274 | 1.009317 |
998 | in.us | -0.033527 | 0.967029 |
999 | vistaprint.com | 0.066156 | 1.068394 |
1000 rows × 3 columns
coef_ridge.query('coefficient != 0').shape[0]
1000
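Unlike lasso, ridge shrinks coefficients toward zero but never sets them exactly to zero: all 1,000 predictors keep nonzero coefficients here, versus the 307 that survived the lasso fit above.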
ridge = Ridge()
coefs = []

# Refit ridge across the alpha grid
for a in alphas:
    ridge.set_params(alpha=a)
    ridge.fit(X_train_np, y_train_np)   # fit on raw y_train_np here, unlike the log-scale fits above
    coefs.append(ridge.coef_)

ax = plt.gca()

ax.plot(alphas, coefs)
ax.set_xscale('log')
#ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization');