import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale # zero mean & one s.d.
from sklearn.linear_model import LassoCV, lasso_path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
Homework 4 - Part 1: Lasso Linear Regerssion - Model 1
Beer Markets with Big Demographic Design
= pd.read_csv("https://bcdanl.github.io/data/beer_markets_xbeer_xdemog.zip")
beer #beer = pd.read_csv("https://bcdanl.github.io/data/beer_markets_xbeer_brand_xdemog.zip")
#beer = pd.read_csv("https://bcdanl.github.io/data/beer_markets_xbeer_brand_promo_xdemog.zip")
= beer.drop('ylogprice', axis = 1)
X = beer['ylogprice'] y
= train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test
= y_train.values
y_train = y_test.values y_test
# LassoCV with a range of alpha values
= LassoCV(n_alphas = 100,
lasso_cv = None, # alphas=None automatically generate 100 candidate alpha values
alphas = 5,
cv =42,
random_state=100000) max_iter
lasso_cv.fit(X_train.values, y_train)#lasso_cv.fit(X_train.values, y_train.ravel())
print("LassoCV - Best alpha:", lasso_cv.alpha_)
LassoCV - Best alpha: 8.170612772733373e-05
# Create a DataFrame including the intercept and the coefficients:
= pd.DataFrame({
coef_lasso_beer 'predictor': list(X_train.columns),
'coefficient': list(lasso_cv.coef_),
'exp_coefficient': np.exp( list(lasso_cv.coef_) )
})
# Evaluate
= lasso_cv.predict(X_test)
y_pred_lasso = mean_squared_error(y_test, y_pred_lasso)
mse_lasso print("LassoCV - MSE:", mse_lasso)
LassoCV - MSE: 0.027828386344520374
= coef_lasso_beer[coef_lasso_beer['coefficient'] != 0] coef_lasso_beer_n0
1] X_train.shape[
2641
0] coef_lasso_beer_n0.shape[
394
coef_lasso_beer_n0
predictor | coefficient | exp_coefficient | |
---|---|---|---|
0 | logquantity | -0.142898 | 0.866842 |
1 | container_CAN | -0.054903 | 0.946577 |
2 | brandBUSCH_LIGHT | -0.258170 | 0.772464 |
4 | brandMILLER_LITE | -0.015062 | 0.985051 |
5 | brandNATURAL_LIGHT | -0.315142 | 0.729685 |
... | ... | ... | ... |
2557 | marketBOSTON:npeople5plus | -0.006056 | 0.993963 |
2563 | marketCOLUMBUS:npeople5plus | -0.032305 | 0.968211 |
2564 | marketDALLAS:npeople5plus | 0.045550 | 1.046603 |
2568 | marketEXURBAN_NJ:npeople5plus | 0.127266 | 1.135720 |
2628 | marketSACRAMENTO:npeople5plus | 0.016272 | 1.016405 |
394 rows × 3 columns
# Compute the mean and standard deviation of the CV errors for each alpha.
= np.mean(lasso_cv.mse_path_, axis=1)
mean_cv_errors = np.std(lasso_cv.mse_path_, axis=1)
std_cv_errors
=(8, 6))
plt.figure(figsize=std_cv_errors, marker='o', linestyle='-', capsize=5)
plt.errorbar(lasso_cv.alphas_, mean_cv_errors, yerr'log')
plt.xscale('Alpha')
plt.xlabel('Mean CV Error (MSE)')
plt.ylabel('Cross-Validation Error vs. Alpha')
plt.title(#plt.gca().invert_xaxis() # Optionally invert the x-axis so lower alphas (less regularization) appear to the right.
=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
plt.axvline(x
plt.legend() plt.show()
# Compute the coefficient path over the alpha grid that LassoCV used
= lasso_path(X_train, y_train,
alphas, coefs, _ =lasso_cv.alphas_,
alphas=100000)
max_iter
# Count nonzero coefficients for each alpha (coefs shape: (n_features, n_alphas))
= np.sum(coefs != 0, axis=0)
nonzero_counts
# Plot the number of nonzero coefficients versus alpha
=(8,6))
plt.figure(figsize='o', linestyle='-')
plt.plot(alphas, nonzero_counts, marker'log')
plt.xscale('Alpha')
plt.xlabel('Number of nonzero coefficients')
plt.ylabel('Nonzero Coefficients vs. Alpha')
plt.title(#plt.gca().invert_xaxis() # Lower alphas (less regularization) on the right
=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
plt.axvline(x
plt.legend() plt.show()
# Compute the lasso path. Note: we use np.log(y_train) because that's what you used in LassoCV.
= lasso_path(X_train, y_train,
alphas, coefs, _ =lasso_cv.alphas_,
alphas=100000)
max_iter=(8, 6))
plt.figure(figsize# Iterate over each predictor and plot its coefficient path.
for i, col in enumerate(X_train.columns):
=col)
plt.plot(alphas, coefs[i, :], label
'log')
plt.xscale('Alpha')
plt.xlabel('Coefficient value')
plt.ylabel('Lasso Coefficient Paths')
plt.title(#plt.gca().invert_xaxis() # Lower alphas (weaker regularization) to the right.
#plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
plt.axvline(x#plt.legend()
plt.show()