import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale # zero mean & one s.d.
from sklearn.linear_model import LassoCV, lasso_path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
Homework 4 - Part 1: Lasso Linear Regression - Model 3 with Discussions
Beer Markets with Big Demographic Design
#beer = pd.read_csv("https://bcdanl.github.io/data/beer_markets_xbeer_xdemog.zip")
#beer = pd.read_csv("https://bcdanl.github.io/data/beer_markets_xbeer_brand_xdemog.zip")
beer = pd.read_csv("https://bcdanl.github.io/data/beer_markets_xbeer_brand_promo_xdemog.zip")

# Outcome is log(price); everything else is a predictor
X = beer.drop('ylogprice', axis=1)
y = beer['ylogprice']

# 80/20 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the targets to plain NumPy arrays
y_train = y_train.values
y_test = y_test.values
# LassoCV with a range of alpha values
lasso_cv = LassoCV(n_alphas=100,
                   alphas=None,  # alphas=None automatically generates 100 candidate alpha values
                   cv=5,
                   random_state=42,
                   max_iter=100000)
lasso_cv.fit(X_train.values, y_train)  # lasso_cv.fit(X_train.values, y_train.ravel())
print("LassoCV - Best alpha:", lasso_cv.alpha_)
LassoCV - Best alpha: 0.00022539867869301466
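As a sanity check, the selected alpha should be the grid point with the lowest mean CV error; a minimal sketch using the fitted attributes:

# Verify alpha_ against the CV error path (mse_path_ has shape (n_alphas, n_folds))
mean_cv = lasso_cv.mse_path_.mean(axis=1)
print(lasso_cv.alphas_[mean_cv.argmin()])  # should equal lasso_cv.alpha_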
# Create a DataFrame of the predictors and their lasso coefficients:
coef_lasso_beer = pd.DataFrame({
    'predictor': list(X_train.columns),
    'coefficient': list(lasso_cv.coef_),
    'exp_coefficient': np.exp(list(lasso_cv.coef_))  # exp(coef) ≈ multiplicative effect on price
})
# Evaluate on the held-out test set
y_pred_lasso = lasso_cv.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print("LassoCV - MSE:", mse_lasso)
LassoCV - MSE: 0.02863537005154527
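Note that `scale` is imported above but never used: the lasso penalty shrinks all coefficients at a common rate, so predictors on larger scales are effectively penalized less. A minimal sketch of standardizing inside the fit, assuming a scikit-learn pipeline is acceptable here (this is not part of the model estimated above):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize each predictor to zero mean and unit s.d. before the lasso fit
lasso_std = make_pipeline(StandardScaler(), LassoCV(cv=5, random_state=42, max_iter=100000))
lasso_std.fit(X_train, y_train)
print("Standardized LassoCV - Best alpha:", lasso_std[-1].alpha_)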
# Keep only the predictors the lasso did not zero out
coef_lasso_beer_n0 = coef_lasso_beer[coef_lasso_beer['coefficient'] != 0]

X_train.shape[1]
2655

coef_lasso_beer_n0.shape[0]
163

coef_lasso_beer_n0
| | predictor | coefficient | exp_coefficient |
|---|---|---|---|
| 0 | logquantity | -0.136277 | 0.872601 |
| 1 | container_CAN | -0.056243 | 0.945309 |
| 2 | brandBUSCH_LIGHT | -0.072086 | 0.930451 |
| 4 | brandMILLER_LITE | 0.001853 | 1.001854 |
| 5 | brandNATURAL_LIGHT | -0.457505 | 0.632861 |
| ... | ... | ... | ... |
| 2372 | marketRURAL_WEST_VIRGINIA:npeople2 | -0.032313 | 0.968203 |
| 2385 | marketTAMPA:npeople2 | 0.018947 | 1.019127 |
| 2386 | marketURBAN_NY:npeople2 | 0.001501 | 1.001502 |
| 2426 | marketRALEIGH-DURHAM:npeople3 | -0.005387 | 0.994627 |
| 2582 | marketEXURBAN_NJ:npeople5plus | 0.080731 | 1.084079 |

163 rows × 3 columns
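To see which of the 163 surviving predictors matter most, sort them by absolute coefficient size; a minimal sketch:

# Rank the nonzero coefficients by magnitude
top_coefs = (coef_lasso_beer_n0
             .assign(abs_coef=coef_lasso_beer_n0['coefficient'].abs())
             .sort_values('abs_coef', ascending=False)
             .drop(columns='abs_coef'))
print(top_coefs.head(10))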
# Compute the mean and standard deviation of the CV errors for each alpha.
mean_cv_errors = np.mean(lasso_cv.mse_path_, axis=1)
std_cv_errors = np.std(lasso_cv.mse_path_, axis=1)

plt.figure(figsize=(8, 6))
plt.errorbar(lasso_cv.alphas_, mean_cv_errors, yerr=std_cv_errors, marker='o', linestyle='-', capsize=5)
plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Mean CV Error (MSE)')
plt.title('Cross-Validation Error vs. Alpha')
#plt.gca().invert_xaxis()  # Optionally invert the x-axis so lower alphas (less regularization) appear to the right.
plt.axvline(x=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
plt.legend()
plt.show()
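A common refinement, not used above, is the one-standard-error rule: pick the largest alpha whose mean CV error is within one standard error of the minimum, trading a little fit for a sparser model. A minimal sketch on the quantities just computed:

# One-standard-error rule on the CV curve
n_folds = lasso_cv.mse_path_.shape[1]
se_at_min = std_cv_errors[mean_cv_errors.argmin()] / np.sqrt(n_folds)
threshold = mean_cv_errors.min() + se_at_min
alpha_1se = lasso_cv.alphas_[mean_cv_errors <= threshold].max()
print("1-SE alpha:", alpha_1se)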
# Compute the coefficient path over the alpha grid that LassoCV used
alphas, coefs, _ = lasso_path(X_train, y_train,
                              alphas=lasso_cv.alphas_,
                              max_iter=100000)

# Count nonzero coefficients for each alpha (coefs shape: (n_features, n_alphas))
nonzero_counts = np.sum(coefs != 0, axis=0)

# Plot the number of nonzero coefficients versus alpha
plt.figure(figsize=(8, 6))
plt.plot(alphas, nonzero_counts, marker='o', linestyle='-')
plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Number of nonzero coefficients')
plt.title('Nonzero Coefficients vs. Alpha')
#plt.gca().invert_xaxis()  # Lower alphas (less regularization) on the right
plt.axvline(x=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
plt.legend()
plt.show()
# Compute the lasso path over the same alpha grid (y_train is already on the log-price scale).
alphas, coefs, _ = lasso_path(X_train, y_train,
                              alphas=lasso_cv.alphas_,
                              max_iter=100000)

plt.figure(figsize=(8, 6))
# Iterate over each predictor and plot its coefficient path.
for i, col in enumerate(X_train.columns):
    plt.plot(alphas, coefs[i, :], label=col)

plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Coefficient value')
plt.title('Lasso Coefficient Paths')
#plt.gca().invert_xaxis()  # Lower alphas (weaker regularization) to the right.
#plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.axvline(x=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
#plt.legend()
plt.show()
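With 2,655 predictors the full path plot is unreadable as a whole. A minimal sketch that plots only the ten paths largest in magnitude at the selected alpha (the cutoff of ten is an arbitrary choice, not part of the assignment):

# Indices of the 10 coefficients largest in absolute value at the chosen alpha
top10 = np.argsort(np.abs(lasso_cv.coef_))[-10:]

plt.figure(figsize=(8, 6))
for i in top10:
    plt.plot(alphas, coefs[i, :], label=X_train.columns[i])
plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Coefficient value')
plt.title('Ten Largest Lasso Coefficient Paths')
plt.axvline(x=lasso_cv.alpha_, color='red', linestyle='--', label='Best alpha')
plt.legend(fontsize=8)
plt.show()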
Best Alphas
- Model 1: 0.000082
- Model 2: 0.000225
- Model 3: 0.000225
Sensitivity Estimates without Demographic Controls (Homework 2)
| Brand | Model 1 | Model 2 | Model 3 (no Promo) | Model 3 (with Promo) |
|---|---|---|---|---|
| BUD | -0.142 | -0.146 | -0.140 | -0.148 = −0.140 − 0.008 |
| BUSCH | -0.142 | -0.159 = −0.146 − 0.013 | -0.161 = −0.140 − 0.021 | -0.122 = −0.140 − 0.021 − 0.008 + 0.047 |
| COORS | -0.142 | -0.146 = −0.146 − 0 | -0.148 = −0.140 − 0.008 | -0.119 = −0.140 − 0.008 − 0.008 + 0.037 |
| MILLER | -0.142 | -0.163 = −0.146 − 0.017 | -0.163 = −0.140 − 0.023 | -0.119 = −0.140 − 0.023 − 0.008 + 0.052 |
| NATURAL | -0.142 | -0.094 = −0.146 + 0.052 | -0.103 = −0.140 + 0.037 | -0.040 = −0.140 + 0.037 − 0.008 + 0.071 |
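Each promo-price entry is assembled from four lasso coefficients: the Bud baseline slope, the brand interaction, the main promo effect, and the brand:promo interaction. A minimal sketch of that bookkeeping; the interaction names used below (`logquantity:brand...`, `logquantity:promo`) are illustrative assumptions, not necessarily the exact column labels in `coef_lasso_beer`:

# Hypothetical lookup: coefficient by predictor name (0.0 if the lasso dropped it)
def coef(name):
    match = coef_lasso_beer.loc[coef_lasso_beer['predictor'] == name, 'coefficient']
    return float(match.iloc[0]) if len(match) else 0.0

# e.g., BUSCH with promo = baseline + brand shift + promo effect + brand:promo shift
busch_promo = (coef('logquantity')                            # Bud baseline slope
               + coef('logquantity:brandBUSCH_LIGHT')         # brand interaction (assumed name)
               + coef('logquantity:promo')                    # main promo effect (assumed name)
               + coef('logquantity:brandBUSCH_LIGHT:promo'))  # brand:promo (assumed name)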
Sensitivity Estimates with Demographic Controls
| Brand | Model 1 | Model 2 | Model 3 (no Promo) | Model 3 (with Promo) |
|---|---|---|---|---|
| BUD | -0.1429 | -0.1408 | -0.1363 | -0.1403 = −0.1363 − 0.0040 |
| BUSCH | -0.1429 | -0.1719 = −0.1408 − 0.0311 | -0.1708 = −0.1363 − 0.0345 | -0.1418 = −0.1363 − 0.0345 + 0.0040 + 0.025 |
| COORS | -0.1429 | -0.1414 = −0.1408 − 0.0006 | -0.1365 = −0.1363 − 0.0002 | -0.1355 = −0.1363 − 0.0002 + 0.0040 − 0.0030 |
| MILLER | -0.1429 | -0.1443 = −0.1408 − 0.0035 | -0.1401 = −0.1363 − 0.0038 | -0.1353 = −0.1363 − 0.0038 + 0.0040 + 0.0008 |
| NATURAL | -0.1429 | -0.1113 = −0.1408 + 0.0295 | -0.1105 = −0.1363 + 0.0258 | -0.1058 = −0.1363 + 0.0258 + 0.0040 + 0.0007 |
Sensitivity Visuals with and without Demographic Controls
# Slopes (inverse elasticities), transcribed from the tables above
model1_slope = -0.142
model2_slopes = {
    'Bud': -0.146,
    'Busch': -0.159,
    'Coors': -0.146,
    'Miller': -0.163,
    'Natural': -0.094
}
model3_full_slopes = {
    'Bud': -0.140,
    'Busch': -0.161,
    'Coors': -0.148,
    'Miller': -0.163,
    'Natural': -0.103
}
model3_promo_slopes = {
    'Bud': -0.148,
    'Busch': -0.122,
    'Coors': -0.119,
    'Miller': -0.096,
    'Natural': -0.077
}

model1_demog_slope = -0.1429
model2_demog_slopes = {
    'Bud': -0.1408,
    'Busch': -0.1719,
    'Coors': -0.1414,
    'Miller': -0.1443,
    'Natural': -0.1113
}
model3_demog_full_slopes = {
    'Bud': -0.1363,
    'Busch': -0.1708,
    'Coors': -0.1365,
    'Miller': -0.1401,
    'Natural': -0.1105
}
model3_demog_promo_slopes = {
    'Bud': -0.1403,
    'Busch': -0.1418,
    'Coors': -0.1355,
    'Miller': -0.1353,
    'Natural': -0.1058
}
# Create range of log(sales) values
x = np.linspace(0, 10, 100)

# Set up a 2x4 grid of subplots
fig, axes = plt.subplots(2, 4, figsize=(16, 8), sharex=True, sharey=True)

# First row: models without demographics
# Model 1
ax = axes[0, 0]
ax.plot(x, model1_slope * x, color='tab:blue', label='All Brands')
ax.set_title('Model 1')
ax.set_ylabel('log(price)')
ax.grid(True)
ax.legend(loc='lower left')

# Model 2
ax = axes[0, 1]
for brand, slope in model2_slopes.items():
    ax.plot(x, slope * x, label=brand)
ax.set_title('Model 2')
ax.grid(True)
ax.legend(loc='lower left')

# Model 3 Full (No Promo)
ax = axes[0, 2]
for brand, slope in model3_full_slopes.items():
    ax.plot(x, slope * x, label=brand)
ax.set_title('Model 3 Full')
ax.grid(True)
ax.legend(loc='lower left')

# Model 3 Promo
ax = axes[0, 3]
for brand, slope in model3_promo_slopes.items():
    ax.plot(x, slope * x, label=brand)
ax.set_title('Model 3 Promo')
ax.grid(True)
ax.legend(loc='lower left')

# Second row: models with demographics
# Model 1 + Demog
ax = axes[1, 0]
ax.plot(x, model1_demog_slope * x, color='tab:orange', label='All Brands + Demog')
ax.set_title('Model 1 (+ Demog)')
ax.set_ylabel('log(price)')
ax.set_xlabel('log(sales)')
ax.grid(True)
ax.legend(loc='lower left')

# Model 2 + Demog
ax = axes[1, 1]
for brand, slope in model2_demog_slopes.items():
    ax.plot(x, slope * x, label=brand)
ax.set_title('Model 2 (+ Demog)')
ax.set_xlabel('log(sales)')
ax.grid(True)
ax.legend(loc='lower left')

# Model 3 Full + Demog
ax = axes[1, 2]
for brand, slope in model3_demog_full_slopes.items():
    ax.plot(x, slope * x, label=brand)
ax.set_title('Model 3 Full (+ Demog)')
ax.set_xlabel('log(sales)')
ax.grid(True)
ax.legend(loc='lower left')

# Model 3 Promo + Demog
ax = axes[1, 3]
for brand, slope in model3_demog_promo_slopes.items():
    ax.plot(x, slope * x, label=brand)
ax.set_title('Model 3 Promo (+ Demog)')
ax.set_xlabel('log(sales)')
ax.grid(True)
ax.legend(loc='lower left')

plt.tight_layout()
plt.show()
How Demographic Controls Affect Price–Volume Sensitivity
Adding demographic covariates shifts the implied own‑price elasticities by varying amounts across models and brands.
Model 1 (Common Elasticity)
Specification | Elasticity |
---|---|
Without demographics | –0.1420 |
With demographics | –0.1429 |
Δ | –0.0009 |
Effect | Slightly more elastic (negligible change) |
Model 2 (Brand‑Specific Elasticities)
Brand | Without Demog | With Demog | Δ | Effect |
---|---|---|---|---|
Bud | –0.1460 | –0.1408 | +0.0052 | Less elastic |
Busch | –0.1590 | –0.1719 | –0.0129 | More elastic |
Coors | –0.1460 | –0.1414 | +0.0046 | Less elastic |
Miller | –0.1630 | –0.1443 | +0.0187 | Much less elastic |
Natural | –0.0940 | –0.1113 | –0.0173 | More elastic |
Interpretation: Demographics slightly dampen sensitivity for Bud, Coors, and Miller but increase it for Busch and Natural.
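The Δ column is simply the with-demographics slope minus the without-demographics slope; a minimal sketch that reproduces it from the dictionaries defined earlier:

# Δ = with-demographics slope − without-demographics slope (positive Δ means less elastic)
delta_model2 = pd.DataFrame({
    'without': model2_slopes,
    'with': model2_demog_slopes,
})
delta_model2['delta'] = delta_model2['with'] - delta_model2['without']
print(delta_model2.round(4))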
Model 3 Full‑Price (No Promo)
Brand | Without Demog | With Demog | Δ | Effect |
---|---|---|---|---|
Bud | –0.1400 | –0.1363 | +0.0037 | Less elastic |
Busch | –0.1610 | –0.1708 | –0.0098 | More elastic |
Coors | –0.1480 | –0.1365 | +0.0115 | Less elastic |
Miller | –0.1630 | –0.1401 | +0.0229 | Much less elastic |
Natural | –0.1030 | –0.1105 | –0.0075 | More elastic |
Interpretation: Similar pattern to Model 2; demographic controls modestly increase or decrease brand‑specific slopes.
Model 3 Promotional‑Price
Brand | Without Demog | With Demog | Δ | Effect |
---|---|---|---|---|
Bud | –0.1480 | –0.1403 | +0.0077 | Less elastic |
Busch | –0.1220 | –0.1418 | –0.0198 | More elastic |
Coors | –0.1190 | –0.1355 | –0.0165 | More elastic |
Miller | –0.0960 | –0.1353 | –0.0393 | Much more elastic |
Natural | –0.0770 | –0.1058 | –0.0288 | More elastic |
Interpretation: Demographic adjustments have their largest impact in the promotional-price model; four of the five brands become more elastic, with Miller shifting the most (Δ = −0.0393).
Summary
- Model 1: Virtually no change when demographics are added.
- Models 2 & 3 without Promo: Elasticities shift modestly by brand—some become slightly more price‑sensitive, others slightly less.
- Model 3 Promo: Most brands become notably more elastic once demographics are included. This implies that consumer characteristics play a key role in how promotions affect beer pricing.
Test Set RMSE Comparison
Model | RMSE (no demogs) | Test MSE (with demogs) | RMSE (with demogs) |
---|---|---|---|
1 | 0.167 | 0.0278 | 0.167 |
2 | 0.166 | 0.0290 | 0.170 |
3 | 0.165 | 0.0286 | 0.169 |

The with-demogs models report test MSEs (Model 3's 0.0286 is the LassoCV MSE printed above); taking square roots puts them on the same RMSE scale as the Homework 2 figures. On that common scale, test-set prediction error is essentially unchanged once demographics are added, so the demographic controls matter more for the brand-level sensitivity estimates than for raw predictive accuracy.
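A one-liner makes the scale conversion explicit, using the Model 3 test MSE computed earlier:

# RMSE is the square root of MSE, putting the error back on the log-price scale
rmse_lasso = np.sqrt(mse_lasso)
print("LassoCV - RMSE:", rmse_lasso)  # ≈ 0.169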