Classwork 9
Linear Regression II
Setup for PySpark, UDFs, and Plots
Required Libraries and SparkSession
Entry Point
import pandas as pd
import numpy as np
from tabulate import tabulate # for table summary
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm # for lowess smoothing
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, col, pow, mean, avg, when, log, sqrt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
spark = SparkSession.builder.master("local[*]").getOrCreate()
UDF for Regression Tables
def regression_table(model, assembler):
    """
    Creates a formatted regression table from a fitted LinearRegression model and its VectorAssembler.

    If the model's labelCol (retrieved using getLabelCol()) starts with "log", an extra column showing np.exp(coeff)
    is added immediately after the beta estimate column for predictor rows. Additionally, np.exp() of the 95% CI
    Lower and Upper bounds is also added unless the predictor's name includes "log_". The Intercept row does not
    include exponentiated values.

    When labelCol starts with "log", the columns are ordered as:
      y: [label] | Beta | Exp(Beta) | Sig. | Std. Error | p-value | 95% CI Lower | Exp(95% CI Lower) | 95% CI Upper | Exp(95% CI Upper)

    Otherwise, the columns are:
      y: [label] | Beta | Sig. | Std. Error | p-value | 95% CI Lower | 95% CI Upper

    Parameters:
        model: A fitted LinearRegression model (with a .summary attribute and a labelCol).
        assembler: The VectorAssembler used to assemble the features for the model.

    Returns:
        A formatted string containing the regression table.
    """
    # Determine if we should display exponentiated values for coefficients.
    is_log = model.getLabelCol().lower().startswith("log")

    # Extract coefficients and standard errors as NumPy arrays.
    coeffs = model.coefficients.toArray()
    std_errors_all = np.array(model.summary.coefficientStandardErrors)

    # Check if the intercept's standard error is included (one extra element).
    if len(std_errors_all) == len(coeffs) + 1:
        intercept_se = std_errors_all[0]
        std_errors = std_errors_all[1:]
    else:
        intercept_se = None
        std_errors = std_errors_all

    # Use provided tValues and pValues.
    df = model.summary.numInstances - len(coeffs) - 1
    t_critical = stats.t.ppf(0.975, df)
    p_values = model.summary.pValues

    # Helper: significance stars.
    def significance_stars(p):
        if p < 0.01:
            return "***"
        elif p < 0.05:
            return "**"
        elif p < 0.1:
            return "*"
        else:
            return ""

    # Build table rows for each feature.
    table = []
    for feature, beta, se, p in zip(assembler.getInputCols(), coeffs, std_errors, p_values):
        ci_lower = beta - t_critical * se
        ci_upper = beta + t_critical * se

        # Check if predictor contains "log_" to determine if exponentiation should be applied.
        apply_exp = is_log and "log_" not in feature.lower()

        exp_beta = np.exp(beta) if apply_exp else ""
        exp_ci_lower = np.exp(ci_lower) if apply_exp else ""
        exp_ci_upper = np.exp(ci_upper) if apply_exp else ""

        if is_log:
            table.append([
                feature,                # Predictor name
                beta,                   # Beta estimate
                exp_beta,               # Exponential of beta (or blank)
                significance_stars(p),
                se,
                p,
                ci_lower,
                exp_ci_lower,           # Exponential of 95% CI lower bound
                ci_upper,
                exp_ci_upper            # Exponential of 95% CI upper bound
            ])
        else:
            table.append([
                feature,
                beta,
                significance_stars(p),
                se,
                p,
                ci_lower,
                ci_upper
            ])

    # Process intercept.
    if intercept_se is not None:
        intercept_p = model.summary.pValues[0] if model.summary.pValues is not None else None
        intercept_sig = significance_stars(intercept_p)
        ci_intercept_lower = model.intercept - t_critical * intercept_se
        ci_intercept_upper = model.intercept + t_critical * intercept_se
    else:
        intercept_sig = ""
        ci_intercept_lower = ""
        ci_intercept_upper = ""
        intercept_se = ""

    if is_log:
        table.append([
            "Intercept",
            model.intercept,
            "",                         # No np.exp(model.intercept) for the intercept row
            intercept_sig,
            intercept_se,
            "",
            ci_intercept_lower,
            "",
            ci_intercept_upper,
            ""
        ])
    else:
        table.append([
            "Intercept",
            model.intercept,
            intercept_sig,
            intercept_se,
            "",
            ci_intercept_lower,
            ci_intercept_upper
        ])

    # Append overall model metrics.
    if is_log:
        table.append(["Observations", model.summary.numInstances, "", "", "", "", "", "", "", ""])
        table.append(["R²", model.summary.r2, "", "", "", "", "", "", "", ""])
        table.append(["RMSE", model.summary.rootMeanSquaredError, "", "", "", "", "", "", "", ""])
    else:
        table.append(["Observations", model.summary.numInstances, "", "", "", "", ""])
        table.append(["R²", model.summary.r2, "", "", "", "", ""])
        table.append(["RMSE", model.summary.rootMeanSquaredError, "", "", "", "", ""])

    # Format the table rows.
    formatted_table = []
    for row in table:
        formatted_row = []
        for i, item in enumerate(row):
            # Format Observations as integer with commas.
            if row[0] == "Observations" and i == 1 and isinstance(item, (int, float, np.floating)) and item != "":
                formatted_row.append(f"{int(item):,}")
            elif isinstance(item, (int, float, np.floating)) and item != "":
                if is_log:
                    # When is_log, the columns are:
                    # 0: Metric, 1: Beta, 2: Exp(Beta), 3: Sig, 4: Std. Error, 5: p-value,
                    # 6: 95% CI Lower, 7: Exp(95% CI Lower), 8: 95% CI Upper, 9: Exp(95% CI Upper).
                    if i in [1, 2, 4, 6, 7, 8, 9]:
                        formatted_row.append(f"{item:,.3f}")
                    elif i == 5:
                        formatted_row.append(f"{item:.3f}")
                    else:
                        formatted_row.append(f"{item:.3f}")
                else:
                    # When not is_log, the columns are:
                    # 0: Metric, 1: Beta, 2: Sig, 3: Std. Error, 4: p-value, 5: 95% CI Lower, 6: 95% CI Upper.
                    if i in [1, 3, 5, 6]:
                        formatted_row.append(f"{item:,.3f}")
                    elif i == 4:
                        formatted_row.append(f"{item:.3f}")
                    else:
                        formatted_row.append(f"{item:.3f}")
            else:
                formatted_row.append(item)
        formatted_table.append(formatted_row)

    # Set header and column alignment based on whether label starts with "log".
    if is_log:
        headers = [
            f"y: {model.getLabelCol()}",
            "Beta", "Exp(Beta)", "Sig.", "Std. Error", "p-value",
            "95% CI Lower", "Exp(95% CI Lower)", "95% CI Upper", "Exp(95% CI Upper)"
        ]
        colalign = ("left", "right", "right", "center", "right", "right", "right", "right", "right", "right")
    else:
        headers = [f"y: {model.getLabelCol()}", "Beta", "Sig.", "Std. Error", "p-value", "95% CI Lower", "95% CI Upper"]
        colalign = ("left", "right", "center", "right", "right", "right", "right")

    table_str = tabulate(
        formatted_table,
        headers=headers,
        tablefmt="pretty",
        colalign=colalign
    )

    # Insert a dashed line after the Intercept row.
    lines = table_str.split("\n")
    dash_line = '-' * len(lines[0])
    for i, line in enumerate(lines):
        if "Intercept" in line and not line.strip().startswith('+'):
            lines.insert(i + 1, dash_line)
            break

    return "\n".join(lines)
# Example usage:
# print(regression_table(model_1, assembler_1))
UDF for Adding Dummy Variables
def add_dummy_variables(var_name, reference_level, category_order=None):
    """
    Creates dummy variables for the specified column in the global DataFrames dtrain and dtest.
    Allows manual setting of category order.

    Parameters:
        var_name (str): The name of the categorical column (e.g., "borough_name").
        reference_level (int): Index of the category to be used as the reference (dummy omitted).
        category_order (list, optional): List of categories in the desired order. If None, categories are sorted.

    Returns:
        dummy_cols (list): List of dummy column names excluding the reference category.
        ref_category (str): The category chosen as the reference.
    """
    global dtrain, dtest

    # Get distinct categories from the training set.
    categories = dtrain.select(var_name).distinct().rdd.flatMap(lambda x: x).collect()

    # Convert booleans to strings if present.
    categories = [str(c) if isinstance(c, bool) else c for c in categories]

    # Use manual category order if provided; otherwise, sort categories.
    if category_order:
        # Ensure all categories are present in the user-defined order.
        missing = set(categories) - set(category_order)
        if missing:
            raise ValueError(f"These categories are missing from your custom order: {missing}")
        categories = category_order
    else:
        categories = sorted(categories)

    # Validate reference_level.
    if reference_level < 0 or reference_level >= len(categories):
        raise ValueError(f"reference_level must be between 0 and {len(categories) - 1}")

    # Define the reference category.
    ref_category = categories[reference_level]
    print("Reference category (dummy omitted):", ref_category)

    # Create dummy variables for all categories.
    for cat in categories:
        dummy_col_name = var_name + "_" + str(cat).replace(" ", "_")
        dtrain = dtrain.withColumn(dummy_col_name, when(col(var_name) == cat, 1).otherwise(0))
        dtest = dtest.withColumn(dummy_col_name, when(col(var_name) == cat, 1).otherwise(0))

    # List of dummy columns, excluding the reference category.
    dummy_cols = [var_name + "_" + str(cat).replace(" ", "_") for cat in categories if cat != ref_category]

    return dummy_cols, ref_category
# Example usage without category_order:
# dummy_cols_year, ref_category_year = add_dummy_variables('year', 0)
# Example usage with category_order:
# custom_order_wkday = ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday']
# dummy_cols_wkday, ref_category_wkday = add_dummy_variables('wkday', reference_level=0, category_order = custom_order_wkday)
UDF for Adding Interaction Terms
def add_interaction_terms(var_list1, var_list2, var_list3=None):
    """
    Creates interaction term columns in the global DataFrames dtrain and dtest.

    For two sets of variable names (which may represent categorical (dummy) or continuous variables),
    this function creates two-way interactions by multiplying each variable in var_list1 with each
    variable in var_list2.

    Optionally, if a third list of variable names (var_list3) is provided, the function also creates
    three-way interactions among each variable in var_list1, each variable in var_list2, and each variable
    in var_list3.

    Parameters:
        var_list1 (list): List of column names for the first set of variables.
        var_list2 (list): List of column names for the second set of variables.
        var_list3 (list, optional): List of column names for the third set of variables for three-way interactions.

    Returns:
        A flat list of new interaction column names.
    """
    global dtrain, dtest

    interaction_cols = []

    # Create two-way interactions between var_list1 and var_list2.
    for var1 in var_list1:
        for var2 in var_list2:
            col_name = f"{var1}_*_{var2}"
            dtrain = dtrain.withColumn(col_name, col(var1).cast("double") * col(var2).cast("double"))
            dtest = dtest.withColumn(col_name, col(var1).cast("double") * col(var2).cast("double"))
            interaction_cols.append(col_name)

    # Create two-way interactions between var_list1 and var_list3.
    if var_list3 is not None:
        for var1 in var_list1:
            for var3 in var_list3:
                col_name = f"{var1}_*_{var3}"
                dtrain = dtrain.withColumn(col_name, col(var1).cast("double") * col(var3).cast("double"))
                dtest = dtest.withColumn(col_name, col(var1).cast("double") * col(var3).cast("double"))
                interaction_cols.append(col_name)

    # Create two-way interactions between var_list2 and var_list3.
    if var_list3 is not None:
        for var2 in var_list2:
            for var3 in var_list3:
                col_name = f"{var2}_*_{var3}"
                dtrain = dtrain.withColumn(col_name, col(var2).cast("double") * col(var3).cast("double"))
                dtest = dtest.withColumn(col_name, col(var2).cast("double") * col(var3).cast("double"))
                interaction_cols.append(col_name)

    # If a third list is provided, create three-way interactions.
    if var_list3 is not None:
        for var1 in var_list1:
            for var2 in var_list2:
                for var3 in var_list3:
                    col_name = f"{var1}_*_{var2}_*_{var3}"
                    dtrain = dtrain.withColumn(col_name, col(var1).cast("double") * col(var2).cast("double") * col(var3).cast("double"))
                    dtest = dtest.withColumn(col_name, col(var1).cast("double") * col(var2).cast("double") * col(var3).cast("double"))
                    interaction_cols.append(col_name)

    return interaction_cols
# Example
# interaction_cols_brand_price = add_interaction_terms(dummy_cols_brand, ['log_price'])
# interaction_cols_brand_ad_price = add_interaction_terms(dummy_cols_brand, dummy_cols_ad, ['log_price'])
UDF for Model Comparison
def compare_reg_models(models, assemblers, names=None):
    """
    Produces a single formatted table comparing multiple regression models.

    For each predictor (the union across models, ordered by first appearance), the table shows
    the beta estimate (with significance stars) from each model (blank if not used).

    For a predictor, if a model's outcome (model.getLabelCol()) starts with "log", the cell displays
    both the beta and its exponential (separated by " / "), except when the predictor's name includes "log_".
    (The intercept row does not display exp(.))

    Additional rows for Intercept, Observations, R², and RMSE are appended.

    The header's first column is labeled "Predictor", and subsequent columns are
    "y: [outcome] ([name])" for each model.

    The table is produced in grid format (with vertical lines). A dashed line (using '-' characters)
    is inserted at the top, immediately after the header, and at the bottom.
    Additionally, immediately after the Intercept row, the border line is replaced with one using '='
    (to appear as, for example, "+==============================================+==========================+...").

    Parameters:
        models (list): List of fitted LinearRegression models.
        assemblers (list): List of corresponding VectorAssembler objects.
        names (list, optional): List of model names; defaults to "Model 1", "Model 2", etc.

    Returns:
        A formatted string containing the combined regression table.
    """
    # Default model names.
    if names is None:
        names = [f"Model {i+1}" for i in range(len(models))]

    # For each model, get the outcome and determine if that model is log-transformed.
    outcomes = [m.getLabelCol() for m in models]
    is_log_flags = [out.lower().startswith("log") for out in outcomes]

    # Build an ordered union of predictors based on first appearance.
    ordered_predictors = []
    for assembler in assemblers:
        for feat in assembler.getInputCols():
            if feat not in ordered_predictors:
                ordered_predictors.append(feat)

    # Helper for significance stars.
    def significance_stars(p):
        if p is None:
            return ""
        if p < 0.01:
            return "***"
        elif p < 0.05:
            return "**"
        elif p < 0.1:
            return "*"
        else:
            return ""

    # Build rows for each predictor.
    rows = []
    for feat in ordered_predictors:
        row = [feat]
        for m, a, is_log in zip(models, assemblers, is_log_flags):
            feats_model = a.getInputCols()
            if feat in feats_model:
                idx = feats_model.index(feat)
                beta = m.coefficients.toArray()[idx]
                p_val = m.summary.pValues[idx] if m.summary.pValues is not None else None
                stars = significance_stars(p_val)
                cell = f"{beta:.3f}{stars}"
                # Only add exp(beta) if the model is log and the predictor name does NOT include "log_".
                if is_log and ("log_" not in feat.lower()):
                    cell += f" / {np.exp(beta):,.3f}"
                row.append(cell)
            else:
                row.append("")
        rows.append(row)

    # Build intercept row (do NOT compute exp(intercept)).
    intercept_row = ["Intercept"]
    for m in models:
        std_all = np.array(m.summary.coefficientStandardErrors)
        coeffs = m.coefficients.toArray()
        if len(std_all) == len(coeffs) + 1:
            intercept_p = m.summary.pValues[0] if m.summary.pValues is not None else None
        else:
            intercept_p = None
        sig = significance_stars(intercept_p)
        cell = f"{m.intercept:.3f}{sig}"
        intercept_row.append(cell)
    rows.append(intercept_row)

    # Add Observations row.
    obs_row = ["Observations"]
    for m in models:
        obs = m.summary.numInstances
        obs_row.append(f"{int(obs):,}")
    rows.append(obs_row)

    # Add R² row.
    r2_row = ["R²"]
    for m in models:
        r2_row.append(f"{m.summary.r2:.3f}")
    rows.append(r2_row)

    # Add RMSE row.
    rmse_row = ["RMSE"]
    for m in models:
        rmse_row.append(f"{m.summary.rootMeanSquaredError:.3f}")
    rows.append(rmse_row)

    # Build header: first column "Predictor", then for each model: "y: [outcome] ([name])".
    header = ["Predictor"]
    for out, name in zip(outcomes, names):
        header.append(f"y: {out} ({name})")

    # Create the table string using grid format.
    table_str = tabulate(rows, headers=header, tablefmt="grid", colalign=("left",) + ("right",) * len(models))

    # Split into lines.
    lines = table_str.split("\n")

    # Create a dashed line spanning the full width.
    full_width = len(lines[0])
    dash_line = '-' * full_width
    # Create an equals line by replacing '-' with '='.
    eq_line = dash_line.replace('-', '=')

    # Insert a dashed line after the header row.
    # In grid format, the header and its separator are usually the first two lines.
    lines.insert(2, dash_line)

    # Insert an equals line after the Intercept row.
    for i, line in enumerate(lines):
        if line.startswith("|") and "Intercept" in line:
            if i + 1 < len(lines):
                lines[i + 1] = eq_line
            break

    # Add dashed lines at the very top and bottom.
    final_table = dash_line + "\n" + "\n".join(lines) + "\n" + dash_line

    return final_table
# Example usage:
# print(compare_reg_models([model_1, model_2, model_3],
#                          [assembler_1, assembler_2, assembler_3],
#                          ["Model 1", "Model 2", "Model 3"]))
UDF for RMSEs
def compare_rmse(test_dfs, label_col, pred_col="prediction", names=None):
    """
    Computes and compares RMSE values for a list of test DataFrames.

    For each DataFrame in test_dfs, this function calculates the RMSE between the actual outcome
    (given by label_col) and the predicted value (given by pred_col, default "prediction"). It then
    produces a formatted table where the first column header is empty and the first row's first cell is
    "RMSE", with each model's RMSE in its own column.

    Parameters:
        test_dfs (list): List of test DataFrames.
        label_col (str): The name of the outcome column.
        pred_col (str, optional): The name of the prediction column (default "prediction").
        names (list, optional): List of model names corresponding to the test DataFrames.
                                Defaults to "Model 1", "Model 2", etc.

    Returns:
        A formatted string containing a table that compares RMSE values for each test DataFrame,
        with one model per column.
    """
    # Set default model names if none provided.
    if names is None:
        names = [f"Model {i+1}" for i in range(len(test_dfs))]

    rmse_values = []
    for df in test_dfs:
        # Create a column for squared error.
        df = df.withColumn("error_sq", pow(col(label_col) - col(pred_col), 2))
        # Calculate RMSE: square root of the mean squared error.
        rmse = df.agg(sqrt(avg("error_sq")).alias("rmse")).collect()[0]["rmse"]
        rmse_values.append(rmse)

    # Build a single-row table: first cell "RMSE", then one cell per model with the RMSE value.
    row = ["RMSE"] + [f"{rmse:.3f}" for rmse in rmse_values]

    # Build header: first column header is empty, then model names.
    header = [""] + names

    table_str = tabulate([row], headers=header, tablefmt="grid", colalign=("left",) + ("right",) * len(names))
    return table_str
# Example usage:
# print(compare_rmse([dtest_1, dtest_2, dtest_3], "log_sales", names=["Model 1", "Model 2", "Model 3"]))
UDF for a Residual Plot
def residual_plot(df, label_col, model_name):
    """
    Generates a residual plot for a given test DataFrame.

    Parameters:
        df (DataFrame): Spark DataFrame containing the test set with predictions.
        label_col (str): The column name of the actual outcome variable.
        model_name (str): The model name used in the plot title.

    Returns:
        None (displays the plot)
    """
    # Convert to a Pandas DataFrame and compute residuals.
    df_pd = df.select(["prediction", label_col]).toPandas()
    df_pd["residual"] = df_pd[label_col] - df_pd["prediction"]

    # Scatter plot of residuals vs. predicted values.
    plt.scatter(df_pd["prediction"], df_pd["residual"], alpha=0.2, color="darkgray")

    # Use LOWESS smoothing for the trend line.
    smoothed = sm.nonparametric.lowess(df_pd["residual"], df_pd["prediction"])
    plt.plot(smoothed[:, 0], smoothed[:, 1], color="darkblue")

    # Add a reference line at y = 0.
    plt.axhline(y=0, color="red", linestyle="--")

    # Labels and title (model_name).
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot for " + model_name)

    # Show plot.
    plt.show()
# Example usage:
# residual_plot(dtest_1, "log_sales", "Model 1")
Coefficient Plots
terms = assembler3.getInputCols()
coefs = model3.coefficients.toArray()[:len(terms)]
stdErrs = model3.summary.coefficientStandardErrors[:len(terms)]

df_summary = pd.DataFrame({
    "term": terms,
    "estimate": coefs,
    "std_error": stdErrs
})

# Filter df_summary if needed

# Plot using the DataFrame columns
plt.errorbar(df_summary["term"], df_summary["estimate"],
             yerr=1.96 * df_summary["std_error"], fmt='o', capsize=5)
plt.xlabel("Terms")
plt.ylabel("Coefficient Estimate")
plt.title("Coefficient Estimates (Model 3)")
plt.axhline(0, color="red", linestyle="--")  # Add horizontal line at 0
plt.xticks(rotation=45)
plt.show()
Histogram
# Create a histogram
dfpd = DATAFRAME.select(["Y_VARIABLE"]).toPandas()
sns.histplot(dfpd["Y_VARIABLE"], bins=10, kde=True)
Question 1
oj = pd.read_csv("https://bcdanl.github.io/data/dominick_oj_feat.csv")
- Convert the oj Pandas DataFrame into a PySpark DataFrame object with the name df.
- What are the continuous variables? What are the categorical variables?
- For each continuous variable, provide descriptive statistics for each OJ brand (a minimal sketch follows the variable description below).
Variable description
| Variable | Description                 |
|----------|-----------------------------|
| sales    | Quantity of OJ cartons sold |
| price    | Price of OJ                 |
| brand    | Brand of OJ                 |
| ad       | Advertisement status        |
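A minimal sketch for Question 1, assuming (for illustration) that sales and price are the continuous variables and that spark is the session created above:

# Convert the pandas DataFrame to a PySpark DataFrame named df.
df = spark.createDataFrame(oj)

# Descriptive statistics for each continuous variable by brand (using pandas on the source data).
print(oj.groupby("brand")[["sales", "price"]].describe())

# Equivalent check in Spark for one variable.
df.groupBy("brand").agg(mean("sales").alias("mean_sales")).show()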
Question 2
- Add the following log variables:
  - log_sales: \(\log(\text{sales})\)
  - log_price: \(\log(\text{price})\)

df = (
    df
    .withColumn("log_sales", log(df['sales']))
    .withColumn("log_price", log(df['price']))
)
- Then, divide the df DataFrame into training and test DataFrames.
  - Use dtrain and dtest for the training and test DataFrames, respectively.
  - 70% of the observations in df are assigned to dtrain; the rest are assigned to dtest.
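A minimal sketch for the 70/30 split (the seed value here is an arbitrary choice for reproducibility):

dtrain, dtest = df.randomSplit([0.7, 0.3], seed=1234)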
Question 3
Train the following linear regression models and provide the summary of each regression result (a starter sketch for Model 1 follows the model specifications below).
- Set “dominicks” as the reference level for the \(\text{brand}\) variable.
- Set “No Ad” as the reference level for the \(\text{ad_status}\) variable.
Model 1
\[ \begin{align} \log(\text{sales}_{\text{i}}) &\,=\, \;\; b_{\text{intercept}} \,+\, b_{\,\text{mm}}\,\text{brand}_{\,\text{mm}, \text{i}} \,+\, b_{\,\text{tr}}\,\text{brand}_{\,\text{tr}, \text{i}}\\ &\quad\,+\, b_{\text{price}}\,\log(\text{price}_{\text{i}}) \,+\, e_{\text{i}},\\ \text{where}\qquad\qquad&\\ \text{brand}_{\,\text{tr}, \text{i}} &\,=\, \begin{cases} \text{1} & \text{ if an orange juice } \text{i} \text{ is } \text{Tropicana};\\\\ \text{0} & \text{otherwise}.\qquad\qquad\quad\, \end{cases}\\ \text{brand}_{\,\text{mm}, \text{i}} &\,=\, \begin{cases} \text{1} & \text{ if an orange juice } \text{i} \text{ is } \text{Minute Maid};\\\\ \text{0} & \text{otherwise}.\qquad\qquad\quad\, \end{cases} \end{align} \]
Model 2
\[ \begin{align} \log(\text{sales}_{\text{i}}) \,=\,&\;\; \quad b_{\text{intercept}} \,+\, \color{Green}{b_{\,\text{mm}}\,\text{brand}_{\,\text{mm}, \text{i}}} \,+\, \color{Blue}{b_{\,\text{tr}}\,\text{brand}_{\,\text{tr}, \text{i}}}\\ &\,+\, b_{\text{price}}\,\log(\text{price}_{\text{i}}) \\ &\, +\, b_{\text{price*mm}}\,\log(\text{price}_{\text{i}})\,\times\,\color{Green} {\text{brand}_{\,\text{mm}, \text{i}}} \\ &\,+\, b_{\text{price*tr}}\,\log(\text{price}_{\text{i}})\,\times\,\color{Blue} {\text{brand}_{\,\text{tr}, \text{i}}} \,+\, e_{\text{i}} \end{align} \]
Model 3
\[ \begin{align} \log(\text{sales}_{\text{i}}) \,=\,\quad\;\;& b_{\text{intercept}} \,+\, \color{Green}{b_{\,\text{mm}}\,\text{brand}_{\,\text{mm}, \text{i}}} \,+\, \color{Blue}{b_{\,\text{tr}}\,\text{brand}_{\,\text{tr}, \text{i}}} \\ &\,+\; b_{\,\text{ad}}\,\color{Orange}{\text{ad}_{\,\text{i}}} \qquad\qquad\qquad\qquad\quad \\ &\,+\, b_{\text{mm*ad}}\,\color{Green} {\text{brand}_{\,\text{mm}, \text{i}}}\,\times\, \color{Orange}{\text{ad}_{\,\text{i}}}\,+\, b_{\text{tr*ad}}\,\color{Blue} {\text{brand}_{\,\text{tr}, \text{i}}}\,\times\, \color{Orange}{\text{ad}_{\,\text{i}}} \\ &\,+\; b_{\text{price}}\,\log(\text{price}_{\text{i}}) \qquad\qquad\qquad\;\;\;\;\, \\ &\,+\, b_{\text{price*mm}}\,\log(\text{price}_{\text{i}})\,\times\,\color{Green} {\text{brand}_{\,\text{mm}, \text{i}}}\qquad\qquad\qquad\;\, \\ &\,+\, b_{\text{price*tr}}\,\log(\text{price}_{\text{i}})\,\times\,\color{Blue} {\text{brand}_{\,\text{tr}, \text{i}}}\qquad\qquad\qquad\;\, \\ & \,+\, b_{\text{price*ad}}\,\log(\text{price}_{\text{i}})\,\times\,\color{Orange}{\text{ad}_{\,\text{i}}}\qquad\qquad\qquad\;\;\, \\ &\,+\, b_{\text{price*mm*ad}}\,\log(\text{price}_{\text{i}}) \,\times\,\,\color{Green} {\text{brand}_{\,\text{mm}, \text{i}}}\,\times\, \color{Orange}{\text{ad}_{\,\text{i}}} \\ &\,+\, b_{\text{price*tr*ad}}\,\log(\text{price}_{\text{i}}) \,\times\,\,\color{Blue} {\text{brand}_{\,\text{tr}, \text{i}}}\,\times\, \color{Orange}{\text{ad}_{\,\text{i}}} \,+\, e_{\text{i}} \end{align} \]
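A starter sketch for Model 1, assuming the UDFs above have been run and that the sorted brand categories place "dominicks" at index 0 (check the actual category strings in the data). Models 2 and 3 follow the same pattern, adding add_interaction_terms() and add_dummy_variables('ad', ...) terms to the assembler:

# Brand dummies with "dominicks" as the reference level (index 0 of the sorted categories).
dummy_cols_brand, ref_category_brand = add_dummy_variables('brand', 0)

# Assemble the Model 1 predictors and fit the regression of log_sales on brand dummies and log_price.
assembler_1 = VectorAssembler(
    inputCols=dummy_cols_brand + ['log_price'],
    outputCol="features"
)
dtrain_1 = assembler_1.transform(dtrain)
model_1 = LinearRegression(featuresCol="features", labelCol="log_sales").fit(dtrain_1)

# Summary table from the UDF defined above.
print(regression_table(model_1, assembler_1))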
Question 4
For each model, make a prediction on the outcome variable using the test DataFrame and the regression result from Question 3.
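A minimal sketch, assuming model_1 and assembler_1 from the Question 3 sketch (repeat for Models 2 and 3):

# Assemble the features on the test set and add the "prediction" column.
dtest_1 = model_1.transform(assembler_1.transform(dtest))
dtest_1.select("log_sales", "prediction").show(5)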
Question 5
Across the three models, how sensitive is the percentage change in OJ purchases to a percentage change in the price of OJ for each brand?
How does the ad variable (advertisement status) affect such sensitivity in Model 3?
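As a reading hint derived from the Model 2 specification above (not an additional requirement): in these log-log models the coefficient on \(\log(\text{price})\) is a price elasticity, and with the brand interactions the elasticity is

\[ \frac{\partial \log(\text{sales}_{\text{i}})}{\partial \log(\text{price}_{\text{i}})} \,=\, b_{\text{price}} \,+\, b_{\text{price*mm}}\,\text{brand}_{\,\text{mm}, \text{i}} \,+\, b_{\text{price*tr}}\,\text{brand}_{\,\text{tr}, \text{i}}, \]

i.e., \(b_{\text{price}}\) for Dominick's, \(b_{\text{price}} + b_{\text{price*mm}}\) for Minute Maid, and \(b_{\text{price}} + b_{\text{price*tr}}\) for Tropicana. Model 3 adds the ad interaction terms to this expression.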
Question 6
- Compare RMSEs using a test DataFrame across the models.
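A minimal sketch using the compare_rmse UDF, assuming the test DataFrames with predictions (dtest_1, dtest_2, dtest_3) from Question 4:

print(compare_rmse([dtest_1, dtest_2, dtest_3], "log_sales",
                   names=["Model 1", "Model 2", "Model 3"]))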
Question 7
- Draw a residual plot from each of the three models.
- On average, are the predictions correct? Are there systematic errors?
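A minimal sketch using the residual_plot UDF, under the same assumptions as in Question 6:

residual_plot(dtest_1, "log_sales", "Model 1")
residual_plot(dtest_2, "log_sales", "Model 2")
residual_plot(dtest_3, "log_sales", "Model 3")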
Question 8
How would you explain different estimation results across different models?
Which model do you prefer? Why?