Linear Regression II

Orange Juice Data

Author

Byeong-Hak Choe

Published

February 8, 2026

Modified

February 14, 2026

R packages

library(tidyverse)
library(broom)
library(stargazer)
library(skimr)
library(DT)

Read_statusing a CSV File

df <- read_csv("https://bcdanl.github.io/data/dominick_oj_feat.csv")

Variable description

Variable	Description
`sales`	Quantity of OJ cartons sold
`price`	Price of OJ
`brand`	Brand of OJ
`ad_status`	ad_statusvertisement status

Continuous variables

sales
price

Categorical variables

brand
ad_status

Exploratory Data Analysis

skim(df)

Data summary
Name	df
Number of rows	28947
Number of columns	4
_______________________
Column type frequency:
character	2
numeric	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
brand	0	1	9	11	0	3	0
ad_status	0	1	2	5	0	2	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
sales	0	1	17312.21	27477.66	64.00	4864.00	8384.00	17408.00	716416.00	▇▁▁▁▁
price	0	1	2.28	0.65	0.52	1.79	2.17	2.73	3.87	▁▆▇▅▂

Distribution

df |> 
  ggplot(aes(x = price, color = brand)) +
  geom_freqpoly()

Categorical Variables

df |> 
  count(brand, ad_status) |> 
  ggplot() +
  geom_col(aes(x = ad_status, y = n,
               fill = brand),
           position = "fill")

Data Preparation

df <- df |> 
  mutate(ad_status = factor(ad_status),
         ad_status = fct_relevel(ad_status, "No Ad")
         )

Training and Test Data

set.seed(1)

df <- df |> 
  mutate(rnd = runif(n()))

dtrain <- df |> 
  filter(rnd > 0.3)

dtest <- df |> 
  filter(rnd <= 0.3)

nrow(dtrain) / nrow(df)

[1] 0.6971707

Models

Model 1

\[ \begin{align} \log(\texttt{sales}_{\texttt{i}}) \,=\, &\quad\;\; b_{\texttt{intercept}} \\ &\,+\,b_{\,\texttt{mm}}\,\texttt{brand}_{\,\texttt{mm}, \texttt{i}} \,+\, b_{\,\texttt{tr}}\,\texttt{brand}_{\,\texttt{tr}, \texttt{i}}\\ &\,+\, b_{\texttt{price}}\,\log(\texttt{price}_{\texttt{i}}) \,+\, e_{\texttt{i}} \end{align} \]

Model 2

\[ \begin{align} \log(\texttt{sales}_{\texttt{i}}) \,=\,&\;\; \quad b_{\texttt{intercept}} \,+\, \color{Green}{b_{\,\texttt{mm}}\,\texttt{brand}_{\,\texttt{mm}, \texttt{i}}} \,+\, \color{Blue}{b_{\,\texttt{tr}}\,\texttt{brand}_{\,\texttt{tr}, \texttt{i}}}\\ &\,+\, b_{\texttt{price}}\,\log(\texttt{price}_{\texttt{i}}) \\ &\, +\, b_{\texttt{price*mm}}\,\log(\texttt{price}_{\texttt{i}})\,\times\,\color{Green} {\texttt{brand}_{\,\texttt{mm}, \texttt{i}}} \\ &\,+\, b_{\texttt{price*tr}}\,\log(\texttt{price}_{\texttt{i}})\,\times\,\color{Blue} {\texttt{brand}_{\,\texttt{tr}, \texttt{i}}} \,+\, e_{\texttt{i}} \end{align} \]

Model 3

\[ \begin{align} \log(\texttt{sales}_{\texttt{i}}) \,=\,\quad\;\;& b_{\texttt{intercept}} \,+\, \color{Green}{b_{\,\texttt{mm}}\,\texttt{brand}_{\,\texttt{mm}, \texttt{i}}} \,+\, \color{Blue}{b_{\,\texttt{tr}}\,\texttt{brand}_{\,\texttt{tr}, \texttt{i}}} \\ &\,+\; b_{\,\texttt{ad}}\,\color{Orange}{\texttt{ad}_{\,\texttt{i}}} \qquad\qquad\qquad\qquad\quad \\ &\,+\, b_{\texttt{mm*ad}}\,\color{Green} {\texttt{brand}_{\,\texttt{mm}, \texttt{i}}}\,\times\, \color{Orange}{\texttt{ad}_{\,\texttt{i}}}\,+\, b_{\texttt{tr*ad}}\,\color{Blue} {\texttt{brand}_{\,\texttt{tr}, \texttt{i}}}\,\times\, \color{Orange}{\texttt{ad}_{\,\texttt{i}}} \\ &\,+\; b_{\texttt{price}}\,\log(\texttt{price}_{\texttt{i}}) \qquad\qquad\qquad\;\;\;\;\, \\ &\,+\, b_{\texttt{price*mm}}\,\log(\texttt{price}_{\texttt{i}})\,\times\,\color{Green} {\texttt{brand}_{\,\texttt{mm}, \texttt{i}}}\qquad\qquad\qquad\;\, \\ &\,+\, b_{\texttt{price*tr}}\,\log(\texttt{price}_{\texttt{i}})\,\times\,\color{Blue} {\texttt{brand}_{\,\texttt{tr}, \texttt{i}}}\qquad\qquad\qquad\;\, \\ & \,+\, b_{\texttt{price*ad}}\,\log(\texttt{price}_{\texttt{i}})\,\times\,\color{Orange}{\texttt{ad}_{\,\texttt{i}}}\qquad\qquad\qquad\;\;\, \\ &\,+\, b_{\texttt{price*mm*ad}}\,\log(\texttt{price}_{\texttt{i}}) \,\times\,\,\color{Green} {\texttt{brand}_{\,\texttt{mm}, \texttt{i}}}\,\times\, \color{Orange}{\texttt{ad}_{\,\texttt{i}}} \\ &\,+\, b_{\texttt{price*tr*ad}}\,\log(\texttt{price}_{\texttt{i}}) \,\times\,\,\color{Blue} {\texttt{brand}_{\,\texttt{tr}, \texttt{i}}}\,\times\, \color{Orange}{\texttt{ad}_{\,\texttt{i}}} \,+\, e_{\texttt{i}} \end{align} \]

Training the Models

m1 <- lm(log(sales) ~ brand + log(price),
         data = dtrain)
m2 <- lm(log(sales) ~ brand * log(price),
         data = dtrain)
m3 <- lm(log(sales) ~ brand * ad_status * log(price),
         data = dtrain)

Regression Table with `stargazer()`

stargazer(m1, m2, m3, 
          type = "html")


	Dependent variable:

	log(sales)
	(1)	(2)	(3)

brandminute_maid	0.868^***	0.908^***	0.027
	(0.015)	(0.050)	(0.056)

brandtropicana	1.527^***	0.918^***	0.696^***
	(0.019)	(0.055)	(0.060)

ad_statusAd			1.091^***
			(0.045)

log(price)	-3.149^***	-3.396^***	-2.795^***
	(0.027)	(0.043)	(0.046)

brandminute_maid:ad_statusAd			1.200^***
			(0.097)

brandtropicana:ad_statusAd			0.753^***
			(0.118)

brandminute_maid:log(price)		0.032	0.806^***
		(0.068)	(0.074)

brandtropicana:log(price)		0.710^***	0.756^***
		(0.064)	(0.068)

ad_statusAd:log(price)			-0.511^***
			(0.087)

brandminute_maid:ad_statusAd:log(price)			-1.099^***
			(0.144)

brandtropicana:ad_statusAd:log(price)			-0.924^***
			(0.149)

Constant	10.837^***	10.967^***	10.418^***
	(0.017)	(0.025)	(0.028)


Observations	20,181	20,181	20,181
R²	0.399	0.403	0.537
Adjusted R²	0.399	0.403	0.537
Residual Std. Error	0.795 (df = 20177)	0.792 (df = 20175)	0.697 (df = 20169)
F Statistic	4,463.077^*** (df = 3; 20177)	2,726.510^*** (df = 5; 20175)	2,128.289^*** (df = 11; 20169)

Note:	p<0.1; p<0.05; p<0.01

Beta Estimates with `tidy()`

m1_betas <- tidy(m1, conf.int = T)
m2_betas <- tidy(m2, conf.int = T)
m3_betas <- tidy(m3, conf.int = T)

Prediction with `predict()`

dtest <- dtest |> 
  mutate(
    pred_1 = predict(m1, newdata = dtest),
    pred_2 = predict(m2, newdata = dtest),
    pred_3 = predict(m3, newdata = dtest)
  )

Prediction with `augment()`

m1_pred_train <- augment(m1)
m1_pred_test <- augment(m1, newdata = dtest)
m2_pred_train <- augment(m2)
m2_pred_test <- augment(m2, newdata = dtest)
m3_pred_train <- augment(m3)
m3_pred_test <- augment(m3, newdata = dtest)

Various Model Statistics with `glance()`

m1_r2 <- glance(m1)
m2_r2 <- glance(m2)
m3_r2 <- glance(m3)

Residual Plots

m1_pred_test |> 
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.25) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    x = "Fitted values",
    y = "Residuals",
    title = "Residual Plot for Model 1"
  )

m2_pred_test |> 
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.25) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    x = "Fitted values",
    y = "Residuals",
    title = "Residual Plot for Model 2"
  )

m3_pred_test |> 
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.25) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    x = "Fitted values",
    y = "Residuals",
    title = "Residual Plot for Model 3"
  )

RMSEs

RMSEs <- dtest |> 
  mutate(
    resid_1 = log(sales) - pred_1,
    resid_2 = log(sales) - pred_2,
    resid_3 = log(sales) - pred_3
  ) |> 
  summarize(
    rmse_1 = sqrt( mean( resid_1^2 ) ),
    rmse_2 = sqrt( mean( resid_2^2 ) ),
    rmse_3 = sqrt( mean( resid_3^2 ) )
  )

RMSEs

# A tibble: 1 × 3
  rmse_1 rmse_2 rmse_3
   <dbl>  <dbl>  <dbl>
1  0.791  0.790  0.690