Classwork 2

Omitted Variable Bias; Bad Controls

Author

Byeong-Hak Choe

Published

January 26, 2026

Modified

January 26, 2026

Omitted Variable Bias

Code
library(tidyverse)
library(broom)
library(stargazer)

oj <- read_csv('https://bcdanl.github.io/data/dominick_oj_ad.csv')

reg_short <- lm(log(sales) ~ log(price),
                data = oj)

reg_long <- lm(log(sales) ~ log(price) + brand,
                data = oj)

stargazer(reg_short, reg_long,
          type = 'html')
Dependent variable:
log(sales)
(1) (2)
log(price) -1.601*** -3.139***
(0.018) (0.023)
brandminute_maid 0.870***
(0.013)
brandtropicana 1.530***
(0.016)
Constant 10.423*** 10.829***
(0.015) (0.015)
Observations 28,947 28,947
R2 0.208 0.394
Adjusted R2 0.208 0.394
Residual Std. Error 0.907 (df = 28945) 0.794 (df = 28943)
F Statistic 7,608.212*** (df = 1; 28945) 6,275.074*** (df = 3; 28943)
Note: p<0.1; p<0.05; p<0.01
Code
reg_1st <- lm(log(price) ~ brand,
              data = oj)

aug_m2 <- augment(reg_1st)    # broom::augment()

reg_2nd <- lm(log(sales) ~ aug_m2$.resid,
              data = oj)

stargazer(reg_long, reg_2nd,
          type = 'html')
Dependent variable:
log(sales)
(1) (2)
log(price) -3.139***
(0.023)
brandminute_maid 0.870***
(0.013)
brandtropicana 1.530***
(0.016)
.resid -3.139***
(0.023)
Constant 10.829*** 9.168***
(0.015) (0.005)
Observations 28,947 28,947
R2 0.394 0.392
Adjusted R2 0.394 0.392
Residual Std. Error 0.794 (df = 28943) 0.795 (df = 28945)
F Statistic 6,275.074*** (df = 3; 28943) 18,683.570*** (df = 1; 28945)
Note: p<0.1; p<0.05; p<0.01

Bad Controls

Education, Occupation, and Wages

Code
set.seed(1)

tb <- tibble( 
  highschool_degree = ifelse(runif(10000)>=0.5, 1,0), # highschool_degree indicator variable
  ability = rnorm(10000), # e.g., talent, usually unobserved.
  discrimination = highschool_degree, # highschool_degree discrimination variable
  occupation_index = 1 + 2*ability + 0*highschool_degree - 2*discrimination + rnorm(10000), # true data generating process for occupation_index variable
  wage = 1 - 1*discrimination + 1*occupation_index + 2*ability + rnorm(10000) # true data generating process for wage variable
)

lm_1 <- lm(wage ~ highschool_degree, tb)
lm_2 <- lm(wage ~ highschool_degree + occupation_index, tb)
lm_3 <- lm(wage ~ highschool_degree + occupation_index + ability, tb)

stargazer::stargazer(lm_1,lm_2,lm_3, 
                     column.labels = c("Biased Unconditional", 
                                       "Biased",
                                       "Unbiased Conditional"),
                     type = 'html')
Dependent variable:
wage
Biased Unconditional Biased Unbiased Conditional
(1) (2) (3)
highschool_degree -3.021*** 0.557*** -1.039***
(0.084) (0.030) (0.029)
occupation_index 1.789*** 0.981***
(0.006) (0.010)
ability 2.028***
(0.023)
Constant 1.968*** 0.201*** 1.021***
(0.059) (0.020) (0.018)
Observations 10,000 10,000 10,000
R2 0.115 0.907 0.948
Adjusted R2 0.115 0.907 0.948
Residual Std. Error 4.192 (df = 9998) 1.357 (df = 9997) 1.013 (df = 9996)
F Statistic 1,297.822*** (df = 1; 9998) 48,922.550*** (df = 2; 9997) 61,140.750*** (df = 3; 9996)
Note: p<0.1; p<0.05; p<0.01

Fertilizer, Plant Height, and Crop Yield

Code
set.seed(2026)

# Simulate 10,000 plots
tb3 <- tibble(
  fertilizer    = rbinom(10000, 1, 0.5),       # 0 = no fertilizer, 1 = fertilizer applied
  soil_quality  = rnorm(10000),                # unobserved โ€œplot fertilityโ€
  # Height (cm) is boosted by fertilizer *and* by soil quality
  height        = 50 + 8 * fertilizer + 5 * soil_quality + rnorm(10000),
  # Yield (bushels) depends on fertilizer, height, and soil quality
  yield         = 200 
  + 10 * fertilizer      # direct effect of fertilizer
  + 3  * height          # return to plant height
  + 6  * soil_quality    # underlying soil fertility
  + rnorm(10000)
)

# 1) Omitting height: picks up both direct and indirect effects
lm1 <- lm(yield ~ fertilizer,         data = tb3)
# 2) Controlling for height: "blocks" the mediation channel
lm2 <- lm(yield ~ fertilizer + height, data = tb3)
# 3) Also control soil_quality to recover the true direct effect
lm3 <- lm(yield ~ fertilizer + height + soil_quality, data = tb3)

stargazer(
  lm1, lm2, lm3,
  column.labels   = c("Omit Height (Biased)", 
                      "Control Height (More Biased)",
                      "Also Soil (Unbiased)"),
  dep.var.labels  = "yield",
  keep            = c("fertilizer", "height", "soil_quality"),
  covariate.labels= c("Fertilizer", "Plant Height", "Soil Quality"),
  type            = "html"
)
Dependent variable:
yield
Omit Height (Biased) Control Height (More Biased) Also Soil (Unbiased)
(1) (2) (3)
Fertilizer 34.657*** 0.789*** 10.025***
(0.426) (0.040) (0.081)
Plant Height 4.156*** 2.999***
(0.003) (0.010)
Soil Quality 6.002***
(0.050)
Observations 10,000 10,000 10,000
R2 0.398 0.997 0.999
Adjusted R2 0.398 0.997 0.999
Residual Std. Error 21.323 (df = 9998) 1.551 (df = 9997) 0.996 (df = 9996)
F Statistic 6,604.218*** (df = 1; 9998) 1,563,217.000*** (df = 2; 9997) 2,532,263.000*** (df = 3; 9996)
Note: p<0.1; p<0.05; p<0.01
Back to top