Classwork 2
Omitted Variable Bias; Bad Controls
Omitted Variable Bias
Code
library(tidyverse)
library(broom)
library(stargazer)
oj <- read_csv('https://bcdanl.github.io/data/dominick_oj_ad.csv')
reg_short <- lm(log(sales) ~ log(price),
data = oj)
reg_long <- lm(log(sales) ~ log(price) + brand,
data = oj)
stargazer(reg_short, reg_long,
type = 'html')| Dependent variable: | ||
| log(sales) | ||
| (1) | (2) | |
| log(price) | -1.601*** | -3.139*** |
| (0.018) | (0.023) | |
| brandminute_maid | 0.870*** | |
| (0.013) | ||
| brandtropicana | 1.530*** | |
| (0.016) | ||
| Constant | 10.423*** | 10.829*** |
| (0.015) | (0.015) | |
| Observations | 28,947 | 28,947 |
| R2 | 0.208 | 0.394 |
| Adjusted R2 | 0.208 | 0.394 |
| Residual Std. Error | 0.907 (df = 28945) | 0.794 (df = 28943) |
| F Statistic | 7,608.212*** (df = 1; 28945) | 6,275.074*** (df = 3; 28943) |
| Note: | p<0.1; p<0.05; p<0.01 | |
Code
reg_1st <- lm(log(price) ~ brand,
data = oj)
aug_m2 <- augment(reg_1st) # broom::augment()
reg_2nd <- lm(log(sales) ~ aug_m2$.resid,
data = oj)
stargazer(reg_long, reg_2nd,
type = 'html')| Dependent variable: | ||
| log(sales) | ||
| (1) | (2) | |
| log(price) | -3.139*** | |
| (0.023) | ||
| brandminute_maid | 0.870*** | |
| (0.013) | ||
| brandtropicana | 1.530*** | |
| (0.016) | ||
| .resid | -3.139*** | |
| (0.023) | ||
| Constant | 10.829*** | 9.168*** |
| (0.015) | (0.005) | |
| Observations | 28,947 | 28,947 |
| R2 | 0.394 | 0.392 |
| Adjusted R2 | 0.394 | 0.392 |
| Residual Std. Error | 0.794 (df = 28943) | 0.795 (df = 28945) |
| F Statistic | 6,275.074*** (df = 3; 28943) | 18,683.570*** (df = 1; 28945) |
| Note: | p<0.1; p<0.05; p<0.01 | |
Bad Controls
Education, Occupation, and Wages
Code
set.seed(1)
tb <- tibble(
highschool_degree = ifelse(runif(10000)>=0.5, 1,0), # highschool_degree indicator variable
ability = rnorm(10000), # e.g., talent, usually unobserved.
discrimination = highschool_degree, # highschool_degree discrimination variable
occupation_index = 1 + 2*ability + 0*highschool_degree - 2*discrimination + rnorm(10000), # true data generating process for occupation_index variable
wage = 1 - 1*discrimination + 1*occupation_index + 2*ability + rnorm(10000) # true data generating process for wage variable
)
lm_1 <- lm(wage ~ highschool_degree, tb)
lm_2 <- lm(wage ~ highschool_degree + occupation_index, tb)
lm_3 <- lm(wage ~ highschool_degree + occupation_index + ability, tb)
stargazer::stargazer(lm_1,lm_2,lm_3,
column.labels = c("Biased Unconditional",
"Biased",
"Unbiased Conditional"),
type = 'html')| Dependent variable: | |||
| wage | |||
| Biased Unconditional | Biased | Unbiased Conditional | |
| (1) | (2) | (3) | |
| highschool_degree | -3.021*** | 0.557*** | -1.039*** |
| (0.084) | (0.030) | (0.029) | |
| occupation_index | 1.789*** | 0.981*** | |
| (0.006) | (0.010) | ||
| ability | 2.028*** | ||
| (0.023) | |||
| Constant | 1.968*** | 0.201*** | 1.021*** |
| (0.059) | (0.020) | (0.018) | |
| Observations | 10,000 | 10,000 | 10,000 |
| R2 | 0.115 | 0.907 | 0.948 |
| Adjusted R2 | 0.115 | 0.907 | 0.948 |
| Residual Std. Error | 4.192 (df = 9998) | 1.357 (df = 9997) | 1.013 (df = 9996) |
| F Statistic | 1,297.822*** (df = 1; 9998) | 48,922.550*** (df = 2; 9997) | 61,140.750*** (df = 3; 9996) |
| Note: | p<0.1; p<0.05; p<0.01 | ||
Fertilizer, Plant Height, and Crop Yield
Code
set.seed(2026)
# Simulate 10,000 plots
tb3 <- tibble(
fertilizer = rbinom(10000, 1, 0.5), # 0 = no fertilizer, 1 = fertilizer applied
soil_quality = rnorm(10000), # unobserved โplot fertilityโ
# Height (cm) is boosted by fertilizer *and* by soil quality
height = 50 + 8 * fertilizer + 5 * soil_quality + rnorm(10000),
# Yield (bushels) depends on fertilizer, height, and soil quality
yield = 200
+ 10 * fertilizer # direct effect of fertilizer
+ 3 * height # return to plant height
+ 6 * soil_quality # underlying soil fertility
+ rnorm(10000)
)
# 1) Omitting height: picks up both direct and indirect effects
lm1 <- lm(yield ~ fertilizer, data = tb3)
# 2) Controlling for height: "blocks" the mediation channel
lm2 <- lm(yield ~ fertilizer + height, data = tb3)
# 3) Also control soil_quality to recover the true direct effect
lm3 <- lm(yield ~ fertilizer + height + soil_quality, data = tb3)
stargazer(
lm1, lm2, lm3,
column.labels = c("Omit Height (Biased)",
"Control Height (More Biased)",
"Also Soil (Unbiased)"),
dep.var.labels = "yield",
keep = c("fertilizer", "height", "soil_quality"),
covariate.labels= c("Fertilizer", "Plant Height", "Soil Quality"),
type = "html"
)| Dependent variable: | |||
| yield | |||
| Omit Height (Biased) | Control Height (More Biased) | Also Soil (Unbiased) | |
| (1) | (2) | (3) | |
| Fertilizer | 34.657*** | 0.789*** | 10.025*** |
| (0.426) | (0.040) | (0.081) | |
| Plant Height | 4.156*** | 2.999*** | |
| (0.003) | (0.010) | ||
| Soil Quality | 6.002*** | ||
| (0.050) | |||
| Observations | 10,000 | 10,000 | 10,000 |
| R2 | 0.398 | 0.997 | 0.999 |
| Adjusted R2 | 0.398 | 0.997 | 0.999 |
| Residual Std. Error | 21.323 (df = 9998) | 1.551 (df = 9997) | 0.996 (df = 9996) |
| F Statistic | 6,604.218*** (df = 1; 9998) | 1,563,217.000*** (df = 2; 9997) | 2,532,263.000*** (df = 3; 9996) |
| Note: | p<0.1; p<0.05; p<0.01 | ||