Classwork 3

Panel Data Models

Author

Byeong-Hak Choe

Published

January 28, 2026

Modified

February 5, 2026

library(tidyverse)
library(plm)
library(lmtest)
library(sandwich)
library(stargazer)

co2 <- read_csv("https://bcdanl.github.io/data/owid-co2-data.csv")
codebook <- read_csv("https://bcdanl.github.io/data/owid-co2-codebook.csv")

Build a clean country-year panel

df <- co2 |>
  # Keep only country observations (drop regions / aggregates)
  # OWID country entries typically have a 3-letter ISO code
  filter(!is.na(iso_code), nchar(iso_code) == 3) |>
  # Keep useful sample years (adjust as you want)
  filter(year >= 1990) |>
  # Select a simple EKC-style set of variables
  mutate(
    gdp_per_capita = gdp / population, 
    .after = gdp
  ) |> 
  select(
    iso_code, country, year,
    co2_per_capita,
    gdp_per_capita,
    population,
    energy_per_capita,         # optional control
    primary_energy_consumption # optional control (can be NA a lot)
  ) |>
  # Drop missing core variables
  filter(!is.na(co2_per_capita), !is.na(gdp_per_capita)) |>
  # Create logs and squared log GDP per capita
  mutate(
    log_co2pc = log(co2_per_capita),
    log_gdppc = log(gdp_per_capita),
    log_gdppc2 = log_gdppc^2,
    log_pop = log(population),
    log_energy_pc = ifelse(!is.na(energy_per_capita) & energy_per_capita > 0,
                          log(energy_per_capita), 
                          NA)
  ) |>
  # Keep rows with finite log outcomes/predictors
  filter(is.finite(log_co2pc), 
         is.finite(log_gdppc), 
         is.finite(log_gdppc2), 
         is.finite(log_pop))

# Convert to pdata.frame (panel format for plm)
df_panel <- pdata.frame(df, index = c("iso_code", "year"))

Specify models

# Baseline (no optional controls)
f_base <- log_co2pc ~ log_gdppc + log_gdppc2

# Extended model (adds energy per capita; will drop rows with missing log_energy_pc)
f_ext  <- log_co2pc ~ log_gdppc + log_gdppc2 + log_energy_pc

# --- Models (BASE) ---
m1_pool <- plm(f_base, data = df_panel, model = "pooling")                 # pooled OLS
m2_fe   <- plm(f_base, data = df_panel, model = "within", effect = "individual")  # country FE
m3_twfe <- plm(f_base, data = df_panel, model = "within", effect = "twoways")     # country FE + year FE
m4_re   <- plm(f_base, data = df_panel, model = "random", effect = "individual") # random effects

# --- Models (EXTENDED) ---
# (Optional) include energy per capita as a control
m1_pool_ext <- plm(f_ext, data = df_panel, model = "pooling")
m2_fe_ext   <- plm(f_ext, data = df_panel, model = "within", effect = "individual")
m3_twfe_ext <- plm(f_ext, data = df_panel, model = "within", effect = "twoways")
m4_re_ext   <- plm(f_ext, data = df_panel, model = "random", effect = "individual")

Robust / clustered standard errors

# Function to compute clustered SE (country-clustered)
se_cluster_country_safe <- function(model) {
  V <- vcovHC(model, type = "HC1", cluster = "group")
  se <- sqrt(diag(V))
  se[is.na(se)] <- 0              # βœ… replace NA SEs so stargazer won't crash
  return(se)
}

# Clustered SEs for BASE models
se_m1 <- se_cluster_country_safe(m1_pool)
se_m2 <- se_cluster_country_safe(m2_fe)
se_m3 <- se_cluster_country_safe(m3_twfe)
se_m4 <- se_cluster_country_safe(m4_re)

# Clustered SEs for EXTENDED models
se_m1e <- se_cluster_country_safe(m1_pool_ext)
se_m2e <- se_cluster_country_safe(m2_fe_ext)
se_m3e <- se_cluster_country_safe(m3_twfe_ext)
se_m4e <- se_cluster_country_safe(m4_re_ext)

Panel diagnostics

# FE vs pooled OLS (is panel structure important?)
pFtest(m2_fe, m1_pool)

    F test for individual effects

data:  f_base
F = 193.79, df1 = 163, df2 = 5243, p-value < 2.2e-16
alternative hypothesis: significant effects
# RE vs pooled OLS (Breusch-Pagan LM test)
plmtest(m1_pool, type = "bp")

    Lagrange Multiplier Test - (Breusch-Pagan)

data:  f_base
chisq = 42925, df = 1, p-value < 2.2e-16
alternative hypothesis: significant effects
# FE vs RE (Hausman test)
phtest(m2_fe, m4_re)

    Hausman Test

data:  f_base
chisq = 9819.7, df = 2, p-value < 2.2e-16
alternative hypothesis: one model is inconsistent

Results

# --- Table A: BASE models ---
# Table (omit intercept so pooled/re match FE models cleanly)
stargazer(
  m1_pool, m2_fe, m3_twfe, m4_re,
  type = "html",
  se = list(se_m1, se_m2, se_m3, se_m4),
  title = "Panel Models (OWID CO2): Pooled OLS vs FE vs TWFE vs RE",
  column.labels = c("Pooled OLS", "Country FE", "Two-way FE", "Random Effects"),
  dep.var.labels = "log(CO2 per capita)",
  omit = "Intercept",                      # βœ… avoids intercept mismatch across models
  omit.stat = c("f", "ser"),
  digits = 3,
  notes = c("Standard errors clustered by country (iso_code).",
            "Country FE controls for time-invariant differences across countries.",
            "Two-way FE adds year fixed effects to absorb global shocks.")
)
Panel Models (OWID CO2): Pooled OLS vs FE vs TWFE vs RE
Dependent variable:
log(CO2 per capita)
Pooled OLS Country FE Two-way FE Random Effects
(1) (2) (3) (4)
log_gdppc 4.009*** 3.440*** 3.453*** 3.539***
(0.470) (0.444) (0.445) (0.437)
log_gdppc2 -0.157*** -0.168*** -0.167*** -0.170***
(0.027) (0.023) (0.023) (0.023)
Constant -22.396*** -17.176***
(2.057) (2.055)
Observations 5,409 5,409 5,409 5,409
R2 0.837 0.357 0.281 0.386
Adjusted R2 0.837 0.337 0.254 0.386
Note: p<0.1; p<0.05; p<0.01
Standard errors clustered by country (iso
Country FE controls for time-invariant differences across countries.
Two-way FE adds year fixed effects to absorb global shocks.
# --- Table B: EXTENDED models (adds energy control) ---
stargazer(
  m1_pool_ext, m2_fe_ext, m3_twfe_ext, m4_re_ext,
  type = "html",
  se = list(se_m1, se_m2, se_m3, se_m4),
  title = "Panel Models (OWID CO2): With Energy Control",
  column.labels = c("Pooled OLS", "Country FE", "Two-way FE", "Random Effects"),
  dep.var.labels = "log(CO2 per capita)",
  omit = "Intercept",                      # βœ… avoids intercept mismatch across models
  omit.stat = c("f", "ser"),
  digits = 3,
  notes = c("Standard errors clustered by country (iso_code).",
            "Adding energy per capita often reduces the GDP effect because energy use is a major driver of CO2.")
)
Panel Models (OWID CO2): With Energy Control
Dependent variable:
log(CO2 per capita)
Pooled OLS Country FE Two-way FE Random Effects
(1) (2) (3) (4)
log_gdppc 1.378*** 1.903*** 1.919*** 1.832***
(0.470) (0.444) (0.445) (0.437)
log_gdppc2 -0.072*** -0.100*** -0.099*** -0.095***
(0.027) (0.023) (0.023) (0.023)
log_energy_pc 0.869 0.621 0.621 0.672
Constant -13.726*** -14.103***
(2.057) (2.055)
Observations 5,358 5,358 5,358 5,358
R2 0.948 0.628 0.582 0.695
Adjusted R2 0.948 0.616 0.566 0.695
Note: p<0.1; p<0.05; p<0.01
Standard errors clustered by country (iso
Adding energy per capita often reduces the GDP effect because energy use is a major driver of CO2.

lm()

# ----------------------------
# Estimate with lm() (LSDV FE and TWFE)
#    IMPORTANT: use the same sample as plm EXTENDED model
#    plm drops rows with NA in log_energy_pc; match that here.
# ----------------------------
df_lm <- as.data.frame(df_panel) |>
  filter(!is.na(log_energy_pc))

m_pool_lm_ext <- lm(
  log_co2pc ~ log_gdppc + log_gdppc2 + log_energy_pc,
  data = df_lm
)

m_fe_lm_ext <- lm(
  log_co2pc ~ log_gdppc + log_gdppc2 + log_energy_pc + factor(iso_code),
  data = df_lm
)

m_twfe_lm_ext <- lm(
  log_co2pc ~ log_gdppc + log_gdppc2 + log_energy_pc + factor(iso_code) + factor(year),
  data = df_lm
)

# ----------------------------
# 6) Clustered SEs for lm() (country clustered)
# ----------------------------
se_pool_lm <- sqrt(diag(vcovCL(m_pool_lm_ext, cluster = ~ iso_code, type = "HC1")))
se_fe_lm   <- sqrt(diag(vcovCL(m_fe_lm_ext,   cluster = ~ iso_code, type = "HC1")))
se_twfe_lm <- sqrt(diag(vcovCL(m_twfe_lm_ext, cluster = ~ iso_code, type = "HC1")))

# ----------------------------
# 7) Stargazer tables
#    lm() table
# ----------------------------


# lm(): Pooled vs (dummy) FE vs (dummy) TWFE
stargazer(
  m_pool_lm_ext, m_fe_lm_ext, m_twfe_lm_ext,
  type = "html",
  se = list(se_pool_lm, se_fe_lm, se_twfe_lm),
  title = "LM (LSDV) (OWID CO2): With Energy Control",
  column.labels = c("Pooled OLS", "Country FE (dummies)", "Two-way FE (dummies)"),
  dep.var.labels = "log(CO2 per capita)",
  keep = c("log_gdppc", "log_gdppc2", "log_energy_pc", "(Intercept)"),
  digits = 3,
  omit.stat = c("f", "ser", "rsq", "adj.rsq"),
  notes = c(
    "Standard errors are HC1 and clustered by country (iso_code).",
    "LM FE/TWFE report a Constant due to dummy coding; it depends on the omitted baseline country/year."
  )
)
LM (LSDV) (OWID CO2): With Energy Control
Dependent variable:
log(CO2 per capita)
Pooled OLS Country FE (dummies) Two-way FE (dummies)
(1) (2) (3)
log_gdppc 1.378*** 1.903*** 1.919***
(0.329) (0.361) (0.365)
log_gdppc2 -0.072*** -0.100*** -0.099***
(0.017) (0.019) (0.019)
log_energy_pc 0.869*** 0.621*** 0.621***
(0.046) (0.057) (0.058)
Observations 5,358 5,358 5,358
Note: p<0.1; p<0.05; p<0.01
Standard errors are HC1 and clustered by country (iso
LM FE/TWFE report a Constant due to dummy coding; it depends on the omitted baseline country/year.

Empirical EKC

# EKC shape + 95% band (country-clustered), plotted over x in [0, 20], y in [0, 13]

# 0) Cluster-robust vcov
V <- vcovHC(m3_twfe_ext, type = "HC1", cluster = "group")

# 1) Coefs (GDP terms only)
b <- coef(m3_twfe_ext)
b1 <- unname(b["log_gdppc"])
b2 <- unname(b["log_gdppc2"])

# 2) Grid over requested range
ekc_shape_df <- tibble(
  x = seq(0, 20, length.out = 400)
) |>
  mutate(ekc_shape = b1 * x + b2 * x^2)

# 3) Delta-method SE for m(x) = b1*x + b2*x^2
V_sub <- V[c("log_gdppc", "log_gdppc2"), c("log_gdppc", "log_gdppc2")]

ekc_shape_df <- ekc_shape_df |>
  rowwise() |>
  mutate(
    se = sqrt(c(x, x^2) %*% V_sub %*% c(x, x^2)),
    lo = ekc_shape - 1.96 * se,
    hi = ekc_shape + 1.96 * se
  ) |>
  ungroup()

# 4) Turning point
x_star <- -b1 / (2 * b2)
gdppc_star <- exp(x_star)

# 5) Plot
ggplot(ekc_shape_df, aes(x = x, y = ekc_shape)) +
  geom_ribbon(aes(ymin = lo, ymax = hi), alpha = 0.2) +
  geom_line(linewidth = 1.1) +
  geom_vline(xintercept = x_star, linetype = "dashed") +
  coord_cartesian(xlim = c(0, 20), ylim = c(0, 13)) +
  labs(
    title = "Empirical EKC Shape from Two-way FE (Extended Model)",
    subtitle = paste0(
      "Curve: Ξ²Μ‚1 x + Ξ²Μ‚2 xΒ² (x = log GDP per capita). ",
      "95% CI via delta method (country-clustered). ",
      "\nTurning point at x* = ", round(x_star, 3),
      " (GDPpc* β‰ˆ ", scales::comma(round(gdppc_star, 0)), ")."
    ),
    x = "log(GDP per capita)",
    y = expression(widehat(log(CO[2]))~~"(per capita)")
  )

Back to top