Regularized Logistic Regression

Car Safety Inspection

Author
Affiliation

Byeong-Hak Choe

SUNY Geneseo

Published

March 2, 2026

Modified

March 3, 2026

R Packages and Settings

library(tidyverse)
library(broom)
library(stargazer)
library(margins)
library(yardstick)
library(WVPlots)
library(pROC)
library(glmnet)
library(gamlr)

library(DT)
library(rmarkdown)
library(hrbrthemes)
library(ggthemes)

theme_set(
  theme_ipsum()
)

scale_colour_discrete <- function(...) scale_color_colorblind(...)
scale_fill_discrete <- function(...) scale_fill_colorblind(...)

Data

df <- read_csv('https://bcdanl.github.io/data/car-data.csv')

df |> 
  rmarkdown::paged_table()

Variable description

Variable Description
buying Buying price of the car (vhigh, high, med, low)
maint Maintenance cost (vhigh, high, med, low)
doors Number of doors (2, 3, 4, 5more)
persons Capacity in terms of persons to carry (2, 4, more)
lug_boot Size of luggage boot (small, med, big)
safety Estimated safety of the car (low, med, high)
rating Car acceptability (unacc, acc, good, vgood)
fail TRUE if the car is unacceptable (unacc), otherwise FALSE

Quick check

df |> 
  count(buying)
# A tibble: 4 × 2
  buying     n
  <chr>  <int>
1 high     432
2 low      432
3 med      432
4 vhigh    432
df |> 
  count(maint)
# A tibble: 4 × 2
  maint     n
  <chr> <int>
1 high    432
2 low     432
3 med     432
4 vhigh   432
df |> 
  count(doors)
# A tibble: 4 × 2
  doors     n
  <chr> <int>
1 2       432
2 3       432
3 4       432
4 5more   432
df |> 
  count(persons)
# A tibble: 3 × 2
  persons     n
  <chr>   <int>
1 2         576
2 4         576
3 more      576
df |> 
  count(lug_boot)
# A tibble: 3 × 2
  lug_boot     n
  <chr>    <int>
1 big        576
2 med        576
3 small      576
df |> 
  count(safety)
# A tibble: 3 × 2
  safety     n
  <chr>  <int>
1 high     576
2 low      576
3 med      576

Model

\[ \begin{align} &\quad\;\; \text{Prob}(\text{fail}_{i} = 1) \\ &= G\Big(\beta_{0} \\ &\qquad\quad\;\;\; \,+\, \beta_{4} \text{buying\_med}_{i} \,+\, \beta_{4} \text{buying\_high}_{i} \,+\, \beta_{4} \text{buying\_vhigh}_{i} \\ &\qquad\quad\;\;\; \,+\, \beta_{4} \text{maint\_med}_{i} \,+\, \beta_{4} \text{maint\_high}_{i} \,+\, \beta_{4} \text{maint\_vhigh}_{i} \\ &\qquad\quad\;\;\; \,+\, \beta_{7} \text{persons\_4}_{i} \,+\, \beta_{8} \text{persons\_more}_{i} \\ &\qquad\quad\;\;\; \,+\, \beta_{10} \text{lug\_boot\_med}_{i}\,+\, \beta_{10} \text{lug\_boot\_big}_{i} \\ &\qquad\quad\;\;\; \,+\, \beta_{11} \text{safety\_med}_{i}\,+\, \beta_{11} \text{safety\_high}_{i} \Big), \end{align} \]

where \(G(\,\cdot\,)\) is

\[ G(\,\cdot\,) = \frac{\exp(\,\cdot\,)}{1 + \exp(\,\cdot\,)}. \]

Pre-process & Train/Test Split (runif(n()))

This avoids predict() errors like “factor has new levels …”.

set.seed(14454)
df <- df |> 
  mutate(rnd = runif(n())) |> 
  mutate(buying = factor(buying),
         buying = fct_relevel(buying, "low"),
         maint = factor(maint),
         maint = fct_relevel(maint, "low"),
         persons = factor(persons),
         persons = fct_relevel(persons, "2"),
         lug_boot = factor(lug_boot),
         lug_boot = fct_relevel(lug_boot, "small"),
         safety = factor(safety),
         safety = fct_relevel(safety, "low"))

dtrain <- df |> 
  filter(rnd > .3)
dtest <- df |> 
  filter(rnd <= .3)

Logistic Regression

Regression Table (stargazer)

stargazer(
  model,
  type = "html",
  digits = 3,
  title = "Logistic regression (logit): Car Safety"
)
Logistic regression (logit): Car Safety
Dependent variable:
fail
buyinghigh 4.606***
(0.606)
buyingmed 0.680
(0.548)
buyingvhigh 6.151***
(0.696)
mainthigh 3.220***
(0.526)
maintmed 0.343
(0.487)
maintvhigh 5.685***
(0.666)
persons4 -27.530
(948.137)
personsmore -27.177
(948.137)
lug_bootbig -3.684***
(0.476)
lug_bootmed -2.415***
(0.415)
safetyhigh -28.026
(959.275)
safetymed -25.413
(959.275)
Constant 48.862
(1,348.767)
Observations 1,231
Log Likelihood -139.963
Akaike Inf. Crit. 305.926
Note: p<0.1; p<0.05; p<0.01
  • Regression table with summary()
summary(model)

Call:
glm(formula = fmla, family = binomial(link = "logit"), data = dtrain)

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)   48.8619  1348.7670   0.036    0.971    
buyinghigh     4.6064     0.6058   7.604 2.88e-14 ***
buyingmed      0.6797     0.5484   1.239    0.215    
buyingvhigh    6.1514     0.6965   8.832  < 2e-16 ***
mainthigh      3.2203     0.5259   6.124 9.15e-10 ***
maintmed       0.3432     0.4867   0.705    0.481    
maintvhigh     5.6849     0.6661   8.535  < 2e-16 ***
persons4     -27.5300   948.1374  -0.029    0.977    
personsmore  -27.1767   948.1374  -0.029    0.977    
lug_bootbig   -3.6837     0.4757  -7.744 9.66e-15 ***
lug_bootmed   -2.4148     0.4155  -5.812 6.17e-09 ***
safetyhigh   -28.0256   959.2748  -0.029    0.977    
safetymed    -25.4128   959.2746  -0.026    0.979    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1477.11  on 1230  degrees of freedom
Residual deviance:  279.93  on 1218  degrees of freedom
AIC: 305.93

Number of Fisher Scoring iterations: 20

Beta Estimates (tidy())

model_betas <- tidy(model, 
                    conf.int = T)  # conf.level = 0.95 (default)

model_betas_90ci <- tidy(model, 
                       conf.int = T,
                       conf.level = 0.90)

model_betas_99ci <- tidy(model, 
                       conf.int = T,
                       conf.level = 0.99)

Coefficient Plots

month_ci <- bind_rows(
  model_betas_90ci |> mutate(ci = "90%"),
  model_betas      |> mutate(ci = "95%"),
  model_betas_99ci |> mutate(ci = "99%")
) |>
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))

ggplot(
  data = month_ci |> 
    filter(!str_detect(term, "Intercept")),
  aes(
    y = term,
    x = estimate,
    xmin = conf.low,
    xmax = conf.high,
    color = ci
  )
) +
  geom_vline(xintercept = 0, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    color = "CI level",
    y = ""
  ) 

Model Fit (glance())

model |> 
  glance() |> 
  paged_table()

Marginal Effects (margins())

Average Marginal Effects

me <- margins(model)
sum_me <- summary(me)
sum_me
      factor     AME     SE        z      p   lower   upper
  buyinghigh  0.1619 0.0147  11.0250 0.0000  0.1332  0.1907
   buyingmed  0.0155 0.0123   1.2599 0.2077 -0.0086  0.0396
 buyingvhigh  0.2319 0.0144  16.1105 0.0000  0.2037  0.2601
 lug_bootbig -0.1323 0.0123 -10.7690 0.0000 -0.1564 -0.1083
 lug_bootmed -0.0915 0.0133  -6.9038 0.0000 -0.1175 -0.0655
   mainthigh  0.1103 0.0147   7.4809 0.0000  0.0814  0.1392
    maintmed  0.0096 0.0136   0.7097 0.4779 -0.0170  0.0362
  maintvhigh  0.2083 0.0148  14.1185 0.0000  0.1793  0.2372
    persons4 -0.4467 0.0110 -40.4494 0.0000 -0.4683 -0.4251
 personsmore -0.4280 0.0116 -36.8429 0.0000 -0.4508 -0.4052
  safetyhigh -0.5130 0.0110 -46.4676 0.0000 -0.5347 -0.4914
   safetymed -0.3683 0.0120 -30.5950 0.0000 -0.3919 -0.3447


Marginal Effect Plots

df_ame <- sum_me |>
  as_tibble() |>
  rename(
    term = factor,
    ame  = AME,
    se   = SE
  )

# Pair each level with the correct z (NOT a Cartesian product)
ci_levels <- tibble(
  level = c("90%", "95%", "99%"),
  zcrit = c(1.645, 1.96, 2.576)
)

df_ame_ci <- df_ame |>
  tidyr::crossing(level = ci_levels$level) |>
  left_join(ci_levels, by = "level") |>
  mutate(
    conf.low  = ame - zcrit * se,
    conf.high = ame + zcrit * se
  )

ggplot(df_ame_ci,
       aes(x = fct_reorder(term, ame), y = ame)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_pointrange(
    aes(ymin = conf.low, ymax = conf.high, color = level),
    position = position_dodge(width = 0.6)
  ) +
  coord_flip() +
  labs(
    x = NULL,
    y = "Average marginal effect (AME)",
    color = "CI level",
    title = "Average Marginal Effects (90/95/99% CIs)"
  )

Classification

Double density plot (threshold intuition)

threshold <- 0.5

model |> 
  augment(type.predict = "response") |> 
  ggplot(aes(x = .fitted, 
             fill = factor(fail))) +
  geom_density(alpha = 0.35) +
  geom_vline(xintercept = threshold, linetype = "dashed") +
  labs(
    x = "Predicted probability",
    y = "Density",
    fill = "Actual class",
    title = "Training set: predicted probabilities by actual class"
  )

Confusion Matrix

# threshold <- mean(dtest$fail)
threshold <- 0.5

df_cm <- model |>
  augment(newdata = dtest,
          type.predict = "response") |> 
  mutate(
    actual = factor(fail, 
                    levels = c(0, 1), 
                    labels = c("pass", "fail")),
    pred   = factor(if_else(.fitted > threshold, 1, 0),
                    levels = c(0, 1), 
                    labels = c("pass", "fail"))
  )

conf_mat <- table(truth = df_cm$actual,
                  prediction = df_cm$pred)
conf_mat
      prediction
truth  pass fail
  pass  157    7
  fail   17  316
# data.frame of confusion matrix
cm_counts <- df_cm |>
  group_by(actual, pred) |>
  summarize(n = n()) |>
  ungroup() |> 
  pivot_wider(names_from = pred, 
              values_from = n, 
              values_fill = 0) |> 
  rename(`truth / prediction` = actual)

cm_counts
# A tibble: 2 × 3
  `truth / prediction`  pass  fail
  <fct>                <int> <int>
1 pass                   157     7
2 fail                    17   316

Performance Metrics (from conf_mat)

base_rate   <- mean(dtest$fail)

accuracy <- (conf_mat[1,1] + conf_mat[2,2]) / sum(conf_mat) 
precision <- conf_mat[2,2] / sum(conf_mat[,2]) 
recall <- conf_mat[2,2] / sum(conf_mat[2,]) 
specificity <- conf_mat[1,1] / sum(conf_mat[1,]) 
enrichment <- precision / base_rate


df_class_metric <- 
  data.frame(
    metric = c("Base rate", 
               "Accuracy", 
               "Precision", 
               "Recall", 
               "Specificity", 
               "Enrichment"),
    value  = c(base_rate, 
               accuracy, 
               precision, 
               recall, 
               specificity, 
               enrichment)
    )

df_class_metric |> 
  mutate(value = round(value, 4)) |> 
  rmarkdown::paged_table()

Precision/Recall/Enrichment Curves over Thresholds (using WVPlots package)

plt <- PRTPlot(df_cm, 
               ".fitted", "fail", 1,
               plotvars = c("enrichment", "precision", "recall", "specificity", "false_positive_rate"),
               thresholdrange = c(.4,.8),
               title = "With threshold for car safety model")

plt + 
  geom_vline(xintercept = threshold, 
             color="maroon", 
             linetype = 2)

ROC and AUC (using WVPlots and pROC packages)

roc <- ROCPlot(df_cm,
               xvar = '.fitted',
               truthVar = 'fail',
               truthTarget = 1,
               title = 'Classifier performance')

# ROC with vertical line
roc + 
  geom_vline(xintercept = 1 - specificity, 
             color="maroon", linetype = 2)

# AUC
# pROC::roc(BINARY_OUTCOME, PREDICTED_PROBABILITY)
roc_obj <- roc(df_cm$fail, df_cm$.fitted)
auc(roc_obj)
Area under the curve: 0.9896




Regularized Logistic Regression

Matrices

# fmla <- fail ~ buying + maint + persons + lug_boot + safety

# y must be numeric 0/1 (FALSE->0, TRUE->1)
y_train <- as.integer(dtrain$fail)   # fail is TRUE
y_test  <- as.integer(dtest$fail)

# x must be a numeric matrix
  # [, -1] to remove an intercept variable
X_train <- model.matrix(~ buying + maint + persons + lug_boot + safety, data = dtrain)[, -1]
X_test  <- model.matrix(~ buying + maint + persons + lug_boot + safety, data = dtest)[, -1]
  • model.matrix() drops a reference category for each categorical variable in the formula by default.

Fit with cv.glmnet(family = "binomial")

Lasso (alpha = 1)

set.seed(1)
model_lasso <- cv.glmnet(
  X_train, y_train,
  alpha = 1,
  family = "binomial"
)

Ridge (alpha = 0)

set.seed(1)
model_ridge <- cv.glmnet(
  X_train, y_train,
  alpha = 0,
  family = "binomial"
)

Elastic Net (\(0 < \alpha < 1\))

  • Loop over alpha values, pick best by min CV deviance:
set.seed(1)

alpha_grid <- seq(0.1, 0.9, by = 0.1)   # 0 < alpha < 1 for elastic net

# for-loop: initial setup
cv_list <- vector("list", length(alpha_grid))
cvm_min <- rep(NA, length(alpha_grid))

for (i in seq_along(alpha_grid)) {
  a <- alpha_grid[i]

  cv_fit <- cv.glmnet(
    X_train, y_train,
    alpha = a,
    family = "binomial"
  )

  cv_list[[i]] <- cv_fit
  cvm_min[i] <- min(cv_fit$cvm)
}

best_i <- which.min(cvm_min)

best_alpha <- alpha_grid[best_i]
model_enet <- cv_list[[best_i]]

best_alpha
[1] 0.3
model_enet$lambda.min
[1] 5.463574e-05
model_enet$lambda.1se
[1] 0.001072522

\(\lambda\) & CV errors (deviances) with tidy()

CVErrors_lasso <- tidy(model_lasso)
CVErrors_ridge <- tidy(model_ridge)
CVErrors_enet <- tidy(model_enet)
CVErrors_lasso |> 
  paged_table()
CVErrors_ridge |> 
  paged_table()
CVErrors_enet |> 
  paged_table()
  • lambda: The penalty value \(\,\lambda\,\) used for that row.

    • Lasso: Larger \(\,\lambda\,\) means stronger shrinkage, so more coefficients become \(0\).
  • estimate: The mean cross-validated error/loss at that \(\,\lambda\,\) (the average out-of-sample performance across folds).

    • For family = "binomial", this is typically mean CV deviance unless you changed type.measure.
  • std.error: The standard error of the cross-validated estimate at that \(\,\lambda\,\) across folds.

  • conf.low: The lower bound of the confidence interval for the CV estimate at that \(\,\lambda\,\) (based on conf.level).

  • conf.high: The upper bound of the confidence interval for the CV estimate at that \(\,\lambda\,\) (based on conf.level).

  • nzero: The number of nonzero coefficients at that \(\,\lambda\,\) (how many predictors the lasso keeps; coefficients shrunk to \(0\) are excluded).

\(\lambda_{\text{min}}\) & \(\lambda_{\text{1se}}\) with glance()

lambdas_lasso <- model_lasso |> 
  glance() 

lambdas_ridge <- model_ridge |> 
  glance() 

lambdas_enet <- model_enet |> 
  glance() 
lambdas_lasso |> 
  paged_table()
lambdas_ridge |> 
  paged_table()
lambdas_enet |> 
  paged_table()

Beta Coefficients with coef()

# Logistic
beta <- coef(model)

# Lasso
beta_lasso_1se <- coef(model_lasso, 
                       s = "lambda.1se")  # default
beta_lasso_min <- coef(model_lasso, 
                       s = "lambda.min")

# Ridge
beta_ridge_1se <- coef(model_ridge)  # default: "lambda.1se"
beta_ridge_min <- coef(model_ridge, 
                       s = "lambda.min")

# Elastic net
beta_enet_1se <- coef(model_enet)  # default: "lambda.1se"
beta_enet_min <- coef(model_enet, 
                      s = "lambda.min")

betas <- data.frame(
  term = rownames(beta_lasso_1se),
  beta = as.numeric(beta),
  beta_lasso_1se = as.numeric(beta_lasso_1se),
  beta_lasso_min = as.numeric(beta_lasso_min),
  beta_ridge_1se = as.numeric(beta_ridge_1se),
  beta_ridge_min = as.numeric(beta_ridge_min),
  beta_enet_1se = as.numeric(beta_enet_1se),
  beta_enet_min = as.numeric(beta_enet_min)
)

# Show only nonzero in either solution
betas_lasso_nz <- betas |>
  filter(beta_lasso_1se != 0 | beta_lasso_min != 0) |>
  arrange(abs(beta_lasso_1se), abs(beta_lasso_min))
betas |> 
  paged_table(options = 
                list(
                  rows.print = nrow(betas)
                  )
              )

Coefficient Plots

One Model

betas |> 
  ggplot(aes(x = beta_lasso_1se, y = term)) +
  geom_vline(xintercept = 0,
             color = 'maroon') +
  geom_pointrange(aes(xmin = 0, xmax = beta_lasso_1se)) +
  labs(y = NULL,
       title = "Lasso Logistic Regression (lambda.1se)")

All Models

betas |> 
  pivot_longer(cols = beta:beta_enet_min,
               names_to = "model",
               values_to = "value") |> 
  ggplot(aes(x = value, y = term, color = model)) +
  geom_vline(xintercept = 0,
             color = 'maroon') +
  geom_pointrange(aes(xmin = 0, 
                      xmax = value),
                  position = position_dodge(width = 0.6)) +
  labs(y = NULL) +
  theme(legend.position = "top")

CV curve

# Run all at once

par(mar = c(5.1, 4.1, 6.1, 2.1))  # title margin
plot(model_lasso, main = "Lasso")
abline(v = log(model_lasso$lambda.min), lty = 2)
abline(v = log(model_lasso$lambda.1se), lty = 3)
legend("bottomright",
       legend = c("lambda.min", "lambda.1se"),
       lty = c(2, 3),
       bty = "n")

  • From this plot alone, it’s hard to pinpoint the exact minimum cross-validation (CV) error (binomial deviance) at lambda.min.
    To see the numeric values directly, check the CV results table (e.g., lasso_CVErrors <- tidy(model_lasso)).

  • We could recreate the CV curve with ggplot2, but the built-in base R plot() is convenient because it also reports extra diagnostics—especially the number of nonzero coefficients at each value of ().

# Run all at once

par(mar = c(5.1, 4.1, 6.1, 2.1))  # title margin
plot(model_ridge, main = "Ridge")
abline(v = log(model_ridge$lambda.min), lty = 2)
abline(v = log(model_ridge$lambda.1se), lty = 3)
legend("bottomright",
       legend = c("lambda.min", "lambda.1se"),
       lty = c(2, 3),
       bty = "n")

# Run all at once

par(mar = c(5.1, 4.1, 6.1, 2.1))  # title margin
plot(model_enet, main = "Elastic Net")
abline(v = log(model_enet$lambda.min), lty = 2)
abline(v = log(model_enet$lambda.1se), lty = 3)
legend("bottomright",
       legend = c("lambda.min", "lambda.1se"),
       lty = c(2, 3),
       bty = "n")

LASSO Coefficient path

# Run all at once

par(mar = c(5.1, 4.1, 6.1, 2.1))  # title margin
plot(model_lasso$glmnet.fit, xvar = "lambda",
     main = "Lasso")
abline(v = log(model_lasso$lambda.min), lty = 2)
abline(v = log(model_lasso$lambda.1se), lty = 3)
legend("topright",
       legend = c("lambda.min", "lambda.1se"),
       lty = c(2, 3),
       bty = "n")

With Predictor Names

# Run all at once

# Coefficient path (keep the default left-to-right order of log(lambda))
fit <- model_lasso$glmnet.fit
B   <- as.matrix(fit$beta)    # p x L (no intercept)
lam <- fit$lambda

x_all <- log(lam)
x_min <- min(x_all)           # smallest log(lambda)
x_max <- max(x_all)

# add extra space to the LEFT (values smaller than the smallest log(lambda))
pad <- 0.6

plot(fit, xvar = "lambda", label = FALSE, main = "Lasso",
     xlim = c(x_min - pad, x_max))  
abline(v = log(model_lasso$lambda.min), lty = 2)
abline(v = log(model_lasso$lambda.1se), lty = 3)
legend("topright",
       legend = c("lambda.min", "lambda.1se"),
       lty = c(2, 3),
       bty = "n")

# label at the smallest-lambda end (where curves end)
y_end <- B[, ncol(B)]

# label only a few to avoid clutter
K <- 15  # number of variable names to display
idx <- order(abs(y_end), decreasing = TRUE)[1:min(K, length(y_end))]

# place labels in the extra left margin (x < min log(lambda))
text(
  x      = rep(x_min - 0.75, length(idx)),
  y      = y_end[idx],
  labels = rownames(B)[idx],
  pos    = 4,       # text to the right of the label position
  cex    = 0.7,
  xpd    = NA
)

Classification

Prediction with predict() & Confusion Matrix

# threshold <- mean(dtest$fail)
# threshold <- 0.5

df_cm_all <- dtest |>
  mutate(
    .fitted = predict(model, newdata = dtest,
                      type = "response"),
    .fitted_lasso_1se = predict(model_lasso, newx = X_test, 
                                s = "lambda.1se", type = "response"),
    .fitted_lasso_min = predict(model_lasso, newx = X_test, 
                                s = "lambda.min", type = "response"),
    .fitted_ridge_1se = predict(model_ridge, newx = X_test, 
                                s = "lambda.1se", type = "response"),
    .fitted_ridge_min = predict(model_ridge, newx = X_test, 
                                s = "lambda.min", type = "response"),
    .fitted_enet_1se = predict(model_enet, newx = X_test, 
                               s = "lambda.1se", type = "response"),
    .fitted_enet_min = predict(model_enet, newx = X_test, 
                               s = "lambda.min", type = "response")
  ) |> 
  mutate(
    actual = factor(fail, 
                    levels = c(0, 1), 
                    labels = c("pass", "fail")),
    pred   = factor(if_else(.fitted > threshold, 1, 0), 
                    levels = c(0, 1), 
                    labels = c("pass", "fail")),
    pred_lasso   = factor(if_else(.fitted_lasso_1se > threshold, 1, 0), 
                          levels = c(0, 1), 
                          labels = c("pass", "fail")),
    pred_ridge   = factor(if_else(.fitted_ridge_1se > threshold, 1, 0), 
                          levels = c(0, 1), 
                          labels = c("pass", "fail")),
    pred_enet   = factor(if_else(.fitted_enet_1se > threshold, 1, 0),   
                         levels = c(0, 1), 
                         labels = c("pass", "fail"))
  )
# Logistic
conf_mat <- table(truth = df_cm_all$actual,
                  prediction_logitstic = df_cm_all$pred)
conf_mat
      prediction_logitstic
truth  pass fail
  pass  157    7
  fail   17  316
# Lasso Logistic
conf_mat_lasso <- table(truth = df_cm_all$actual,
                        prediction_lasso = df_cm_all$pred_lasso)
conf_mat_lasso
      prediction_lasso
truth  pass fail
  pass  156    8
  fail   15  318
# Ridge Logistic
conf_mat_ridge <- table(truth = df_cm_all$actual,
                        prediction_ridge = df_cm_all$pred_ridge)
conf_mat_ridge
      prediction_ridge
truth  pass fail
  pass  144   20
  fail   10  323
# Elastic Net Logistic
conf_mat_enet <- table(truth = df_cm_all$actual,
                        prediction_enet = df_cm_all$pred_enet)
conf_mat_enet
      prediction_enet
truth  pass fail
  pass  157    7
  fail   17  316

Performance Metrics (from conf_mat)

base_rate   <- mean(dtest$fail)

# Logistic
accuracy <- (conf_mat[1,1] + conf_mat[2,2]) / sum(conf_mat) 
precision <- conf_mat[2,2] / sum(conf_mat[,2]) 
recall <- conf_mat[2,2] / sum(conf_mat[2,]) 
specificity <- conf_mat[1,1] / sum(conf_mat[1,]) 
enrichment <- precision / base_rate

# Lasso Logistic
accuracy_lasso <- (conf_mat_lasso[1,1] + conf_mat_lasso[2,2]) / sum(conf_mat_lasso) 
precision_lasso <- conf_mat_lasso[2,2] / sum(conf_mat_lasso[,2]) 
recall_lasso <- conf_mat_lasso[2,2] / sum(conf_mat_lasso[2,]) 
specificity_lasso <- conf_mat_lasso[1,1] / sum(conf_mat_lasso[1,]) 
enrichment_lasso <- precision_lasso / base_rate

# Ridge Logistic
accuracy_ridge <- (conf_mat_ridge[1,1] + conf_mat_ridge[2,2]) / sum(conf_mat_ridge) 
precision_ridge <- conf_mat_ridge[2,2] / sum(conf_mat_ridge[,2]) 
recall_ridge <- conf_mat_ridge[2,2] / sum(conf_mat_ridge[2,]) 
specificity_ridge <- conf_mat_ridge[1,1] / sum(conf_mat_ridge[1,]) 
enrichment_ridge <- precision_ridge / base_rate

# Elastic Net Logistic
accuracy_enet <- (conf_mat_enet[1,1] + conf_mat_enet[2,2]) / sum(conf_mat_enet) 
precision_enet <- conf_mat_enet[2,2] / sum(conf_mat_enet[,2]) 
recall_enet <- conf_mat_enet[2,2] / sum(conf_mat_enet[2,]) 
specificity_enet <- conf_mat_enet[1,1] / sum(conf_mat_enet[1,]) 
enrichment_enet <- precision_enet / base_rate


df_class_metric <- 
  data.frame(
    metric = c("Base rate", 
               "Accuracy", 
               "Precision", 
               "Recall", 
               "Specificity", 
               "Enrichment"),
    value_logistic  = c(base_rate, 
                        accuracy, 
                        precision, 
                        recall, 
                        specificity, 
                        enrichment),
    value_lasso  = c(base_rate, 
                     accuracy_lasso, 
                     precision_lasso, 
                     recall_lasso, 
                     specificity_lasso, 
                     enrichment_lasso),
    value_ridge  = c(base_rate, 
                     accuracy_ridge, 
                     precision_ridge, 
                     recall_ridge, 
                     specificity_ridge, 
                     enrichment_ridge),
    value_enet  = c(base_rate, 
                    accuracy_enet, 
                    precision_enet, 
                    recall_enet, 
                    specificity_enet, 
                    enrichment_enet)
    )

df_class_metric |> 
  mutate(value_logistic = round(value_logistic, 4),
         value_lasso = round(value_lasso, 4),
         value_ridge = round(value_ridge, 4),
         value_enet = round(value_enet, 4)
         ) |> 
  rmarkdown::paged_table()

Precision/Recall/Enrichment Curves over Thresholds (using WVPlots package)

plt <- PRTPlot(df_cm_all, 
               ".fitted", "fail", 1,
               plotvars = c("enrichment", "precision", "recall", "specificity", "false_positive_rate"),
               thresholdrange = c(.4,.8),
               title = "With threshold for Logistic Regression Model")

plt + 
  geom_vline(xintercept = threshold, 
             color="maroon", 
             linetype = 2)

plt_lasso <- PRTPlot(df_cm_all, 
               ".fitted_lasso_1se", "fail", 1,
               plotvars = c("enrichment", "precision", "recall", "specificity", "false_positive_rate"),
               thresholdrange = c(.4,.8),
               title = "With threshold for Lasso Logistic Regression Model")

plt_lasso + 
  geom_vline(xintercept = threshold, 
             color="maroon", 
             linetype = 2)

plt_ridge <- PRTPlot(df_cm_all, 
               ".fitted_ridge_1se", "fail", 1,
               plotvars = c("enrichment", "precision", "recall", "specificity", "false_positive_rate"),
               thresholdrange = c(.4,.8),
               title = "With threshold for Ridge Logistic Regression Model")

plt_ridge + 
  geom_vline(xintercept = threshold, 
             color="maroon", 
             linetype = 2)

plt_enet <- PRTPlot(df_cm_all, 
               ".fitted_enet_1se", "fail", 1,
               plotvars = c("enrichment", "precision", "recall", "specificity", "false_positive_rate"),
               thresholdrange = c(.4,.8),
               title = "With threshold for Elastic Net Logistic Regression Model")

plt_enet + 
  geom_vline(xintercept = threshold, 
             color="maroon", 
             linetype = 2)

ROC (using WVPlots package)

roc <- ROCPlot(df_cm_all,
               xvar = '.fitted',
               truthVar = 'fail',
               truthTarget = 1,
               title = 'Logistic Regression Model')

# ROC with vertical line
roc + 
  geom_vline(xintercept = 1 - specificity, 
             color="maroon", linetype = 2)

roc_lasso <- ROCPlot(df_cm_all,
                     xvar = '.fitted_lasso_1se',
                     truthVar = 'fail',
                     truthTarget = 1,
                     title = 'Lasso Logistic Regression Model')

# ROC with vertical line
roc_lasso + 
  geom_vline(xintercept = 1 - specificity_lasso, 
             color="maroon", linetype = 2)

roc_ridge <- ROCPlot(df_cm_all,
                     xvar = '.fitted_ridge_1se',
                     truthVar = 'fail',
                     truthTarget = 1,
                     title = 'Ridge Logistic Regression Model')

# ROC with vertical line
roc_ridge + 
  geom_vline(xintercept = 1 - specificity_ridge, 
             color="maroon", linetype = 2)

roc_enet <- ROCPlot(df_cm_all,
                     xvar = '.fitted_enet_1se',
                     truthVar = 'fail',
                     truthTarget = 1,
                     title = 'Elastic Net Logistic Regression Model')

# ROC with vertical line
roc_enet + 
  geom_vline(xintercept = 1 - specificity_enet, 
             color="maroon", linetype = 2)

AUC (using pROC package)

# ROC objects
roc_logit <- roc(df_cm_all$fail, df_cm_all$.fitted)
roc_lasso <- roc(df_cm_all$fail, df_cm_all$.fitted_lasso_1se)
roc_ridge <- roc(df_cm_all$fail, df_cm_all$.fitted_ridge_1se)
roc_enet  <- roc(df_cm_all$fail, df_cm_all$.fitted_enet_1se)


auc_df <- data.frame(
  model = c("Logistic", "Lasso (lambda.1se)", "Ridge (lambda.1se)", "Elastic Net (lambda.1se)"),
  auc   = c(
    as.numeric(auc(roc_logit)),
    as.numeric(auc(roc_lasso)),
    as.numeric(auc(roc_ridge)),
    as.numeric(auc(roc_enet))
  )
)

auc_df |> 
  arrange(-auc) |> 
  paged_table()
Back to top