Lasso Logistic Regression

NHL Player Evaluation

Author: Byeong-Hak Choe

Published: April 7, 2025

Modified: April 19, 2025

from google.colab import data_table
data_table.enable_dataframe_formatter()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
# 2002-2003 season
nhl = pd.read_csv('https://bcdanl.github.io/data/NHL_data_2002_2003.csv')
nhl
Warning: Total number of columns (964) exceeds max_columns (20). Falling back to pandas display.
homegoal period differential playoffs S6v5 S6v4 S6v3 S5v4 S5v3 S4v3 ... CHRIS_PRONGER KURTIS_FOSTER MILAN_BARTOVIC JOE_DIPENTA KAMIL_PIROS KENT_MCDONELL BILL_MUCKALT MATT_STAJAN TOMI_PETTINEN PETER_SEJNA
0 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 1 -1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 2 -2 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 2 -1 0 0 0 0 -1 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 1 3 -2 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5300 1 1 2 1 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
5301 0 2 3 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5302 1 2 2 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5303 1 3 3 1 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
5304 0 3 4 1 0 0 0 -1 0 0 ... 0 0 0 0 0 0 0 0 0 0

5305 rows × 964 columns

nhl.columns
Index(['homegoal', 'period', 'differential', 'playoffs', 'S6v5', 'S6v4',
       'S6v3', 'S5v4', 'S5v3', 'S4v3',
       ...
       'CHRIS_PRONGER', 'KURTIS_FOSTER', 'MILAN_BARTOVIC', 'JOE_DIPENTA',
       'KAMIL_PIROS', 'KENT_MCDONELL', 'BILL_MUCKALT', 'MATT_STAJAN',
       'TOMI_PETTINEN', 'PETER_SEJNA'],
      dtype='object', length=964)
# Split into train and test (70% train, 30% test)

# Using a fixed random state for reproducibility (seed = 24351)
nhl_train, nhl_test = train_test_split(nhl, test_size=0.3, random_state=24351)

# Define predictors: all columns except the outcome "homegoal"
predictors = [col for col in nhl.columns if col not in ['homegoal']]

X_train = nhl_train[predictors]
X_test = nhl_test[predictors]

# Outcome variable
y_train = nhl_train['homegoal']
y_test = nhl_test['homegoal']
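
Before fitting, it can help to confirm the split sizes and the share of home goals in each partition; a minimal sanity-check sketch:

# Sketch: check partition sizes and the home-goal share in train vs. test
print(X_train.shape, X_test.shape)
print(y_train.mean(), y_test.mean())
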
# Revised LogisticRegressionCV with fewer candidate Cs, fewer folds, and looser tolerance:
lasso_cv = LogisticRegressionCV(
    Cs=10,         # Fewer candidate values
    cv=3,          # Fewer CV folds
    penalty='l1',
    solver='saga',
    max_iter=1000,
    tol=1e-3,      # Looser tolerance for faster convergence
    scoring='neg_log_loss'
)
lasso_cv.fit(X_train.values, y_train.values)

print("Best alpha:", 1 / lasso_cv.C_[0])

intercept = float(lasso_cv.intercept_[0])
coef_lasso = pd.DataFrame({
    'predictor': list(X_train.columns),
    'coefficient': list(lasso_cv.coef_[0]),
    'exp_coefficient': np.exp(lasso_cv.coef_[0]),
})
Best alpha: 2.782559402207126
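
The cross-validated log loss over the grid of candidate penalties can be read off the fitted object; a minimal sketch, assuming the outcome is coded 0/1 so lasso_cv.scores_ is keyed by the positive class 1:

# Sketch: mean CV log loss across folds for each candidate penalty
cv_logloss = -lasso_cv.scores_[1].mean(axis=0)   # scores are neg_log_loss
alphas = 1 / lasso_cv.Cs_                        # alpha = 1 / C
plt.plot(np.log10(alphas), cv_logloss, marker='o')
plt.xlabel('log10(alpha)')
plt.ylabel('Mean CV log loss')
plt.show()
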
np.exp(lasso_cv.intercept_[0])
np.float64(1.052809822818754)
coef_lasso = coef_lasso.query('coefficient != 0')

Home-Team Effect

intercept
0.05146261173037637
np.exp(intercept)
np.float64(1.052809822818754)
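
Since exp(intercept) ≈ 1.05, the baseline odds that a goal is a home goal (with all situation, team, and player indicators at zero) are about 5% better than even. A minimal sketch converting the intercept into the implied probability:

# Sketch: implied baseline probability that a goal is scored by the home team
home_prob = np.exp(intercept) / (1 + np.exp(intercept))
print(home_prob)   # ≈ 0.513
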

Player Evaluation


# Situation and team-season controls: drop these so coef_lasso keeps only player effects.
free_cols = ['period', 'differential', 'playoffs',
             'S6v5','S6v4','S5v4','S5v3','S4v3','SNG',
             'ATL.20022003',
             'ANA.20022003','BOS.20022003','BUF.20022003','CAR.20022003',
             'CBJ.20022003','CGY.20022003','CHI.20022003','COL.20022003',
             'DAL.20022003','DET.20022003','EDM.20022003','FLA.20022003',
             'LAK.20022003','MIN.20022003','MTL.20022003','NJD.20022003',
             'NSH.20022003','NYI.20022003','NYR.20022003','OTT.20022003',
             'PHI.20022003','PHX.20022003','PIT.20022003','SJS.20022003',
             'STL.20022003','TBL.20022003','TOR.20022003','VAN.20022003',
             'WPG.20022003','WSH.20022003']

coef_lasso = coef_lasso[ ~coef_lasso['predictor'].isin(free_cols) ]
coef_lasso.sort_values('coefficient', ascending = False)
predictor coefficient exp_coefficient
612 PETER_FORSBERG 0.968515 2.634030
449 BRAD_RICHARDS 0.937330 2.553155
729 CHRIS_NEIL 0.703128 2.020062
653 NICKLAS_LIDSTROM 0.556389 1.744362
149 PETER_SCHAEFER 0.544168 1.723174
... ... ... ...
774 MARTIN_LAPOINTE -0.603984 0.546630
599 JAMIE_MCLENNAN -0.625651 0.534913
421 BEN_CLYMER -0.747552 0.473524
484 KARLIS_SKRASTINS -0.754297 0.470341
668 PETER_WORRELL -0.815675 0.442341

224 rows × 3 columns
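
Each exp_coefficient is the multiplicative change in the odds that the goal is a home goal for a one-unit change in that player's indicator, holding everything else fixed. A minimal lookup sketch using the filtered coefficient table:

# Sketch: read off one player's lasso effect
print(coef_lasso.query("predictor == 'PETER_FORSBERG'"))   # exp_coefficient ≈ 2.63
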

PM vs. Lasso-based Expected PM

import numpy as np
import pandas as pd

# Instead of deriving player_cols from nhl, use the predictors remaining in coef_lasso:
# players with nonzero lasso coefficients (the situation/team controls in free_cols were removed above).
player_cols = list(coef_lasso['predictor'])

# Subset the nhl DataFrame to include only these player columns.
nhl_players = nhl[player_cols]

# -------------------------------------
# 1. Traditional Plus-Minus (PM)
# -------------------------------------
# Each row in nhl_players should contain:
#   +1 if the goal was for the player's team,
#   -1 if it was against,
#    0 if the player was not on the ice.
# The traditional PM for each player is simply the column sum.
traditional_pm = nhl_players.sum()

# -------------------------------------
# 2. Total Number of Goals (ng)
# -------------------------------------
# For each player, the total goals they're on the ice for is the sum of the absolute values.
ng = nhl_players.abs().sum()

# -------------------------------------
# 3. Expected Plus-Minus (ppm)
# -------------------------------------
# Create beta as a Series from coef_lasso:
beta = pd.Series(coef_lasso['coefficient'].values, index=coef_lasso['predictor'])

# Convert beta to a probability using the logistic function:
#   p = exp(beta) / (1 + exp(beta))
p = np.exp(beta) / (1 + np.exp(beta))

# Compute expected plus-minus:
#   ppm = ng * (p) - ng * (1-p) = ng * (2p - 1)
expected_pm = ng * (p) - ng * (1-p)

# -------------------------------------
# 4. Combine and Display the Results
# -------------------------------------
effect_df = pd.DataFrame({
    'pm': traditional_pm,
    'ng': ng,
    'beta': beta,
    'exp_beta': np.exp(beta),
    'p': p,
    'ppm': expected_pm
})

# Display the combined results for all retained players.
effect_df
pm ng beta exp_beta p ppm
MIKE_COMRIE -19 121 -0.108854 0.896862 0.472813 -6.579147
DERIAN_HATCHER 8 178 0.207467 1.230557 0.551681 18.398612
MANNY_MALHOTRA 1 31 -0.277163 0.757931 0.431149 -4.268742
TODD_MARCHANT 0 134 0.220145 1.246258 0.554815 14.690460
RICHARD_MATVICHUK 7 91 -0.185825 0.830419 0.453677 -8.430777
... ... ... ... ... ... ...
SAMUEL_PAHLSSON 21 61 0.031758 1.032268 0.507939 0.968551
KYLE_MCLAREN -26 26 -0.011054 0.989007 0.497237 -0.143694
BRIAN_POTHIER 1 15 0.474937 1.607914 0.616552 3.496551
RYAN_BAYDA -5 39 0.283464 1.327721 0.570395 5.490826
BURKE_HENRY 4 30 -0.261737 0.769713 0.434937 -3.903798

224 rows × 6 columns
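
As a quick check of the ppm formula, the reported values reproduce: for PETER_FORSBERG, ng = 170 and p ≈ 0.7248, so ppm = 170 * (2 * 0.7248 - 1) ≈ 76.4, matching the table. A minimal sketch:

# Sketch: recompute one player's expected plus-minus from its ng and p
print(effect_df.loc['PETER_FORSBERG', 'ng'] *
      (2 * effect_df.loc['PETER_FORSBERG', 'p'] - 1))   # ≈ 76.44
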

top10_by_pm = effect_df.nlargest(10, 'pm', keep = 'all')
top10_by_pm
pm ng beta exp_beta p ppm
CHRIS_OSGOOD 94 226 0.138557 1.148616 0.534584 15.631994
SERGEI_GONCHAR 78 196 0.181795 1.199368 0.545324 17.766998
MARTY_TURCO 74 252 0.435821 1.546232 0.607263 54.060453
JOSE_THEODORE 58 268 -0.012309 0.987766 0.496923 -1.649451
PATRICK_ROY 54 318 0.399092 1.490471 0.598470 62.626629
ROBERT_LANG 49 125 0.406744 1.501919 0.600307 25.076706
JEAN-SEBASTIEN_GIGUERE 47 305 0.341136 1.406544 0.584466 51.524524
SEAN_BURKE 40 74 0.161521 1.175298 0.540293 5.963336
DAN_CLOUTIER 39 337 0.054591 1.056109 0.513644 9.196343
ED_BELFOUR 38 316 0.054281 1.055782 0.513567 8.574336
top10_by_beta = effect_df.nlargest(10, 'beta', keep = 'all')
top10_by_beta
pm ng beta exp_beta p ppm
PETER_FORSBERG 16 170 0.968515 2.634030 0.724823 76.439952
BRAD_RICHARDS -64 66 0.937330 2.553155 0.718560 28.849915
CHRIS_NEIL -2 32 0.703128 2.020062 0.668881 10.808380
NICKLAS_LIDSTROM 12 222 0.556389 1.744362 0.635617 60.213773
PETER_SCHAEFER 5 83 0.544168 1.723174 0.632781 22.041725
LADISLAV_NAGY 25 103 0.527763 1.695136 0.628961 26.566011
MARTIN_HAVLAT 22 116 0.481076 1.617814 0.618002 27.376455
BRIAN_POTHIER 1 15 0.474937 1.607914 0.616552 3.496551
DAVID_VYBORNY -8 98 0.441356 1.554814 0.608582 21.282089
MARTY_TURCO 74 252 0.435821 1.546232 0.607263 54.060453
top10_by_ppm = effect_df.nlargest(10, 'ppm', keep = 'all')
top10_by_ppm
pm ng beta exp_beta p ppm
PETER_FORSBERG 16 170 0.968515 2.634030 0.724823 76.439952
PATRICK_ROY 54 318 0.399092 1.490471 0.598470 62.626629
NICKLAS_LIDSTROM 12 222 0.556389 1.744362 0.635617 60.213773
MARTY_TURCO 74 252 0.435821 1.546232 0.607263 54.060453
JEAN-SEBASTIEN_GIGUERE 47 305 0.341136 1.406544 0.584466 51.524524
ROMAN_CECHMANEK 22 244 0.355871 1.427423 0.588040 42.963741
MIKE_DUNHAM 14 212 0.371373 1.449723 0.591791 38.919209
SANDIS_OZOLINSH 20 184 0.407174 1.502566 0.600410 36.950905
ROMAN_TUREK 20 242 0.288680 1.334665 0.571673 34.689751
ED_JOVANOVSKI -2 172 0.369647 1.447223 0.591374 31.432520
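
Traditional plus-minus and the lasso-based expected plus-minus rank players quite differently: the raw PM leaderboard is mostly goaltenders, while the model puts Peter Forsberg on top. A minimal sketch of the comparison plot, using the matplotlib/seaborn imports above:

# Sketch: traditional PM vs. lasso-based expected PM for the retained players
sns.scatterplot(data=effect_df, x='pm', y='ppm')
plt.axhline(0, color='grey', linewidth=0.5)
plt.axvline(0, color='grey', linewidth=0.5)
plt.xlabel('Traditional plus-minus (pm)')
plt.ylabel('Expected plus-minus (ppm)')
plt.show()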