from google.colab import data_table
data_table.enable_dataframe_formatter()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
Lasso Logistic Regression
NHL Player Evaluation
# 2002-2003 season
nhl = pd.read_csv('https://bcdanl.github.io/data/NHL_data_2002_2003.csv')
nhl
Warning: Total number of columns (964) exceeds max_columns (20). Falling back to pandas display.
| | homegoal | period | differential | playoffs | S6v5 | S6v4 | S6v3 | S5v4 | S5v3 | S4v3 | ... | CHRIS_PRONGER | KURTIS_FOSTER | MILAN_BARTOVIC | JOE_DIPENTA | KAMIL_PIROS | KENT_MCDONELL | BILL_MUCKALT | MATT_STAJAN | TOMI_PETTINEN | PETER_SEJNA |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 1 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 2 | -2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 2 | -1 | 0 | 0 | 0 | 0 | -1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 3 | -2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5300 | 1 | 1 | 2 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5301 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5302 | 1 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5303 | 1 | 3 | 3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5304 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | -1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5305 rows × 964 columns
nhl.columns
Index(['homegoal', 'period', 'differential', 'playoffs', 'S6v5', 'S6v4',
'S6v3', 'S5v4', 'S5v3', 'S4v3',
...
'CHRIS_PRONGER', 'KURTIS_FOSTER', 'MILAN_BARTOVIC', 'JOE_DIPENTA',
'KAMIL_PIROS', 'KENT_MCDONELL', 'BILL_MUCKALT', 'MATT_STAJAN',
'TOMI_PETTINEN', 'PETER_SEJNA'],
dtype='object', length=964)
# Split into train and test (70% train, 30% test)
# Using a fixed random state for reproducibility (seed = 24351)
nhl_train, nhl_test = train_test_split(nhl, test_size=0.3, random_state=24351)

# Define predictors: all columns except the outcome "homegoal"
predictors = [col for col in nhl.columns if col not in ['homegoal']]
X_train = nhl_train[predictors]
X_test = nhl_test[predictors]
# Outcome variable
y_train = nhl_train['homegoal']
y_test = nhl_test['homegoal']
# Revised LogisticRegressionCV with fewer candidate Cs, fewer folds, and looser tolerance:
lasso_cv = LogisticRegressionCV(
    Cs=10,            # Fewer candidate values
    cv=3,             # Fewer CV folds
    penalty='l1',
    solver='saga',
    max_iter=1000,
    tol=1e-3,         # Looser tolerance for faster convergence
    scoring='neg_log_loss'
)
lasso_cv.fit(X_train.values, y_train.values)
print("Best alpha:", 1 / lasso_cv.C_[0])
intercept = float(lasso_cv.intercept_[0])

coef_lasso = pd.DataFrame({
    'predictor': list(X_train.columns),
    'coefficient': list(lasso_cv.coef_[0]),
    'exp_coefficient': np.exp(list(lasso_cv.coef_[0])),
})
Best alpha: 2.782559402207126
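In scikit-learn, C is the inverse of the regularization strength, so the reported alpha is simply 1 / lasso_cv.C_[0]. To see how the cross-validated score varied across the candidate penalties, a minimal sketch (assuming the fitted lasso_cv object above; attribute shapes follow LogisticRegressionCV's documented Cs_ and scores_ attributes) is:
# Sketch: inspect the CV search over the candidate C values.
# lasso_cv.Cs_ holds the candidate C values; lasso_cv.scores_ maps the positive
# class to an array of shape (n_folds, n_Cs) of cross-validated scores.
cv_scores = list(lasso_cv.scores_.values())[0]   # shape (n_folds, n_Cs)
mean_scores = cv_scores.mean(axis=0)             # average score per candidate C
for C, score in zip(lasso_cv.Cs_, mean_scores):
    print(f"alpha = {1 / C:.4f}   mean CV neg_log_loss = {score:.4f}")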
np.exp(lasso_cv.intercept_[0])
np.float64(1.052809822818754)
coef_lasso = coef_lasso.query('coefficient != 0')
Home-Team Effect
intercept
0.05146261173037637
np.exp(intercept)
np.float64(1.052809822818754)
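Since the model predicts whether a goal is scored by the home team, the intercept is the baseline log-odds of a home goal with all other predictors at zero; exp(intercept) of about 1.05 says the odds slightly favour the home team. A small sketch (assuming the intercept computed above) converts this to a probability:
# Sketch: baseline home-goal probability implied by the intercept.
baseline_odds = np.exp(intercept)                 # ~1.053
baseline_prob = baseline_odds / (1 + baseline_odds)
print(f"Baseline P(home goal) = {baseline_prob:.4f}")   # ~0.513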
Player Evaluation
free_cols = ['period', 'differential', 'playoffs',
             'S6v5','S6v4','S5v4','S5v3','S4v3','SNG',
             'ATL.20022003',
             'ANA.20022003','BOS.20022003','BUF.20022003','CAR.20022003',
             'CBJ.20022003','CGY.20022003','CHI.20022003','COL.20022003',
             'DAL.20022003','DET.20022003','EDM.20022003','FLA.20022003',
             'LAK.20022003','MIN.20022003','MTL.20022003','NJD.20022003',
             'NSH.20022003','NYI.20022003','NYR.20022003','OTT.20022003',
             'PHI.20022003','PHX.20022003','PIT.20022003','SJS.20022003',
             'STL.20022003','TBL.20022003','TOR.20022003','VAN.20022003',
             'WPG.20022003','WSH.20022003']
coef_lasso = coef_lasso[ ~coef_lasso['predictor'].isin(free_cols) ]
coef_lasso.sort_values('coefficient', ascending = False)
| | predictor | coefficient | exp_coefficient |
---|---|---|---|
612 | PETER_FORSBERG | 0.968515 | 2.634030 |
449 | BRAD_RICHARDS | 0.937330 | 2.553155 |
729 | CHRIS_NEIL | 0.703128 | 2.020062 |
653 | NICKLAS_LIDSTROM | 0.556389 | 1.744362 |
149 | PETER_SCHAEFER | 0.544168 | 1.723174 |
... | ... | ... | ... |
774 | MARTIN_LAPOINTE | -0.603984 | 0.546630 |
599 | JAMIE_MCLENNAN | -0.625651 | 0.534913 |
421 | BEN_CLYMER | -0.747552 | 0.473524 |
484 | KARLIS_SKRASTINS | -0.754297 | 0.470341 |
668 | PETER_WORRELL | -0.815675 | 0.442341 |
224 rows × 3 columns
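After dropping the team and situation columns in free_cols, the 224 predictors remaining in coef_lasso are player indicators with nonzero lasso coefficients. A quick sanity-check sketch (assuming the objects built above):
# Sketch: confirm only player columns with nonzero effects remain.
print(len(coef_lasso))                                   # 224
print((coef_lasso['coefficient'] == 0).any())            # False
print(coef_lasso['predictor'].isin(free_cols).any())     # False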
PM vs. Lasso-based Expected PM
import numpy as np
import pandas as pd
# Instead of deriving player_cols from nhl, use the predictors kept in coef_lasso.
# These are the player indicators with nonzero lasso coefficients (the team and
# situation columns in free_cols were removed above).
player_cols = list(coef_lasso['predictor'])

# Subset the nhl DataFrame to include only these player columns.
nhl_players = nhl[player_cols]
# -------------------------------------
# 1. Traditional Plus-Minus (PM)
# -------------------------------------
# Each entry of nhl_players is:
#   +1 if the goal was for the player's team,
#   -1 if it was against,
#    0 if the player was not on the ice.
# The traditional PM for each player is simply the column sum.
traditional_pm = nhl_players.sum()
# -------------------------------------
# 2. Total Number of Goals (ng)
# -------------------------------------
# For each player, the total goals they're on the ice for is the sum of the absolute values.
ng = nhl_players.abs().sum()
# -------------------------------------
# 3. Expected Plus-Minus (ppm)
# -------------------------------------
# Create beta as a Series from coef_lasso:
beta = pd.Series(coef_lasso['coefficient'].values, index=coef_lasso['predictor'])
# Convert beta to a probability using the logistic function:
# p = exp(beta) / (1 + exp(beta))
p = np.exp(beta) / (1 + np.exp(beta))
# Compute expected plus-minus:
# ppm = ng * (p) - ng * (1-p) = ng * (2p - 1)
expected_pm = ng * p - ng * (1 - p)
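As a quick check of the formula ppm = ng * (2p - 1), plugging in one player's numbers reproduces the value shown in the tables below (PETER_FORSBERG: ng = 170, p ≈ 0.7248, ppm ≈ 76.44). A small sketch, assuming the ng and p Series just computed:
# Sketch: worked check of the expected plus-minus for a single player.
ng_f = ng['PETER_FORSBERG']            # 170 goals with Forsberg on the ice
p_f = p['PETER_FORSBERG']              # ~0.7248
print(ng_f * (2 * p_f - 1))            # ~76.44, matches expected_pm['PETER_FORSBERG']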
# -------------------------------------
# 4. Combine and Display the Results
# -------------------------------------
effect_df = pd.DataFrame({
    'pm': traditional_pm,
    'ng': ng,
    'beta': beta,
    'exp_beta': np.exp(beta),
    'p': p,
    'ppm': expected_pm
})
# Display the full table of player effects.
effect_df
| | pm | ng | beta | exp_beta | p | ppm |
---|---|---|---|---|---|---|
MIKE_COMRIE | -19 | 121 | -0.108854 | 0.896862 | 0.472813 | -6.579147 |
DERIAN_HATCHER | 8 | 178 | 0.207467 | 1.230557 | 0.551681 | 18.398612 |
MANNY_MALHOTRA | 1 | 31 | -0.277163 | 0.757931 | 0.431149 | -4.268742 |
TODD_MARCHANT | 0 | 134 | 0.220145 | 1.246258 | 0.554815 | 14.690460 |
RICHARD_MATVICHUK | 7 | 91 | -0.185825 | 0.830419 | 0.453677 | -8.430777 |
... | ... | ... | ... | ... | ... | ... |
SAMUEL_PAHLSSON | 21 | 61 | 0.031758 | 1.032268 | 0.507939 | 0.968551 |
KYLE_MCLAREN | -26 | 26 | -0.011054 | 0.989007 | 0.497237 | -0.143694 |
BRIAN_POTHIER | 1 | 15 | 0.474937 | 1.607914 | 0.616552 | 3.496551 |
RYAN_BAYDA | -5 | 39 | 0.283464 | 1.327721 | 0.570395 | 5.490826 |
BURKE_HENRY | 4 | 30 | -0.261737 | 0.769713 | 0.434937 | -3.903798 |
224 rows × 6 columns
top10_by_pm = effect_df.nlargest(10, 'pm', keep = 'all')
top10_by_pm
| | pm | ng | beta | exp_beta | p | ppm |
---|---|---|---|---|---|---|
CHRIS_OSGOOD | 94 | 226 | 0.138557 | 1.148616 | 0.534584 | 15.631994 |
SERGEI_GONCHAR | 78 | 196 | 0.181795 | 1.199368 | 0.545324 | 17.766998 |
MARTY_TURCO | 74 | 252 | 0.435821 | 1.546232 | 0.607263 | 54.060453 |
JOSE_THEODORE | 58 | 268 | -0.012309 | 0.987766 | 0.496923 | -1.649451 |
PATRICK_ROY | 54 | 318 | 0.399092 | 1.490471 | 0.598470 | 62.626629 |
ROBERT_LANG | 49 | 125 | 0.406744 | 1.501919 | 0.600307 | 25.076706 |
JEAN-SEBASTIEN_GIGUERE | 47 | 305 | 0.341136 | 1.406544 | 0.584466 | 51.524524 |
SEAN_BURKE | 40 | 74 | 0.161521 | 1.175298 | 0.540293 | 5.963336 |
DAN_CLOUTIER | 39 | 337 | 0.054591 | 1.056109 | 0.513644 | 9.196343 |
ED_BELFOUR | 38 | 316 | 0.054281 | 1.055782 | 0.513567 | 8.574336 |
top10_by_beta = effect_df.nlargest(10, 'beta', keep = 'all')
top10_by_beta
| | pm | ng | beta | exp_beta | p | ppm |
---|---|---|---|---|---|---|
PETER_FORSBERG | 16 | 170 | 0.968515 | 2.634030 | 0.724823 | 76.439952 |
BRAD_RICHARDS | -64 | 66 | 0.937330 | 2.553155 | 0.718560 | 28.849915 |
CHRIS_NEIL | -2 | 32 | 0.703128 | 2.020062 | 0.668881 | 10.808380 |
NICKLAS_LIDSTROM | 12 | 222 | 0.556389 | 1.744362 | 0.635617 | 60.213773 |
PETER_SCHAEFER | 5 | 83 | 0.544168 | 1.723174 | 0.632781 | 22.041725 |
LADISLAV_NAGY | 25 | 103 | 0.527763 | 1.695136 | 0.628961 | 26.566011 |
MARTIN_HAVLAT | 22 | 116 | 0.481076 | 1.617814 | 0.618002 | 27.376455 |
BRIAN_POTHIER | 1 | 15 | 0.474937 | 1.607914 | 0.616552 | 3.496551 |
DAVID_VYBORNY | -8 | 98 | 0.441356 | 1.554814 | 0.608582 | 21.282089 |
MARTY_TURCO | 74 | 252 | 0.435821 | 1.546232 | 0.607263 | 54.060453 |
top10_by_ppm = effect_df.nlargest(10, 'ppm', keep = 'all')
top10_by_ppm
| | pm | ng | beta | exp_beta | p | ppm |
---|---|---|---|---|---|---|
PETER_FORSBERG | 16 | 170 | 0.968515 | 2.634030 | 0.724823 | 76.439952 |
PATRICK_ROY | 54 | 318 | 0.399092 | 1.490471 | 0.598470 | 62.626629 |
NICKLAS_LIDSTROM | 12 | 222 | 0.556389 | 1.744362 | 0.635617 | 60.213773 |
MARTY_TURCO | 74 | 252 | 0.435821 | 1.546232 | 0.607263 | 54.060453 |
JEAN-SEBASTIEN_GIGUERE | 47 | 305 | 0.341136 | 1.406544 | 0.584466 | 51.524524 |
ROMAN_CECHMANEK | 22 | 244 | 0.355871 | 1.427423 | 0.588040 | 42.963741 |
MIKE_DUNHAM | 14 | 212 | 0.371373 | 1.449723 | 0.591791 | 38.919209 |
SANDIS_OZOLINSH | 20 | 184 | 0.407174 | 1.502566 | 0.600410 | 36.950905 |
ROMAN_TUREK | 20 | 242 | 0.288680 | 1.334665 | 0.571673 | 34.689751 |
ED_JOVANOVSKI | -2 | 172 | 0.369647 | 1.447223 | 0.591374 | 31.432520 |
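The two rankings can also be compared visually. A minimal plotting sketch (using the matplotlib and seaborn imports from the top of the notebook and the effect_df built above) puts traditional PM on one axis and the lasso-based expected PM on the other:
# Sketch: scatter of traditional plus-minus vs. lasso-based expected plus-minus.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=effect_df, x='pm', y='ppm', ax=ax)
ax.axhline(0, color='grey', linewidth=0.5)
ax.axvline(0, color='grey', linewidth=0.5)
ax.set_xlabel('Traditional plus-minus (pm)')
ax.set_ylabel('Lasso-based expected plus-minus (ppm)')
ax.set_title('PM vs. Lasso-based Expected PM, 2002-2003 season')
plt.show()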