from google.colab import data_table
data_table.enable_dataframe_formatter()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
Lasso Logistic Regression
NHL Player Evaluation
# 2002-2003 season
nhl = pd.read_csv('https://bcdanl.github.io/data/NHL_data_2002_2003.csv')
nhl
Warning: Total number of columns (964) exceeds max_columns (20). Falling back to pandas display.
| | homegoal | period | differential | playoffs | S6v5 | S6v4 | S6v3 | S5v4 | S5v3 | S4v3 | ... | CHRIS_PRONGER | KURTIS_FOSTER | MILAN_BARTOVIC | JOE_DIPENTA | KAMIL_PIROS | KENT_MCDONELL | BILL_MUCKALT | MATT_STAJAN | TOMI_PETTINEN | PETER_SEJNA |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 1 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 2 | -2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 2 | -1 | 0 | 0 | 0 | 0 | -1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 3 | -2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5300 | 1 | 1 | 2 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5301 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5302 | 1 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5303 | 1 | 3 | 3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5304 | 0 | 3 | 4 | 1 | 0 | 0 | 0 | -1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5305 rows × 964 columns
nhl.columns
Index(['homegoal', 'period', 'differential', 'playoffs', 'S6v5', 'S6v4',
'S6v3', 'S5v4', 'S5v3', 'S4v3',
...
'CHRIS_PRONGER', 'KURTIS_FOSTER', 'MILAN_BARTOVIC', 'JOE_DIPENTA',
'KAMIL_PIROS', 'KENT_MCDONELL', 'BILL_MUCKALT', 'MATT_STAJAN',
'TOMI_PETTINEN', 'PETER_SEJNA'],
dtype='object', length=964)
# Split into train and test (70% train, 30% test)
# Using a fixed random state for reproducibility (seed = 24351)
nhl_train, nhl_test = train_test_split(nhl, test_size=0.3, random_state=24351)

# Define predictors: all columns except the outcome "homegoal"
predictors = [col for col in nhl.columns if col not in ['homegoal']]
X_train = nhl_train[predictors]
X_test = nhl_test[predictors]
# Outcome variable
y_train = nhl_train['homegoal']
y_test = nhl_test['homegoal']
# Revised LogisticRegressionCV with fewer candidate Cs, fewer folds, and looser tolerance:
lasso_cv = LogisticRegressionCV(
    Cs=10,            # Fewer candidate values
    cv=3,             # Fewer CV folds
    penalty='l1',
    solver='saga',
    max_iter=1000,
    tol=1e-3,         # Looser tolerance for faster convergence
    scoring='neg_log_loss'
)
lasso_cv.fit(X_train.values, y_train.values)
print("Best alpha:", 1 / lasso_cv.C_[0])
intercept = float(lasso_cv.intercept_[0])

coef_lasso = pd.DataFrame({
    'predictor': list(X_train.columns),
    'coefficient': list(lasso_cv.coef_[0]),
    'exp_coefficient': np.exp(list(lasso_cv.coef_[0])),
})
Best alpha: 2.782559402207126
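In scikit-learn, C is the inverse of the regularization strength, so the reported alpha is simply 1 / lasso_cv.C_[0]. To see how the cross-validated score varied across the candidate penalties, a minimal sketch (assuming the fitted lasso_cv object above; attribute shapes follow LogisticRegressionCV's documented Cs_ and scores_ attributes) is:
# Sketch: inspect the CV search over the candidate C values.
# lasso_cv.Cs_ holds the candidate C values; lasso_cv.scores_ maps the positive
# class to an array of shape (n_folds, n_Cs) of cross-validated scores.
cv_scores = list(lasso_cv.scores_.values())[0]   # shape (n_folds, n_Cs)
mean_scores = cv_scores.mean(axis=0)             # average score per candidate C
for C, score in zip(lasso_cv.Cs_, mean_scores):
    print(f"alpha = {1 / C:.4f}   mean CV neg_log_loss = {score:.4f}")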
np.exp(lasso_cv.intercept_[0])
np.float64(1.052809822818754)
coef_lasso = coef_lasso.query('coefficient != 0')
Home-Team Effect
intercept
0.05146261173037637
np.exp(intercept)
np.float64(1.052809822818754)
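Since the model predicts whether a goal is scored by the home team, the intercept is the baseline log-odds of a home goal with all other predictors at zero; exp(intercept) of about 1.05 says the odds slightly favour the home team. A small sketch (assuming the intercept computed above) converts this to a probability:
# Sketch: baseline home-goal probability implied by the intercept.
baseline_odds = np.exp(intercept)                 # ~1.053
baseline_prob = baseline_odds / (1 + baseline_odds)
print(f"Baseline P(home goal) = {baseline_prob:.4f}")   # ~0.513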
Player Evaluation
free_cols = ['period', 'differential', 'playoffs',
             'S6v5','S6v4','S5v4','S5v3','S4v3','SNG',
             'ATL.20022003',
             'ANA.20022003','BOS.20022003','BUF.20022003','CAR.20022003',
             'CBJ.20022003','CGY.20022003','CHI.20022003','COL.20022003',
             'DAL.20022003','DET.20022003','EDM.20022003','FLA.20022003',
             'LAK.20022003','MIN.20022003','MTL.20022003','NJD.20022003',
             'NSH.20022003','NYI.20022003','NYR.20022003','OTT.20022003',
             'PHI.20022003','PHX.20022003','PIT.20022003','SJS.20022003',
             'STL.20022003','TBL.20022003','TOR.20022003','VAN.20022003',
             'WPG.20022003','WSH.20022003']
coef_lasso = coef_lasso[ ~coef_lasso['predictor'].isin(free_cols) ]
coef_lasso.sort_values('coefficient', ascending = False)
| | predictor | coefficient | exp_coefficient |
---|---|---|---|
612 | PETER_FORSBERG | 0.968515 | 2.634030 |
449 | BRAD_RICHARDS | 0.937330 | 2.553155 |
729 | CHRIS_NEIL | 0.703128 | 2.020062 |
653 | NICKLAS_LIDSTROM | 0.556389 | 1.744362 |
149 | PETER_SCHAEFER | 0.544168 | 1.723174 |
... | ... | ... | ... |
774 | MARTIN_LAPOINTE | -0.603984 | 0.546630 |
599 | JAMIE_MCLENNAN | -0.625651 | 0.534913 |
421 | BEN_CLYMER | -0.747552 | 0.473524 |
484 | KARLIS_SKRASTINS | -0.754297 | 0.470341 |
668 | PETER_WORRELL | -0.815675 | 0.442341 |
224 rows × 3 columns
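After dropping the team and situation columns in free_cols, the 224 predictors remaining in coef_lasso are player indicators with nonzero lasso coefficients. A quick sanity-check sketch (assuming the objects built above):
# Sketch: confirm only player columns with nonzero effects remain.
print(len(coef_lasso))                                   # 224
print((coef_lasso['coefficient'] == 0).any())            # False
print(coef_lasso['predictor'].isin(free_cols).any())     # False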
PM vs. Lasso-based Expected PM
import numpy as np
import pandas as pd
# Instead of deriving player_cols from nhl, use the predictors kept in coef_lasso.
# These are the player indicators with nonzero lasso coefficients (the team and
# situation columns in free_cols were removed above).
player_cols = list(coef_lasso['predictor'])

# Subset the nhl DataFrame to include only these player columns.
nhl_players = nhl[player_cols]
# -------------------------------------
# 1. Traditional Plus-Minus (PM)
# -------------------------------------
# Each entry of nhl_players is:
#   +1 if the goal was for the player's team,
#   -1 if it was against,
#    0 if the player was not on the ice.
# The traditional PM for each player is simply the column sum.
traditional_pm = nhl_players.sum()
# -------------------------------------
# 2. Total Number of Goals (ng)
# -------------------------------------
# For each player, the total goals they're on the ice for is the sum of the absolute values.
ng = nhl_players.abs().sum()
# -------------------------------------
# 3. Expected Plus-Minus (ppm)
# -------------------------------------
# Create beta as a Series from coef_lasso:
beta = pd.Series(coef_lasso['coefficient'].values, index=coef_lasso['predictor'])
# Convert beta to a probability using the logistic function:
# p = exp(beta) / (1 + exp(beta))
p = np.exp(beta) / (1 + np.exp(beta))
# Compute expected plus-minus:
# ppm = ng * (p) - ng * (1-p) = ng * (2p - 1)
expected_pm = ng * p - ng * (1 - p)
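As a quick check of the formula ppm = ng * (2p - 1), plugging in one player's numbers reproduces the value shown in the tables below (PETER_FORSBERG: ng = 170, p ≈ 0.7248, ppm ≈ 76.44). A small sketch, assuming the ng and p Series just computed:
# Sketch: worked check of the expected plus-minus for a single player.
ng_f = ng['PETER_FORSBERG']            # 170 goals with Forsberg on the ice
p_f = p['PETER_FORSBERG']              # ~0.7248
print(ng_f * (2 * p_f - 1))            # ~76.44, matches expected_pm['PETER_FORSBERG']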
# -------------------------------------
# 4. Combine and Display the Results
# -------------------------------------
effect_df = pd.DataFrame({
    'pm': traditional_pm,
    'ng': ng,
    'beta': beta,
    'exp_beta': np.exp(beta),
    'p': p,
    'ppm': expected_pm
})
# Display the full table of player effects.
effect_df
| | pm | ng | beta | exp_beta | p | ppm |
---|---|---|---|---|---|---|
MIKE_COMRIE | -19 | 121 | -0.108854 | 0.896862 | 0.472813 | -6.579147 |
DERIAN_HATCHER | 8 | 178 | 0.207467 | 1.230557 | 0.551681 | 18.398612 |
MANNY_MALHOTRA | 1 | 31 | -0.277163 | 0.757931 | 0.431149 | -4.268742 |
TODD_MARCHANT | 0 | 134 | 0.220145 | 1.246258 | 0.554815 | 14.690460 |
RICHARD_MATVICHUK | 7 | 91 | -0.185825 | 0.830419 | 0.453677 | -8.430777 |
... | ... | ... | ... | ... | ... | ... |
SAMUEL_PAHLSSON | 21 | 61 | 0.031758 | 1.032268 | 0.507939 | 0.968551 |
KYLE_MCLAREN | -26 | 26 | -0.011054 | 0.989007 | 0.497237 | -0.143694 |
BRIAN_POTHIER | 1 | 15 | 0.474937 | 1.607914 | 0.616552 | 3.496551 |
RYAN_BAYDA | -5 | 39 | 0.283464 | 1.327721 | 0.570395 | 5.490826 |
BURKE_HENRY | 4 | 30 | -0.261737 | 0.769713 | 0.434937 | -3.903798 |
224 rows × 6 columns
top10_by_pm = effect_df.nlargest(10, 'pm', keep = 'all')
top10_by_pm
| | pm | ng | beta | exp_beta | p | ppm |
---|---|---|---|---|---|---|
CHRIS_OSGOOD | 94 | 226 | 0.138557 | 1.148616 | 0.534584 | 15.631994 |
SERGEI_GONCHAR | 78 | 196 | 0.181795 | 1.199368 | 0.545324 | 17.766998 |
MARTY_TURCO | 74 | 252 | 0.435821 | 1.546232 | 0.607263 | 54.060453 |
JOSE_THEODORE | 58 | 268 | -0.012309 | 0.987766 | 0.496923 | -1.649451 |
PATRICK_ROY | 54 | 318 | 0.399092 | 1.490471 | 0.598470 | 62.626629 |
ROBERT_LANG | 49 | 125 | 0.406744 | 1.501919 | 0.600307 | 25.076706 |
JEAN-SEBASTIEN_GIGUERE | 47 | 305 | 0.341136 | 1.406544 | 0.584466 | 51.524524 |
SEAN_BURKE | 40 | 74 | 0.161521 | 1.175298 | 0.540293 | 5.963336 |
DAN_CLOUTIER | 39 | 337 | 0.054591 | 1.056109 | 0.513644 | 9.196343 |
ED_BELFOUR | 38 | 316 | 0.054281 | 1.055782 | 0.513567 | 8.574336 |
top10_by_beta = effect_df.nlargest(10, 'beta', keep = 'all')
top10_by_beta
| | pm | ng | beta | exp_beta | p | ppm |
---|---|---|---|---|---|---|
PETER_FORSBERG | 16 | 170 | 0.968515 | 2.634030 | 0.724823 | 76.439952 |
BRAD_RICHARDS | -64 | 66 | 0.937330 | 2.553155 | 0.718560 | 28.849915 |
CHRIS_NEIL | -2 | 32 | 0.703128 | 2.020062 | 0.668881 | 10.808380 |
NICKLAS_LIDSTROM | 12 | 222 | 0.556389 | 1.744362 | 0.635617 | 60.213773 |
PETER_SCHAEFER | 5 | 83 | 0.544168 | 1.723174 | 0.632781 | 22.041725 |
LADISLAV_NAGY | 25 | 103 | 0.527763 | 1.695136 | 0.628961 | 26.566011 |
MARTIN_HAVLAT | 22 | 116 | 0.481076 | 1.617814 | 0.618002 | 27.376455 |
BRIAN_POTHIER | 1 | 15 | 0.474937 | 1.607914 | 0.616552 | 3.496551 |
DAVID_VYBORNY | -8 | 98 | 0.441356 | 1.554814 | 0.608582 | 21.282089 |
MARTY_TURCO | 74 | 252 | 0.435821 | 1.546232 | 0.607263 | 54.060453 |
top10_by_ppm = effect_df.nlargest(10, 'ppm', keep = 'all')
top10_by_ppm
| | pm | ng | beta | exp_beta | p | ppm |
---|---|---|---|---|---|---|
PETER_FORSBERG | 16 | 170 | 0.968515 | 2.634030 | 0.724823 | 76.439952 |
PATRICK_ROY | 54 | 318 | 0.399092 | 1.490471 | 0.598470 | 62.626629 |
NICKLAS_LIDSTROM | 12 | 222 | 0.556389 | 1.744362 | 0.635617 | 60.213773 |
MARTY_TURCO | 74 | 252 | 0.435821 | 1.546232 | 0.607263 | 54.060453 |
JEAN-SEBASTIEN_GIGUERE | 47 | 305 | 0.341136 | 1.406544 | 0.584466 | 51.524524 |
ROMAN_CECHMANEK | 22 | 244 | 0.355871 | 1.427423 | 0.588040 | 42.963741 |
MIKE_DUNHAM | 14 | 212 | 0.371373 | 1.449723 | 0.591791 | 38.919209 |
SANDIS_OZOLINSH | 20 | 184 | 0.407174 | 1.502566 | 0.600410 | 36.950905 |
ROMAN_TUREK | 20 | 242 | 0.288680 | 1.334665 | 0.571673 | 34.689751 |
ED_JOVANOVSKI | -2 | 172 | 0.369647 | 1.447223 | 0.591374 | 31.432520 |
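The two rankings can also be compared visually. A minimal plotting sketch (using the matplotlib and seaborn imports from the top of the notebook and the effect_df built above) puts traditional PM on one axis and the lasso-based expected PM on the other:
# Sketch: scatter of traditional plus-minus vs. lasso-based expected plus-minus.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=effect_df, x='pm', y='ppm', ax=ax)
ax.axhline(0, color='grey', linewidth=0.5)
ax.axvline(0, color='grey', linewidth=0.5)
ax.set_xlabel('Traditional plus-minus (pm)')
ax.set_ylabel('Lasso-based expected plus-minus (ppm)')
ax.set_title('PM vs. Lasso-based Expected PM, 2002-2003 season')
plt.show()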