import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
Homework 4 - Part 2: Tree-based Models
MLB Batting
= pd.read_csv("https://bcdanl.github.io/data/mlb_battings_2024.csv")
mlb_battings_2024 mlb_battings_2024.shape
(230, 19)
mlb_battings_2024
 | g | pa | hr | r | rbi | sb | bb_percent | k_percent | iso | babip | avg | obp | slg | w_oba | w_rc | bs_r | off | def | war |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 421 | 1858 | 157 | 334 | 350 | 29 | 17.9 | 25.6 | 0.370 | 0.341 | 0.304 | 0.433 | 0.674 | 0.455 | 202 | -1.8 | 216.3 | -16.4 | 27.0 |
1 | 473 | 2082 | 90 | 313 | 296 | 76 | 8.7 | 19.1 | 0.207 | 0.291 | 0.266 | 0.340 | 0.473 | 0.350 | 128 | 8.9 | 77.5 | 43.1 | 19.6 |
2 | 451 | 1996 | 132 | 326 | 320 | 90 | 12.2 | 23.3 | 0.309 | 0.332 | 0.296 | 0.385 | 0.605 | 0.411 | 168 | 10.6 | 169.9 | -48.1 | 19.2 |
3 | 467 | 2076 | 72 | 329 | 291 | 45 | 11.3 | 15.6 | 0.206 | 0.347 | 0.314 | 0.399 | 0.520 | 0.391 | 153 | 4.7 | 136.8 | -25.2 | 18.6 |
4 | 469 | 2035 | 82 | 304 | 285 | 110 | 6.2 | 17.8 | 0.217 | 0.316 | 0.288 | 0.336 | 0.505 | 0.356 | 128 | 19.3 | 84.4 | 28.9 | 18.5 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
225 | 412 | 1587 | 51 | 166 | 162 | 12 | 10.3 | 26.1 | 0.177 | 0.272 | 0.221 | 0.303 | 0.397 | 0.305 | 91 | -2.2 | -18.0 | -41.9 | -0.8 |
226 | 417 | 1603 | 53 | 159 | 189 | 13 | 5.5 | 25.1 | 0.158 | 0.300 | 0.247 | 0.289 | 0.405 | 0.300 | 87 | -5.6 | -30.8 | -33.2 | -1.0 |
227 | 288 | 1017 | 25 | 119 | 85 | 15 | 8.3 | 20.7 | 0.120 | 0.263 | 0.223 | 0.293 | 0.343 | 0.282 | 73 | -1.3 | -33.5 | -18.2 | -1.8 |
228 | 313 | 1105 | 36 | 124 | 133 | 15 | 6.1 | 24.2 | 0.160 | 0.261 | 0.221 | 0.268 | 0.381 | 0.280 | 75 | -0.2 | -32.8 | -24.0 | -2.1 |
229 | 381 | 1255 | 35 | 93 | 141 | 3 | 7.8 | 20.6 | 0.140 | 0.262 | 0.227 | 0.291 | 0.368 | 0.288 | 84 | -5.1 | -28.4 | -34.7 | -2.2 |
230 rows × 19 columns
Regression Tree
# Set a random seed for reproducibility (like R's set.seed)
np.random.seed(42120532)
train, test = train_test_split(mlb_battings_2024, test_size=0.20, random_state=42120532)

X_train = train.drop(columns=["war"])
y_train = train["war"]

X_test = test.drop(columns=["war"])
y_test = test["war"]
# In scikit-learn, we can use min_impurity_decrease=0.005 for a similar effect.
tree_model = DecisionTreeRegressor(min_impurity_decrease=0.005, random_state=42)

# Fit the model using all predictors (all columns except 'war')
tree_model.fit(X_train, y_train)

# Predict on training and test sets
y_train_pred = tree_model.predict(X_train)
y_test_pred = tree_model.predict(X_test)

# Calculate MSE
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

# Print the results
print(f"Training MSE: {mse_train:.3f}")
print(f"Test MSE: {mse_test:.3f}")

# Plot the initial regression tree
plt.figure(figsize=(12, 8), dpi=300)
plot_tree(tree_model, feature_names=X_train.columns, filled=True, rounded=True)
plt.title("Regression Tree for war (Initial Fit)")
plt.show()
Training MSE: 0.092
Test MSE: 6.154
# Refit the tree with max_depth=3 to limit its size (min_impurity_decrease kept at 0.005).
tree_model = DecisionTreeRegressor(max_depth=3, min_impurity_decrease=0.005, random_state=42)

# Fit the model using all predictors (all columns except 'war')
tree_model.fit(X_train, y_train)

# Predict on training and test sets
y_train_pred = tree_model.predict(X_train)
y_test_pred = tree_model.predict(X_test)

# Calculate MSE
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

# Print the results
print(f"Training MSE: {mse_train:.3f}")
print(f"Test MSE: {mse_test:.3f}")

# Plot the depth-limited regression tree
plt.figure(figsize=(12, 8), dpi=300)
plot_tree(tree_model, feature_names=X_train.columns, filled=True, rounded=True)
plt.title("Regression Tree for war (max_depth = 3)")
plt.show()
Training MSE: 3.61641995
Test MSE: 7.20819639
Pruning
# Obtain the cost-complexity pruning path from the initial tree
path = tree_model.cost_complexity_pruning_path(X_train, y_train)  # Get candidate ccp_alpha values and corresponding impurities
ccp_alphas = path.ccp_alphas   # Candidate pruning parameters (alpha values)
impurities = path.impurities   # Impurity values at each candidate alpha

# Exclude the maximum alpha value to avoid the trivial tree (a tree with only the root)
ccp_alphas = ccp_alphas[:-1]  # Remove the last alpha value, which would prune the tree to a single node
# Set up 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)  # Initialize 10-fold CV with shuffling and a fixed random state
cv_scores = []    # List to store mean cross-validated scores (negative MSE)
leaf_nodes = []   # List to record the number of leaves for each pruned tree
sse = []          # List to record the sum of squared errors (SSE) on the training set

# Loop over each candidate alpha value to evaluate its performance
for ccp_alpha in ccp_alphas:
    # Create a DecisionTreeRegressor with the current ccp_alpha and other specified parameters
    clf = DecisionTreeRegressor(random_state=42,
                                ccp_alpha=ccp_alpha,
                                min_impurity_decrease=0.005)

    # Perform 10-fold cross-validation and compute negative mean squared error (MSE)
    scores = cross_val_score(clf, X_train, y_train,
                             cv=kf, scoring="neg_mean_squared_error")
    # Append the mean CV score for the current alpha
    cv_scores.append(np.mean(scores))

    # Fit the tree on the training data to record additional metrics
    clf.fit(X_train, y_train)
    # Record the number of leaf nodes in the tree
    leaf_nodes.append(clf.get_n_leaves())

    # Compute SSE (sum of squared errors) on the training set
    preds = clf.predict(X_train)                # Predict target values on training data
    sse.append(np.sum((y_train - preds) ** 2))  # Calculate and record SSE for the training set
# Select the best alpha based on the highest (least negative) mean CV score
best_alpha = ccp_alphas[np.argmax(cv_scores)]  # Identify the alpha with the best CV performance
print("Best alpha:", best_alpha)               # Print the best alpha value

# Train the final pruned tree using the best alpha found
final_tree = DecisionTreeRegressor(random_state=42,
                                   ccp_alpha=best_alpha,
                                   min_impurity_decrease=0.005)
# Fit the final model on the training data
final_tree.fit(X_train, y_train)

len(ccp_alphas)
preds_train = final_tree.predict(train.drop(columns=["war"]))
mse_train = mean_squared_error(y_train, preds_train)
print("Train MSE:", mse_train)

preds = final_tree.predict(test.drop(columns=["war"]))
mse = mean_squared_error(y_test, preds)
print("Test MSE:", mse)
# Plot the pruned tree.
plt.figure(figsize=(16, 12), dpi=300)
plot_tree(final_tree, feature_names=X_train.columns, filled=True, rounded=True)
plt.title("Pruned Regression Tree for war")
plt.show()

# Summary of the final tree
print("Number of leaves in the pruned tree:", final_tree.get_n_leaves())
print("Tree depth:", final_tree.get_depth())
Best alpha: 0.012181159420289953
Train MSE: 0.21034767647811128
Test MSE: 5.97171015229301
Number of leaves in the pruned tree: 47
Tree depth: 9
# Plot the average cross-validated MSE against the number of leaf nodes
negative_cv_scores = -np.array(cv_scores)

plt.figure(figsize=(8, 6), dpi=150)
plt.plot(leaf_nodes, negative_cv_scores, marker='o', linestyle='-')
# Add a vertical line at the leaf count of the final pruned tree
plt.axvline(x=final_tree.get_n_leaves(), color='red', linestyle='--',
            label=f'Leaf Nodes = {final_tree.get_n_leaves()}')
plt.xlabel("Number of Leaf Nodes")
plt.ylabel("Mean CV MSE")
plt.title("CV Error vs. Number of Leaf Nodes")
plt.legend()
plt.grid(True)
plt.show()
# Plot the SSE on the training set against the number of leaf nodes
plt.figure(figsize=(8, 6), dpi=150)
plt.plot(leaf_nodes, sse, marker='o', linestyle='-')
plt.xlabel("Number of Leaf Nodes")
plt.ylabel("SSE")
plt.title("SSE vs. Number of Leaf Nodes")
plt.grid(True)
plt.show()
Random Forest
# Build the Random Forest model
# max_features=13 means that at each split the algorithm randomly considers 13 predictors.
rf = RandomForestRegressor(max_features=13,    # Use 13 features at each split
                           n_estimators=500,   # Number of trees in the forest
                           random_state=42,
                           oob_score=True)     # Use out-of-bag samples to estimate error

rf.fit(X_train, y_train)
# Print the model details
print("Random Forest Model:")
print(rf)
# Output the model details (feature importances, OOB score, etc.)
print("Out-of-bag score:", rf.oob_score_) # A rough estimate of generalization error
# Generate predictions on training and testing sets
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# Calculate Mean Squared Errors (MSE) for both sets
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
print("Train MSE:", train_mse)
print("Test MSE:", test_mse)
# Optional: Plot predicted vs. observed values for test data
# plt.figure(figsize=(8,6), dpi=300)
# plt.scatter(y_test, y_test_pred, alpha=0.7)
# plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
# plt.xlabel("Observed medv")
# plt.ylabel("Predicted medv")
# plt.title("Random Forest: Observed vs. Predicted Values")
# plt.show()
Random Forest Model:
RandomForestRegressor(max_features=13, n_estimators=500, oob_score=True,
random_state=42)
Out-of-bag score: 0.8998284519241363
Train MSE: 0.3287167456521755
Test MSE: 3.3319365208695935
# Get feature importances from the model (equivalent to importance() in R's randomForest)
importances = rf.feature_importances_
feature_names = X_train.columns

print("Feature Importances:")
for name, imp in zip(feature_names, importances):
    print(f"{name}: {imp:.4f}")
# Plot the feature importances, similar to varImpPlot() in R's randomForest
# Sort the features by importance for a nicer plot.
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6), dpi=150)
plt.title("Feature Importances")
plt.bar(range(len(feature_names)), importances[indices], align='center')
plt.xticks(range(len(feature_names)), feature_names[indices], rotation=90)
plt.xlabel("Features")
plt.ylabel("Importance")

plt.tight_layout()
plt.show()
Feature Importances:
g: 0.0062
pa: 0.0164
hr: 0.0086
r: 0.2313
rbi: 0.0158
sb: 0.0076
bb_percent: 0.0059
k_percent: 0.0057
iso: 0.0049
babip: 0.0055
avg: 0.0062
obp: 0.0084
slg: 0.0087
w_oba: 0.0140
w_rc: 0.0430
bs_r: 0.0069
off: 0.5021
def: 0.1028
XGBoost
import xgboost as xgb
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import PartialDependenceDisplay
# Define the grid of hyperparameters
param_grid = {
    "n_estimators": list(range(20, 201, 20)),  # nrounds: 20, 40, ..., 200
    "learning_rate": [0.025, 0.05, 0.1, 0.3],  # eta
    "gamma": [0],
    "max_depth": [1, 2, 3, 4],
    "colsample_bytree": [1],
    "min_child_weight": [1],
    "subsample": [1]
}
# Initialize the XGBRegressor with the regression objective and a fixed random state for reproducibility
xgb_reg = XGBRegressor(objective="reg:squarederror", random_state=1937, verbosity=1)

# Set up GridSearchCV with 10-fold cross-validation; scoring is negative MSE
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
    verbose=1  # Adjust verbosity as needed
)

# Fit the grid search
grid_search.fit(X_train, y_train)
# Use the best estimator found by the grid search (with refit=True, it is already refit on the entire training set)
final_model = grid_search.best_estimator_

# Plot variable importance using XGBoost's plot_importance function
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(final_model, ax=ax)
plt.title("Feature Importance")
plt.show()
# Calculate MSE on the training data
y_pred_train = final_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred_train)
print("Train MSE:", train_mse)

# Calculate MSE on the test data
y_pred = final_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", test_mse)

# Print the best parameters found by GridSearchCV
best_params = grid_search.best_params_
print("Best parameters:", best_params)
Fitting 10 folds for each of 160 candidates, totalling 1600 fits
Train MSE: 0.12103999913356943
Test MSE: 0.7061771964936895
Best parameters: {'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.3, 'max_depth': 1, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}
# Plot partial dependence for the predictor 'off'
feature_names = mlb_battings_2024.drop(columns=["war"]).columns.tolist()
off_index = feature_names.index("off")

PartialDependenceDisplay.from_estimator(final_model, X_train, features=[off_index],
                                        feature_names=feature_names, kind="average")
plt.title("Partial Dependence for 'off'")
plt.show()
Predictive Performance and Overfitting
Train vs. Test MSE for Tree‑Based Models
Model | Train MSE | Test MSE |
---|---|---|
Regression tree (no depth limit) | 0.092 | 6.154 |
Regression tree (max_depth = 3) | 3.616 | 7.208 |
CV‑pruned tree | 0.210 | 5.972 |
Random forest | 0.329 | 3.332 |
Gradient boosted tree | 0.120 | 0.706 |
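The table above was compiled by hand from the MSE values printed in the sections above. A minimal sketch of how the same summary could be assembled with pandas is shown below; the numbers are copied from the printed outputs, and the `results` name is hypothetical (the notebook itself reuses `mse_train` / `test_mse` from model to model, so only the last model's values remain in memory).

import pandas as pd  # already imported at the top of the notebook

# Hypothetical summary frame; MSE values copied from the outputs printed above
results = pd.DataFrame(
    {
        "Train MSE": [0.092, 3.616, 0.210, 0.329, 0.120],
        "Test MSE":  [6.154, 7.208, 5.972, 3.332, 0.706],
    },
    index=[
        "Regression tree (no depth limit)",
        "Regression tree (max_depth = 3)",
        "CV-pruned tree",
        "Random forest",
        "Gradient boosted tree",
    ],
)
print(results)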
Predictive Performance Comparison
- Unrestricted Regression Tree
  - Train MSE: 0.092 (very low)
  - Test MSE: 6.154 (high)
  - Interpretation: Severe overfitting; the model fits noise in the training data but generalizes poorly.
- Shallow Tree (max_depth = 3)
  - Train MSE: 3.616
  - Test MSE: 7.208 (highest)
  - Interpretation: Underfitting; the model is too simple, yielding both high training and high test error.
- CV‑Pruned Tree
  - Train MSE: 0.210
  - Test MSE: 5.972
  - Interpretation: Pruning reduces complexity relative to the unrestricted tree and lowers the gap between train and test error, but substantial overfitting remains.
- Random Forest
  - Train MSE: 0.329
  - Test MSE: 3.332
  - Interpretation: Bagging many trees greatly reduces variance and overfitting compared to single trees, cutting test error by nearly half.
- Gradient Boosted Tree
  - Train MSE: 0.120
  - Test MSE: 0.706 (lowest)
  - Interpretation: Sequential boosting captures complex patterns with controlled regularization, achieving the best predictive performance with only modest overfitting.
Overfitting Analysis
Model | Train MSE | Test MSE | Abs. Difference (Test – Train) |
---|---|---|---|
Regression tree (no depth limit) | 0.092 | 6.154 | 6.062 |
Regression tree (max_depth = 3) | 3.616 | 7.208 | 3.592 |
CV‑pruned tree | 0.210 | 5.972 | 5.762 |
Random forest | 0.329 | 3.332 | 3.003 |
Gradient boosted tree | 0.120 | 0.706 | 0.586 |
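The gap column in the table above is simply Test MSE minus Train MSE. As a quick arithmetic check, it can be computed from the hypothetical `results` frame sketched after the first table:

# Overfitting gap = Test MSE - Train MSE (a larger gap means more overfitting)
results["Gap (Test - Train)"] = results["Test MSE"] - results["Train MSE"]
print(results.sort_values("Gap (Test - Train)", ascending=False))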
- Overfit severity can be gauged by the gap between Train and Test MSE:
  - Unrestricted tree: gap ≈ 6.062 → very high overfit
  - Pruned tree: gap ≈ 5.762 → still substantial overfit
  - Random forest: gap ≈ 3.003 → moderate overfit
  - Gradient boosted tree: gap ≈ 0.586 → low overfit
- Underfitting is evident in the shallow tree (max_depth = 3), which shows both high train and test error.
Overfitting Summary:
- Single trees either overfit (no limit) or underfit (too shallow).
- Pruning helps but does not eliminate overfitting entirely.
- Ensembles (random forest and boosting), particularly boosting, show minimal overfitting on the test set.