Classwork 10

Logistic Regression

Author

Byeong-Hak Choe

Published

March 12, 2025

Modified

March 12, 2025

Setup for PySpark, UDFs, and Plots

Required Libraries and SparkSession Entry Point

# Below is for an interactive display of Pandas DataFrame in Colab
from google.colab import data_table
data_table.enable_dataframe_formatter()

import pandas as pd
import numpy as np
from tabulate import tabulate  # for table summary
import scipy.stats as stats
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm  # for lowess smoothing
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve

from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, col, pow, mean, avg, when, log, sqrt, exp
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.master("local[*]").getOrCreate()


UDF for Adding Dummy Variables

def add_dummy_variables(var_name, reference_level, category_order=None):
    """
    Creates dummy variables for the specified column in the global DataFrames dtrain and dtest.
    Allows manual setting of category order.

    Parameters:
        var_name (str): The name of the categorical column (e.g., "borough_name").
        reference_level (int): Index of the category to be used as the reference (dummy omitted).
        category_order (list, optional): List of categories in the desired order. If None, categories are sorted.

    Returns:
        dummy_cols (list): List of dummy column names excluding the reference category.
        ref_category (str): The category chosen as the reference.
    """
    global dtrain, dtest

    # Get distinct categories from the training set.
    categories = dtrain.select(var_name).distinct().rdd.flatMap(lambda x: x).collect()

    # Convert booleans to strings if present.
    categories = [str(c) if isinstance(c, bool) else c for c in categories]

    # Use manual category order if provided; otherwise, sort categories.
    if category_order:
        # Ensure all categories are present in the user-defined order
        missing = set(categories) - set(category_order)
        if missing:
            raise ValueError(f"These categories are missing from your custom order: {missing}")
        categories = category_order
    else:
        categories = sorted(categories)

    # Validate reference_level
    if reference_level < 0 or reference_level >= len(categories):
        raise ValueError(f"reference_level must be between 0 and {len(categories) - 1}")

    # Define the reference category
    ref_category = categories[reference_level]
    print("Reference category (dummy omitted):", ref_category)

    # Create dummy variables for all categories
    for cat in categories:
        dummy_col_name = var_name + "_" + str(cat).replace(" ", "_")
        dtrain = dtrain.withColumn(dummy_col_name, when(col(var_name) == cat, 1).otherwise(0))
        dtest = dtest.withColumn(dummy_col_name, when(col(var_name) == cat, 1).otherwise(0))

    # List of dummy columns, excluding the reference category
    dummy_cols = [var_name + "_" + str(cat).replace(" ", "_") for cat in categories if cat != ref_category]

    return dummy_cols, ref_category


# Example usage without category_order:
# dummy_cols_year, ref_category_year = add_dummy_variables('year', 0)

# Example usage with category_order:
# custom_order_wkday = ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday']
# dummy_cols_wkday, ref_category_wkday = add_dummy_variables('wkday', reference_level=0, category_order = custom_order_wkday)

Marginal Effect Plot

# Increase figure size to prevent overlapping
plt.figure(figsize=(10, 6))

# Plot using the DataFrame columns
plt.errorbar(df_ME["Variable"], df_ME["Marginal Effect"],
             yerr=1.96 * df_ME["Std. Error"], fmt='o', capsize=5)

# Labels and title
plt.xlabel("Terms")
plt.ylabel("Marginal Effect")
plt.title("Marginal Effect at the Mean")

# Add horizontal line at 0 for reference
plt.axhline(0, color="red", linestyle="--")

# Adjust x-axis labels to avoid overlap
plt.xticks(rotation=45, ha="right")  # Rotate and align labels to the right
plt.tight_layout()  # Adjust layout to prevent overlap

# Show plot
plt.show()



Question 1

dfpd = pd.read_csv('https://bcdanl.github.io/data/NatalRiskData.csv')
  • Convert the dfpd Pandas DataFrame into the PySpark DataFrame object with the name, df.


Variable description

Variable Type Description
atRisk Boolean TRUE (1) if 5-minute Apgar score < 7; FALSE (0) otherwise
PWGT Numeric Mother’s prepregnancy weight
UPREVIS Numeric (integer) Number of prenatal medical visits
CIG_REC Boolean TRUE (1) if smoker; FALSE (0) otherwise
GESTREC3 Categorical Two categories: <37 weeks (premature) and >=37 weeks
DPLURAL Categorical Birth plurality, three categories: single/twin/triplet+
ULD_MECO Boolean TRUE (1) if moderate/heavy fecal staining of amniotic fluid
ULD_PRECIP Boolean TRUE (1) for unusually short labor (< three hours)
ULD_BREECH Boolean TRUE (1) for breech (pelvis first) birth position
URF_DIAB Boolean TRUE (1) if mother is diabetic
URF_CHYPER Boolean TRUE (1) if mother has chronic hypertension
URF_PHYPER Boolean TRUE (1) if mother has pregnancy-related hypertension
URF_ECLAM Boolean TRUE (1) if mother experienced eclampsia: pregnancy-related seizures



Question 2

  • Divide the df DataFrame into training and test DataFrames.
    • Use dtrain and dtest for training and test DataFrames, respectively.
    • 50% of observations in the df are assigned to dtrain; the rest is assigned to dtest.



Question 3

Fit the following regression model:

Prob(atRiski=1)=G(β0+β1PWGTi+β2UPREVISi+β3CIG_RECi+β4ULD_MECOi+β5ULD_PRECIPi+β6ULD_BREECHi+β7URF_DIABi+β8URF_CHYPERi+β9URF_PHYPERi+β10URF_ECLAMi+β11GESTREC3_<_37_weeksi+β12DPLURAL_triplet_or_higheri+β13DPLURAL_twini),

where G() is

G()=exp()1+exp().

Provide the summary of the regression result.

  • Set “>= 37 weeks” as the reference level for the GESTREC3 variable.
  • Set “single” as the reference level for the DPLURAL variable.


Question 4

  • Calculate the marginal effect of each variable for an average pregnant woman.
  • Calculate the marginal effect of each variable for an average pregnant woman who smokes.


Question 5

  • Calculate the followings:
    • Confusion matrix with the threshold level 0.02
    • Accuracy
    • Precision
    • Recall
    • Specificity
    • Average rate of at-risk babies
    • Enrichment


Question 6

Visualize the variation in recall and enrichment across different threshold levels.


Question 7

  • Draw the receiver operating characteristic (ROC) curve.
  • Calculate the area under the curve (AUC).


Question 8

  • Now, suppose you are deploying the classifier in real-world scenarios. Assess its performance using the following datasets:
    • MA hospital data with a higher average rate of at-risk babies than NY hospital (dtest_MA_moreRisk)
    • MA hospital data with a lower average rate of at-risk babies than NY hospital (dtest_MA_lessRisk)
pd_dtest = dtest.toPandas()

# Set seed for reproducibility
np.random.seed(23464)

# Sample 1000 random indices from the test dataset without replacement
sample_indices = np.random.choice(pd_dtest.index, size=1000, replace=False)

# Separate the selected observations from testing data
separated = pd_dtest.loc[sample_indices]

# Remove the selected observations from the testing data
# Consider this as data from NY hospitals
pd_dtest_NY = pd_dtest.drop(sample_indices)

# Split the separated sample into at-risk and not-at-risk groups
at_risk_sample = separated[separated["atRisk"] == 1]  # Only at-risk cases
not_at_risk_sample = separated[separated["atRisk"] == 0]  # Only not-at-risk cases

# Create test sets for MA hospitals with different at-risk average rates compared to NY
pd_dtest_MA_moreRisk = pd.concat([pd_dtest_NY, at_risk_sample])  # Adds back only at-risk cases
pd_dtest_MA_lessRisk = pd.concat([pd_dtest_NY, not_at_risk_sample])  # Adds back only not-at-risk cases

# Show counts to verify results
print("Original Test Set Size:", pd_dtest.shape[0])
print("Sampled Separated Size:", separated.shape[0])
print("NY Hospitals Data Size:", pd_dtest_NY.shape[0])
print("MA More Risk Data Size:", pd_dtest_MA_moreRisk.shape[0])
print("MA Less Risk Data Size:", pd_dtest_MA_lessRisk.shape[0])

dtest_MA_moreRisk = spark.createDataFrame(pd_dtest_MA_moreRisk)
dtest_MA_lessRisk = spark.createDataFrame(pd_dtest_MA_lessRisk)
  • For MA hospitals, compute the percentage change in each of the following metrics relative to NY hospitals:
    1. Average Rate
    2. Accuracy
    3. Precision
    4. Recall
    5. False Negative Rate
    6. Specificity
    7. False Positive Rate
    8. Enrichment
    9. AUC
  • Which metric does vary a lot with average rate? Why?


Back to top