```python
# Below is for an interactive display of Pandas DataFrame in Colab
from google.colab import data_table
data_table.enable_dataframe_formatter()

import pandas as pd
import numpy as np
from tabulate import tabulate  # for table summary
import scipy.stats as stats
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm  # for lowess smoothing
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve

from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, col, pow, mean, avg, when, log, sqrt, exp
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.master("local[*]").getOrCreate()
```
UDF for Adding Dummy Variables
```python
def add_dummy_variables(var_name, reference_level, category_order=None):
    """
    Creates dummy variables for the specified column in the global DataFrames dtrain and dtest.
    Allows manual setting of category order.

    Parameters:
        var_name (str): The name of the categorical column (e.g., "borough_name").
        reference_level (int): Index of the category to be used as the reference (dummy omitted).
        category_order (list, optional): List of categories in the desired order.
            If None, categories are sorted.

    Returns:
        dummy_cols (list): List of dummy column names excluding the reference category.
        ref_category (str): The category chosen as the reference.
    """
    global dtrain, dtest

    # Get distinct categories from the training set.
    categories = dtrain.select(var_name).distinct().rdd.flatMap(lambda x: x).collect()

    # Convert booleans to strings if present.
    categories = [str(c) if isinstance(c, bool) else c for c in categories]

    # Use manual category order if provided; otherwise, sort categories.
    if category_order:
        # Ensure all categories are present in the user-defined order.
        missing = set(categories) - set(category_order)
        if missing:
            raise ValueError(f"These categories are missing from your custom order: {missing}")
        categories = category_order
    else:
        categories = sorted(categories)

    # Validate reference_level.
    if reference_level < 0 or reference_level >= len(categories):
        raise ValueError(f"reference_level must be between 0 and {len(categories) - 1}")

    # Define the reference category.
    ref_category = categories[reference_level]
    print("Reference category (dummy omitted):", ref_category)

    # Create dummy variables for all categories.
    for cat in categories:
        dummy_col_name = var_name + "_" + str(cat).replace(" ", "_")
        dtrain = dtrain.withColumn(dummy_col_name, when(col(var_name) == cat, 1).otherwise(0))
        dtest = dtest.withColumn(dummy_col_name, when(col(var_name) == cat, 1).otherwise(0))

    # List of dummy columns, excluding the reference category.
    dummy_cols = [var_name + "_" + str(cat).replace(" ", "_")
                  for cat in categories if cat != ref_category]

    return dummy_cols, ref_category

# Example usage without category_order:
# dummy_cols_year, ref_category_year = add_dummy_variables('year', 0)

# Example usage with category_order:
# custom_order_wkday = ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday']
# dummy_cols_wkday, ref_category_wkday = add_dummy_variables('wkday', reference_level=0, category_order=custom_order_wkday)
```
Marginal Effect Plot
```python
# Increase figure size to prevent overlapping
plt.figure(figsize=(10, 6))

# Plot point estimates with 95% confidence intervals
plt.errorbar(df_ME["Variable"], df_ME["Marginal Effect"],
             yerr=1.96 * df_ME["Std. Error"], fmt='o', capsize=5)

# Labels and title
plt.xlabel("Terms")
plt.ylabel("Marginal Effect")
plt.title("Marginal Effect at the Mean")

# Add horizontal line at 0 for reference
plt.axhline(0, color="red", linestyle="--")

# Rotate and right-align x-axis labels to avoid overlap
plt.xticks(rotation=45, ha="right")
plt.tight_layout()  # Adjust layout to prevent overlap

# Show plot
plt.show()
```
| Variable | Type | Description |
|---|---|---|
| GESTREC3 | Categorical | Two categories: < 37 weeks (premature) and >= 37 weeks |
| DPLURAL | Categorical | Birth plurality, three categories: single / twin / triplet+ |
| ULD_MECO | Boolean | TRUE (1) if moderate/heavy fecal staining of amniotic fluid |
| ULD_PRECIP | Boolean | TRUE (1) for unusually short labor (< three hours) |
| ULD_BREECH | Boolean | TRUE (1) for breech (pelvis first) birth position |
| URF_DIAB | Boolean | TRUE (1) if mother is diabetic |
| URF_CHYPER | Boolean | TRUE (1) if mother has chronic hypertension |
| URF_PHYPER | Boolean | TRUE (1) if mother has pregnancy-related hypertension |
| URF_ECLAM | Boolean | TRUE (1) if mother experienced eclampsia (pregnancy-related seizures) |
Question 2
Divide the df DataFrame into training and test DataFrames.
Use dtrain and dtest for training and test DataFrames, respectively.
50% of the observations in df are assigned to dtrain; the rest are assigned to dtest.
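A minimal sketch of this split, assuming the full dataset is a Spark DataFrame named `df` (the seed value here is arbitrary and only makes the split reproducible):

```python
# Randomly split df 50/50 into training and test sets.
dtrain, dtest = df.randomSplit([0.5, 0.5], seed=1234)

print("Training rows:", dtrain.count())
print("Test rows:", dtest.count())
```

Note that `randomSplit` assigns each row independently, so the two halves are only approximately equal in size.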
Question 3
Fit the following logistic regression model:

$$\log\!\left(\frac{P(\text{atRisk}=1)}{1-P(\text{atRisk}=1)}\right) = \beta_0 + \beta_1\,\text{CIG\_REC} + \beta_2\,\text{GESTREC3} + \beta_3\,\text{DPLURAL} + \beta_4\,\text{ULD\_MECO} + \beta_5\,\text{ULD\_PRECIP} + \beta_6\,\text{ULD\_BREECH} + \beta_7\,\text{URF\_DIAB} + \beta_8\,\text{URF\_CHYPER} + \beta_9\,\text{URF\_PHYPER} + \beta_{10}\,\text{URF\_ECLAM},$$

where atRisk is an indicator equal to 1 (TRUE) if the baby is scored as at risk.

Provide the summary of the regression result.
Set “>= 37 weeks” as the reference level for the GESTREC3 variable.
Set “single” as the reference level for the DPLURAL variable.
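A sketch of one way to fit this with the pieces defined above. The exact predictor list, the CIG_REC smoking indicator, and the reference_level indices are assumptions; check the reference category printed by add_dummy_variables and adjust reference_level until “>= 37 weeks” and “single” are the omitted categories.

```python
# Create dummies; verify the printed reference categories.
dummy_cols_gest, ref_gest = add_dummy_variables('GESTREC3', reference_level=1)
dummy_cols_plural, ref_plural = add_dummy_variables('DPLURAL', reference_level=0)

# Assemble all predictors into a single feature vector.
predictors = (['CIG_REC'] + dummy_cols_gest + dummy_cols_plural +
              ['ULD_MECO', 'ULD_PRECIP', 'ULD_BREECH',
               'URF_DIAB', 'URF_CHYPER', 'URF_PHYPER', 'URF_ECLAM'])
assembler = VectorAssembler(inputCols=predictors, outputCol="features")
dtrain_a = assembler.transform(dtrain)
dtest_a = assembler.transform(dtest)

# Fit a binomial GLM (logistic regression) and print the coefficient table.
glr = GeneralizedLinearRegression(labelCol="atRisk", featuresCol="features",
                                  family="binomial", link="logit")
model = glr.fit(dtrain_a)
print(model.summary)
```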
Question 4
Calculate the marginal effect of each variable for an average pregnant woman.
Calculate the marginal effect of each variable for an average pregnant woman who smokes.
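A sketch of the marginal effect at the mean for a logit model, using dP/dx_j = p(1 − p)β_j evaluated at the sample means. It assumes `model`, `predictors`, and `dtrain_a` from the Question 3 sketch; the standard errors are a rough approximation that treats p as fixed (a full delta-method calculation would also account for the variance of p) and feed the error-bar plot in the Marginal Effect Plot section above.

```python
# Sample means of the predictors in the training data.
xbar = np.array(dtrain_a.select(*[mean(c) for c in predictors]).first(), dtype=float)

beta = np.array(model.coefficients.toArray())
b0 = model.intercept

def marginal_effects(x):
    # Logit marginal effects at point x: dP/dx_j = p(1 - p) * beta_j.
    p = 1.0 / (1.0 + np.exp(-(b0 + x @ beta)))
    return p * (1.0 - p) * beta

me_mean = marginal_effects(xbar)  # average pregnant woman

# Average pregnant woman who smokes: set the (assumed) smoking indicator to 1.
xbar_smoker = xbar.copy()
xbar_smoker[predictors.index('CIG_REC')] = 1.0
me_smoker = marginal_effects(xbar_smoker)

# Approximate standard errors for the error-bar plot (treats p as fixed).
p_mean = 1.0 / (1.0 + np.exp(-(b0 + xbar @ beta)))
se_beta = np.array(model.summary.coefficientStandardErrors[:len(beta)])
df_ME = pd.DataFrame({"Variable": predictors,
                      "Marginal Effect": me_mean,
                      "Std. Error": p_mean * (1.0 - p_mean) * se_beta})
```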
Question 5
Calculate the following:
Confusion matrix with a threshold level of 0.02
Accuracy
Precision
Recall
Specificity
Average rate of at-risk babies
Enrichment
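A sketch of these calculations, assuming `model` and `dtest_a` from the Question 3 sketch; for a binomial GLM, the `prediction` column returned by transform() is the predicted probability.

```python
pred_test = model.transform(dtest_a)
pdf = pred_test.select("atRisk", "prediction").toPandas()

threshold = 0.02
pdf["pred_class"] = (pdf["prediction"] >= threshold).astype(int)

# Confusion matrix entries.
TP = int(((pdf.pred_class == 1) & (pdf.atRisk == 1)).sum())
FP = int(((pdf.pred_class == 1) & (pdf.atRisk == 0)).sum())
TN = int(((pdf.pred_class == 0) & (pdf.atRisk == 0)).sum())
FN = int(((pdf.pred_class == 0) & (pdf.atRisk == 1)).sum())
n = TP + FP + TN + FN

accuracy    = (TP + TN) / n
precision   = TP / (TP + FP)
recall      = TP / (TP + FN)          # sensitivity / true positive rate
specificity = TN / (TN + FP)
avg_rate    = (TP + FN) / n           # average rate of at-risk babies
enrichment  = precision / avg_rate    # precision relative to the base rate

print(tabulate([["TP", TP], ["FP", FP], ["TN", TN], ["FN", FN],
                ["Accuracy", accuracy], ["Precision", precision],
                ["Recall", recall], ["Specificity", specificity],
                ["Average rate", avg_rate], ["Enrichment", enrichment]],
               headers=["Metric", "Value"]))
```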
Question 6
Visualize the variation in recall and enrichment across different threshold levels.
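A sketch using sklearn's `precision_recall_curve`, reusing `pdf` from the Question 5 sketch; enrichment is precision divided by the overall at-risk rate. The x-axis limit is an assumption chosen because the interesting thresholds are small given the low base rate.

```python
prec, rec, thr = precision_recall_curve(pdf["atRisk"], pdf["prediction"])
enrich = prec / pdf["atRisk"].mean()

fig, ax1 = plt.subplots(figsize=(10, 6))
ax1.plot(thr, rec[:-1], color="tab:blue", label="Recall")
ax1.set_xlabel("Threshold")
ax1.set_ylabel("Recall", color="tab:blue")

# Second y-axis, since recall and enrichment live on different scales.
ax2 = ax1.twinx()
ax2.plot(thr, enrich[:-1], color="tab:orange", label="Enrichment")
ax2.set_ylabel("Enrichment", color="tab:orange")

plt.title("Recall and Enrichment across Threshold Levels")
ax1.set_xlim(0, 0.08)
plt.tight_layout()
plt.show()
```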
Question 7
Draw the receiver operating characteristic (ROC) curve.
Calculate the area under the curve (AUC).
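A sketch that draws the curve with sklearn's `roc_curve` and computes the AUC with Spark's `BinaryClassificationEvaluator`, reusing `pdf` and `pred_test` from the Question 5 sketch.

```python
fpr, tpr, _ = roc_curve(pdf["atRisk"], pdf["prediction"])

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="ROC curve")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Random classifier")
plt.xlabel("False positive rate (1 - specificity)")
plt.ylabel("True positive rate (recall)")
plt.title("ROC Curve")
plt.legend()
plt.show()

# AUC is rank-based, so the predicted probability column can
# serve directly as the raw prediction.
evaluator = BinaryClassificationEvaluator(labelCol="atRisk",
                                          rawPredictionCol="prediction",
                                          metricName="areaUnderROC")
print("AUC:", evaluator.evaluate(pred_test))
```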
Question 8
Now, suppose you are deploying the classifier in real-world scenarios. Assess its performance using the following datasets:
MA hospital data with a higher average rate of at-risk babies than NY hospital (dtest_MA_moreRisk)
MA hospital data with a lower average rate of at-risk babies than NY hospital (dtest_MA_lessRisk)
```python
pd_dtest = dtest.toPandas()

# Set seed for reproducibility
np.random.seed(23464)

# Sample 1000 random indices from the test dataset without replacement
sample_indices = np.random.choice(pd_dtest.index, size=1000, replace=False)

# Separate the selected observations from testing data
separated = pd_dtest.loc[sample_indices]

# Remove the selected observations from the testing data
# Consider this as data from NY hospitals
pd_dtest_NY = pd_dtest.drop(sample_indices)

# Split the separated sample into at-risk and not-at-risk groups
at_risk_sample = separated[separated["atRisk"] == 1]      # Only at-risk cases
not_at_risk_sample = separated[separated["atRisk"] == 0]  # Only not-at-risk cases

# Create test sets for MA hospitals with different at-risk average rates compared to NY
pd_dtest_MA_moreRisk = pd.concat([pd_dtest_NY, at_risk_sample])      # Adds back only at-risk cases
pd_dtest_MA_lessRisk = pd.concat([pd_dtest_NY, not_at_risk_sample])  # Adds back only not-at-risk cases

# Show counts to verify results
print("Original Test Set Size:", pd_dtest.shape[0])
print("Sampled Separated Size:", separated.shape[0])
print("NY Hospitals Data Size:", pd_dtest_NY.shape[0])
print("MA More Risk Data Size:", pd_dtest_MA_moreRisk.shape[0])
print("MA Less Risk Data Size:", pd_dtest_MA_lessRisk.shape[0])

dtest_MA_moreRisk = spark.createDataFrame(pd_dtest_MA_moreRisk)
dtest_MA_lessRisk = spark.createDataFrame(pd_dtest_MA_lessRisk)
```
For MA hospitals, compute the percentage change in each of the following metrics relative to NY hospitals:
Average Rate
Accuracy
Precision
Recall
False Negative Rate
Specificity
False Positive Rate
Enrichment
AUC
Which metric varies the most with the average rate? Why?
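A sketch of the percentage-change comparison, wrapping the Question 5 and Question 7 calculations in a hypothetical helper `classifier_metrics` and computing (MA − NY) / NY × 100 for each metric; it assumes `model`, `assembler`, and `evaluator` from the earlier sketches.

```python
def classifier_metrics(sdf, threshold=0.02):
    # Score a Spark test set and compute the metrics from Questions 5 and 7.
    pred = model.transform(assembler.transform(sdf))
    p = pred.select("atRisk", "prediction").toPandas()
    yhat = (p["prediction"] >= threshold).astype(int)
    TP = int(((yhat == 1) & (p.atRisk == 1)).sum())
    FP = int(((yhat == 1) & (p.atRisk == 0)).sum())
    TN = int(((yhat == 0) & (p.atRisk == 0)).sum())
    FN = int(((yhat == 0) & (p.atRisk == 1)).sum())
    n = TP + FP + TN + FN
    avg_rate = (TP + FN) / n
    precision = TP / (TP + FP)
    return pd.Series({
        "Average Rate": avg_rate,
        "Accuracy": (TP + TN) / n,
        "Precision": precision,
        "Recall": TP / (TP + FN),
        "False Negative Rate": FN / (TP + FN),
        "Specificity": TN / (TN + FP),
        "False Positive Rate": FP / (TN + FP),
        "Enrichment": precision / avg_rate,
        "AUC": evaluator.evaluate(pred),
    })

m_NY   = classifier_metrics(spark.createDataFrame(pd_dtest_NY))
m_more = classifier_metrics(dtest_MA_moreRisk)
m_less = classifier_metrics(dtest_MA_lessRisk)

# Percentage change of each metric relative to NY hospitals.
pct_change = pd.DataFrame({
    "MA moreRisk vs NY (%)": (m_more - m_NY) / m_NY * 100,
    "MA lessRisk vs NY (%)": (m_less - m_NY) / m_NY * 100,
})
print(pct_change.round(2))
```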