Client Churn Predictor - Comparison of Classification Models

Author: Paul A. Beata
GitHub: pbeata


_The data set is available in the project repository under the "data" folder: Link to Repository_

The goal of this project is to analyze and visualize important aspects of the data using exploratory data analysis, then create various predictive models for customer churn. The main focus will be to use tree-based machine learning methods.

Part 1: The Data

The data set contains 7032 rows and 21 columns. It has already been cleaned, so there should be no missing values; our focus here is on using machine learning models to predict whether or not a customer will churn, where churn is defined as leaving or stopping the service. In this case, we are analyzing customer data from a telecommunications company.

Rows: Each row is an observation, i.e., a customer of the company.

Columns: Each column represents a different feature of the data set, and the final column is the target, i.e., whether the client "churned". There are both numerical (continuous) and categorical data types, totaling 20 features. Each column is labelled with a descriptive name that clearly describes the feature. One important column to mention is "tenure"; this refers to the number of months that the client has been with this service.

Note: The feature "SeniorCitizen" is actually a categorical feature since the values are either 1 or 0 (yes or no). We will leave it as is here since there's no need to make a dummy variable out of this feature: it is already encoded with 0 (not a senior citizen) or 1 (senior citizen).

Part 2: Exploratory Data Analysis

Check the balance of the target values ("Churn"): there are only two unique values in the Churn column, "Yes" (the customer left) and "No" (the customer stayed). For our ML models, we may need to resample in order to obtain a balanced training data set with an equal number of "Yes" and "No" churn results.
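As a minimal sketch of this balance check (using a tiny hypothetical sample in place of the real data set), `value_counts` gives both absolute counts and class fractions:

```python
import pandas as pd

# Hypothetical mini-sample standing in for the real data set;
# the real "Churn" column uses the same "Yes"/"No" labels.
df = pd.DataFrame({"Churn": ["No", "No", "No", "No", "Yes", "Yes"]})

# Absolute counts and relative frequencies of each target class
counts = df["Churn"].value_counts()
fractions = df["Churn"].value_counts(normalize=True)
print(counts)
print(fractions)
```

The `normalize=True` output makes the degree of imbalance immediately visible as a percentage of the whole data set.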

Explore the distribution of Total Charges between Churn categories with a violin plot:
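A sketch of this violin plot, using synthetic stand-in data with the project's column names (the distribution parameters below are invented for illustration only):

```python
import matplotlib
matplotlib.use("Agg")  # non-interactive backend so this runs in scripts
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Synthetic stand-in data with the project's column names
rng = np.random.default_rng(42)
df = pd.DataFrame({
    "Churn": ["No"] * 50 + ["Yes"] * 50,
    "TotalCharges": np.concatenate([
        rng.normal(2500, 600, 50),   # invented values for non-churners
        rng.normal(1200, 400, 50),   # invented values for churners
    ]),
})

# Violin plot: full distribution of Total Charges per Churn class
ax = sns.violinplot(data=df, x="Churn", y="TotalCharges")
ax.set_title("Total Charges by Churn class")
plt.tight_layout()
```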

Display the distributions of the tenure (i.e., months using the service) split by Churn class (yes or no):

Create a box plot showing the distribution of Total Charges per Contract type, also add in a hue coloring based on the Churn class (churn yes or no):
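One way to sketch this grouped box plot, again with synthetic stand-in data (the contract labels match the telecom data set's usual categories, but the values are invented):

```python
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Synthetic stand-in with the project's column names
rng = np.random.default_rng(0)
n = 120
df = pd.DataFrame({
    "Contract": rng.choice(["Month-to-month", "One year", "Two year"], n),
    "Churn": rng.choice(["No", "Yes"], n),
    "TotalCharges": rng.uniform(20, 8000, n),
})

# Box plot of Total Charges per contract type, split by the Churn hue
ax = sns.boxplot(data=df, x="Contract", y="TotalCharges", hue="Churn")
ax.legend(title="Churn", loc="upper left")
plt.tight_layout()
```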

Visualize the Features Correlated with Churn

Here we will create a bar plot showing the correlation of a subset of the features to the churn class label (i.e., features that have correlation with customer churn). For the categorical features, we will need to convert them into dummy variables first, as we can only calculate correlation for numerical features. Note that we specifically list the features below: we do not want to check the correlation for every feature in the data set as some features have too many unique instances for such an analysis (such as customerID, which has no predictive power anyway).

['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'InternetService',
 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
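The dummy-encoding and correlation step can be sketched as follows, using a tiny hypothetical sample with just two of the listed features (the real notebook would use all 16):

```python
import pandas as pd

# Hypothetical mini-sample; the real data set has all 16 categorical
# features listed above plus the "Churn" target.
df = pd.DataFrame({
    "Contract": ["Month-to-month", "Two year", "Month-to-month", "One year"],
    "PaperlessBilling": ["Yes", "No", "Yes", "No"],
    "Churn": ["Yes", "No", "Yes", "No"],
})

cat_feats = ["Contract", "PaperlessBilling"]  # subset for illustration

# Convert categoricals to dummy variables, then correlate each
# dummy column with a 0/1 encoding of the churn label
dummies = pd.get_dummies(df[cat_feats]).astype(int)
churn_numeric = (df["Churn"] == "Yes").astype(int)
corr = dummies.corrwith(churn_numeric).sort_values()
print(corr)
```

Sorting the correlations first means a horizontal bar plot of `corr` reads cleanly from the strongest negative association to the strongest positive one.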

Observations

Part 3: Cohort Analysis

This section focuses on segmenting customers into "cohorts" based on their tenure, allowing us to examine differences between the cohorts.

Histogram displaying the distribution of the 'tenure' column, which is the number of months a customer has been (or was) using the service:

Now we will create histograms separated by two additional features: Churn and Contract Type.

Scatter plot of the Total Charges incurred by each customer versus their current Monthly Charges, colored by churn class (yes or no):
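A sketch of this scatter plot on synthetic data; note that Total Charges roughly equals Monthly Charges times tenure, which is what produces the fan-shaped pattern in the real data:

```python
import matplotlib
matplotlib.use("Agg")
import numpy as np
import pandas as pd
import seaborn as sns

# Synthetic stand-in: total charges grow with tenure
rng = np.random.default_rng(7)
n = 200
monthly = rng.uniform(20, 120, n)
tenure = rng.integers(1, 73, n)
df = pd.DataFrame({
    "MonthlyCharges": monthly,
    "TotalCharges": monthly * tenure,
    "Churn": rng.choice(["No", "Yes"], n),
})

# Scatter of total vs. monthly charges, colored by churn class
ax = sns.scatterplot(data=df, x="MonthlyCharges", y="TotalCharges",
                     hue="Churn", alpha=0.5)
```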

Create Cohorts Based on Tenure

Here we convert each unique tenure length (1 month, 2 months, 3 months, ..., N months) into a simple cohort grouping: essentially we are putting each customer into a "bin" according to their tenure in months. By treating each unique tenure group as a cohort, we can calculate the Churn rate (percentage that have "yes" Churn) per cohort.
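The per-tenure churn rate described above can be computed with a groupby, sketched here on a tiny hypothetical sample:

```python
import pandas as pd

# Hypothetical mini-sample with one row per customer
df = pd.DataFrame({
    "tenure": [1, 1, 1, 2, 2, 3, 3, 3, 3],
    "Churn":  ["Yes", "Yes", "No", "Yes", "No", "No", "No", "No", "Yes"],
})

# Percentage of customers with "Yes" churn within each tenure value
churn_rate = df.groupby("tenure")["Churn"].apply(
    lambda s: 100.0 * (s == "Yes").mean()
)
print(churn_rate)
```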

Now we can plot the Churn Rate (percent) per tenure group 1-72 months:

Create Broader Cohort Groups

Based on the tenure column values, we can create a new column called "Tenure Cohort" with 4 separate categories:
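One way to bin tenure into four cohorts is `pd.cut`; the exact bin edges and labels below are an assumption for illustration and may differ from the original notebook's:

```python
import pandas as pd

df = pd.DataFrame({"tenure": [3, 12, 18, 30, 60]})

# Assumed bin edges and labels for the four cohorts (illustrative only)
bins = [0, 12, 24, 48, 72]
labels = ["0-12 Months", "12-24 Months", "24-48 Months", "Over 48 Months"]

# pd.cut uses right-closed intervals by default, so tenure 12
# falls in the (0, 12] bin
df["Tenure Cohort"] = pd.cut(df["tenure"], bins=bins, labels=labels)
print(df)
```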

Count plot showing the churn count per cohort:

Now we can create a grid of count plots showing the number of customers per Tenure Cohort, separated out by contract type and colored by the Churn hue:


Part 4: Predictive Modeling

Let's explore 4 different tree-based methods to try to predict customer churn:

Data Preparation for ML Models

Balanced Target Labels?

In this data set, we do NOT have balanced data for the target values. To check this, we can see the number of observations in the training and testing data sets here:

Ideally, we would have a 50/50 split such that each data set contains an equal number of customers who churned and who did not. This is an issue that we will address later in the notebook. For now, let's assume it's not a problem and continue with the classification task.

UPDATE: Using Balanced Target Data

Without balancing the data, we achieve a maximum accuracy of 81% when using either a single decision tree with a max depth of 6 or AdaBoost with a max depth of 17. However, our recall results are too low, and in this application recall is important.

Now we will use a balanced data set for the machine learning models:
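One simple manual way to balance the classes, sketched on a small hypothetical sample, is to downsample the majority class to the size of the minority class and then shuffle (the source balances "manually" but does not specify the exact method, so this is one plausible approach):

```python
import pandas as pd

# Hypothetical imbalanced stand-in: 8 "No" vs 3 "Yes"
df = pd.DataFrame({
    "Churn": ["No"] * 8 + ["Yes"] * 3,
    "tenure": range(11),
})

# Downsample the majority class to the minority class size, then shuffle
yes = df[df["Churn"] == "Yes"]
no = df[df["Churn"] == "No"].sample(n=len(yes), random_state=42)
balanced = pd.concat([yes, no]).sample(frac=1, random_state=42)
balanced = balanced.reset_index(drop=True)
print(balanced["Churn"].value_counts())
```

Downsampling discards majority-class rows; upsampling the minority class (or techniques like SMOTE) would be alternatives when the data set is small.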

Do we need to scale the data as well?

We will test with and without scaling the features.

Single Decision Tree Classifier

Decision tree performance evaluation:

  1. Train a single decision tree model
  2. Evaluate its performance metrics, including the classification report and a confusion matrix plot
  3. Calculate feature importances from the decision tree

We notice here that the most important features for the single decision tree model are related to the contract type. Since we only use a max depth of 4, we can also plot the decision tree below:
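The steps above can be sketched as follows; synthetic features stand in for the prepared (dummy-encoded) churn data, and the tree is kept shallow so it can be drawn (the text mentions a max depth of 4):

```python
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Synthetic stand-in for the prepared (dummy-encoded) churn features
X, y = make_classification(n_samples=500, n_features=8, random_state=42)
feature_names = [f"feat_{i}" for i in range(8)]  # placeholder names

# Shallow tree, as in the text, so the plot stays readable
tree = DecisionTreeClassifier(max_depth=4, random_state=42)
tree.fit(X, y)

# Feature importances, sorted for a bar plot
importances = pd.Series(tree.feature_importances_, index=feature_names)
importances = importances.sort_values(ascending=False)

# With max_depth=4 the fitted tree is small enough to draw in full
fig, ax = plt.subplots(figsize=(16, 8))
plot_tree(tree, feature_names=feature_names, filled=True, ax=ax)
```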

Random Forest Classifier

Boosted Trees

AdaBoost

Gradient Boosted Classifier

Additional Classification Methods

Support Vector Machines

K-Nearest Neighbors

For the KNN method, we must also use the scaled features, as with SVM.

Since we don't know the best value for K, we will compare the results for K=1 to K=20 by looping over these values and finding the best accuracy:
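This elbow-style search over K can be sketched as a simple loop; synthetic data stands in for the scaled churn features here:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in for the churn features
X, y = make_classification(n_samples=400, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# KNN is distance-based, so fit the scaler on the training set only
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Loop over K = 1..20 and record the test accuracy for each
accuracies = {}
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_s, y_train)
    accuracies[k] = knn.score(X_test_s, y_test)

best_k = max(accuracies, key=accuracies.get)
print(f"best K = {best_k} with accuracy {accuracies[best_k]:.3f}")
```

Fitting the scaler on the training split alone avoids leaking information from the test set into the model.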

Conclusion

In this analysis, we have built several tree-based classifier models and also explored support vector machines (SVM) and K-nearest neighbors (KNN). After balancing the data manually and using a 90%/10% train-test split, we achieved the best overall performance with an AdaBoost classifier optimized via grid search. To assess each model, we recorded the accuracy and recall.

| Model Type | Recall (0) | Recall (1) | Accuracy |
| --- | --- | --- | --- |
| single decision tree (default) | 0.64 | 0.68 | 0.68 |
| single decision tree (max_depth=4) | 0.78 | 0.76 | 0.77 |
| random forest (default) | 0.81 | 0.75 | 0.78 |
| random forest (max_depth=7) | 0.78 | 0.80 | 0.79 |
| ada-boosted (default) | 0.80 | 0.80 | 0.80 |
| ada-boosted (n_estimators=16) | 0.79 | 0.82 | 0.81 |
| ada-boosted (grid search CV) | 0.80 | 0.83 | 0.82 |
| gradient-boosted (default) | 0.80 | 0.79 | 0.79 |
| support vector machines (default) | 0.76 | 0.78 | 0.77 |

Observations:

Here we can check the individual results to understand how well our model is performing:

Of the 187 customers in our balanced (test) data set who churned, we predicted 155 of them correctly: 83%. The overall accuracy was about 82% as we also correctly predicted 150 of the customers who did not churn: (150 + 155)/374 = 81.6%.

Part 5: Next Steps ...

Feature Selection and Improving the Models

Additionally, we can try the other ML models again in order to improve performance. However, this time we could include only the most important features by performing feature engineering and investigating how each aspect of the data affects customer churn.