import pandas as pd
import seaborn as sns
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import os


print(os.getcwd())
path = "F:\[Personal]\Data Analytics Portfolio\Churn Analysis\churn"
os.chdir(path)
print(os.getcwd())

C:\Users\Tyler
F:\[Personal]\Data Analytics Portfolio\Churn Analysis\churn


# disable warnings
import warnings
warnings.filterwarnings('ignore')


bank = pd.read_csv(r'F:\[Personal]\Data Analytics Portfolio\Churn Analysis\churn\bank.csv')


print(bank.head())
print(bank.dtypes)
bank.shape

   CustomerId   Surname  CreditScore Geography  Gender  Age  Tenure  \
0    15634602  Hargrave          619    France  Female   42       2   
1    15647311      Hill          608     Spain  Female   41       1   
2    15619304      Onio          502    France  Female   42       8   
3    15701354      Boni          699    France  Female   39       1   
4    15737888  Mitchell          850     Spain  Female   43       2   

     Balance  NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  \
0       0.00              1          1               1        101348.88   
1   83807.86              1          0               1        112542.58   
2  159660.80              3          1               0        113931.57   
3       0.00              2          0               0         93826.63   
4  125510.82              1          1               1         79084.10   

   Exited  
0       1  
1       0  
2       1  
3       0  
4       0  
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

(10000, 13)


bank.isna().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1015.8+ KB


bank.corr()


bank.columns

Index(['CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age',
       'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary', 'Exited'],
      dtype='object')


dataset = bank.drop(['CustomerId', 'Surname'], axis = 1)


dataset = dataset.drop(['Geography', 'Gender'], axis = 1)


Geography = pd.get_dummies(bank.Geography).iloc[:,1:]
Gender = pd.get_dummies(bank.Gender).iloc[:,1:]


dataset = pd.concat([dataset, Geography, Gender], axis = 1)


dataset.head()


X = dataset.drop(['Exited'], axis = 1)
y = dataset['Exited']


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Define the logistic regression model
logreg = LogisticRegression(fit_intercept=False, solver='liblinear')

# Define the grid of hyperparameters to search over
param_grid = {'C': np.logspace(-5, 5, 11)}

# Define the scorer to use for evaluating the models
scorer = make_scorer(f1_score)

# Perform a grid search over the hyperparameters using cross-validation
grid_search = GridSearchCV(logreg, param_grid=param_grid, cv=5, scoring=scorer)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'C': 0.001}
Best score: 0.07083600332803872


from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(fit_intercept = False, C = 0.001, solver = 'liblinear')
logreg.fit(X_train, y_train)

LogisticRegression(C=0.001, fit_intercept=False, solver='liblinear')

LogisticRegression(C=0.001, fit_intercept=False, solver='liblinear')


y_pred = logreg.predict(X_train)
y_hat_test = logreg.predict(X_test)


residuals = np.abs(y_train, y_pred)
print(pd.Series(residuals).value_counts())
print(pd.Series(residuals).value_counts(normalize = True))

0    6356
1    1644
Name: Exited, dtype: int64
0    0.7945
1    0.2055
Name: Exited, dtype: float64


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

precision_train = precision_score(y_train, y_pred)
precision_test = precision_score(y_test, y_hat_test)

recall_train = recall_score(y_train, y_pred)
recall_test = recall_score(y_test, y_hat_test)

accuracy_train = accuracy_score(y_train, y_pred)
accuracy_test = accuracy_score(y_test, y_hat_test)

f1_train = f1_score(y_train, y_pred)
f1_test = f1_score(y_test, y_hat_test)


print(precision_train)
print(precision_test)

print(recall_train)
print(recall_test)

print(accuracy_train)
print(accuracy_test)

print(f1_train)
print(f1_test)

1.0
0.47540983606557374
1.0
0.0737913486005089
1.0
0.802
1.0
0.1277533039647577


logreg.predict_proba(X_test)[:,1]

array([0.22214244, 0.20307728, 0.24135997, ..., 0.15939585, 0.09307679,
       0.25307851])


y_pred_prob = logreg.predict_proba(X_test)[:,1]


from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0,1], [0,1], 'k--')
plt.show()


from sklearn.metrics import roc_auc_score

auc = roc_auc_score(y_test, y_pred_prob)
print("AUC: {:.3f}".format(auc))

AUC: 0.675


from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 200, random_state = 42)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)


from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1607
           1       0.78      0.48      0.59       393

    accuracy                           0.87      2000
   macro avg       0.83      0.72      0.76      2000
weighted avg       0.86      0.87      0.86      2000

0.8705


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create a Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define the grid search parameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a grid search object
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

# Print the best parameters found by the grid search
print(grid_search.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
{'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}


classifier = RandomForestClassifier(n_estimators = 300, max_depth = 30, min_samples_leaf = 1, min_samples_split = 10, random_state = 42)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)


print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.47      0.58       393

    accuracy                           0.86      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.86      0.86      0.85      2000

0.865


# predict probabilities of churn for new customers
y_prob = classifier.predict_proba(X_test)

# extract the probabilities for class 1 (churn) only
churn_probabilities = y_prob[:,1]

# predict churn for a new customer
new_customer_data = [2, 0, 35, 2, 50000, 2, 1, 1, 1, 2, 50000]
churn_probability = classifier.predict_proba([new_customer_data])[0, 1]
if churn_probability >= 0.5:
    print("New customer is likely to churn with probability", churn_probability)
else:
    print("New customer is unlikely to churn with probability", 1 - churn_probability)

New customer is likely to churn with probability 0.6174396429450608


feature_importances = pd.Series(classifier.feature_importances_, index = X.columns)
feature_importances.nlargest(10).plot(kind = 'barh')

<Axes: >

	CustomerId	CreditScore	Age	Tenure	Balance	NumOfProducts	HasCrCard	IsActiveMember	EstimatedSalary	Exited
CustomerId	1.000000	0.005308	0.009497	-0.014883	-0.012419	0.016972	-0.014025	0.001665	0.015271	-0.006248
CreditScore	0.005308	1.000000	-0.003965	0.000842	0.006268	0.012238	-0.005458	0.025651	-0.001384	-0.027094
Age	0.009497	-0.003965	1.000000	-0.009997	0.028308	-0.030680	-0.011721	0.085472	-0.007201	0.285323
Tenure	-0.014883	0.000842	-0.009997	1.000000	-0.012254	0.013444	0.022583	-0.028362	0.007784	-0.014001
Balance	-0.012419	0.006268	0.028308	-0.012254	1.000000	-0.304180	-0.014858	-0.010084	0.012797	0.118533
NumOfProducts	0.016972	0.012238	-0.030680	0.013444	-0.304180	1.000000	0.003183	0.009612	0.014204	-0.047820
HasCrCard	-0.014025	-0.005458	-0.011721	0.022583	-0.014858	0.003183	1.000000	-0.011866	-0.009933	-0.007138
IsActiveMember	0.001665	0.025651	0.085472	-0.028362	-0.010084	0.009612	-0.011866	1.000000	-0.011421	-0.156128
EstimatedSalary	0.015271	-0.001384	-0.007201	0.007784	0.012797	0.014204	-0.009933	-0.011421	1.000000	0.012097
Exited	-0.006248	-0.027094	0.285323	-0.014001	0.118533	-0.047820	-0.007138	-0.156128	0.012097	1.000000

Churn Analysis Using Banking Dataset¶

Comments and Introductory Remarks¶

Importing Packages, Setting up Environment, and Exploring Data¶

Converting Categorical Columns to Numeric Columns¶

Data Preprocessing¶

Logistic Regression¶

Random Forests¶

Visualizing Top 10 Features for Predicting Churn¶

Which Model Performed Better?¶

	CreditScore	Age	Tenure	Balance	NumOfProducts	HasCrCard	IsActiveMember	EstimatedSalary	Exited	Spain
0	619	42	2	0.00	1	1	1	101348.88	1	0
1	608	41	1	83807.86	1	0	1	112542.58	0	1
2	502	42	8	159660.80	3	1	0	113931.57	1	0
3	699	39	1	0.00	2	0	0	93826.63	0	0
4	850	43	2	125510.82	1	1	1	79084.10	0	1