Logistic Regression Model of next payment default based on credit bureau information.

#Justin Gabb


from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.neighbors import KNeighborsClassifier as knc
from scipy import stats

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split 

import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

random_seed = 123

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 10
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

def seaborn_dist_plot(data, column_num1, column_num2, filter_value1, filter_value2):
    plt.figure(figsize=(15,5))
    filter1 = data.iloc[:,column_num2] == filter_value1
    result1 = data.iloc[:,column_num1][filter1]
    filter2 = data.iloc[:,column_num2] == filter_value2
    result2 = data.iloc[:,column_num1][filter2]
    sns.histplot(result1, color='coral', kde=False, label='0 Class', alpha=0.5)
    sns.histplot(result2, color='turquoise', kde=False, label='1 Class', alpha=0.5)
    plt.legend()
    plt.title('count # of {0} with {1} values'.format(data.iloc[:,column_num1].name, data.iloc[:,column_num2].name))
    return plt.show()

#Column filter function, pass dataset and integer value for column number as iloc index value
#Returns filter results, shape of results, and index values. 
def column_filter(data, column_num, filter_value):
    index_list = []
    filter1 = data.iloc[:,column_num] == filter_value      #configure filter parameter using: ==, >, <
    filter_result = data.iloc[:,column_num][filter1]
    shape1 = filter_result.shape
    index_list = filter_result.index.tolist()
    return filter_result, shape1, index_list

def box_plot(data, column_num1, column_num2):
    plt.figure(figsize=(8, 10))
    x = data.iloc[:,column_num2]
    y = data.iloc[:,column_num1]
    sns.boxplot(x, y)
    plt.title('Boxplot of {0} by {1}'.format(x.name, y.name))
    return plt.show()


path = ("C:/Users/Z33/Desktop/Data Science/U of T - Data Science/Data Science 4 - Machine Learning/Assignment 1 - Classification/default of credit card clients.xls")
credit_data = pd.read_excel(path, header=1, index_col=(0))


col_names = credit_data.columns
col_names

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')


credit_data.rename(columns={'default payment next month':'next_pmt_default'}, inplace=True)


credit_data.head()


credit_data.describe()


sns.set(style='ticks')
plt.figure(figsize=(20,20))
sns.pairplot(credit_data, hue='next_pmt_default')

<seaborn.axisgrid.PairGrid at 0x1b4355d2f40>

<Figure size 1440x1440 with 0 Axes>


for col in range(len(credit_data.columns)):
    box_plot(credit_data, col, 23)


for col in range(len(credit_data.columns)):
    seaborn_dist_plot(credit_data, col, 23, 1, 0)


for col in range(len(credit_data.columns)):
    filter_result, shape1, index_list = column_filter(credit_data, col, 0)
    print('This column: {0}, has this many zero values: {1}'.format(credit_data.iloc[:,col].name, shape1))

This column: LIMIT_BAL, has this many zero values: (0,)
This column: SEX, has this many zero values: (0,)
This column: EDUCATION, has this many zero values: (14,)
This column: MARRIAGE, has this many zero values: (54,)
This column: AGE, has this many zero values: (0,)
This column: PAY_0, has this many zero values: (14737,)
This column: PAY_2, has this many zero values: (15730,)
This column: PAY_3, has this many zero values: (15764,)
This column: PAY_4, has this many zero values: (16455,)
This column: PAY_5, has this many zero values: (16947,)
This column: PAY_6, has this many zero values: (16286,)
This column: BILL_AMT1, has this many zero values: (2008,)
This column: BILL_AMT2, has this many zero values: (2506,)
This column: BILL_AMT3, has this many zero values: (2870,)
This column: BILL_AMT4, has this many zero values: (3195,)
This column: BILL_AMT5, has this many zero values: (3506,)
This column: BILL_AMT6, has this many zero values: (4020,)
This column: PAY_AMT1, has this many zero values: (5249,)
This column: PAY_AMT2, has this many zero values: (5396,)
This column: PAY_AMT3, has this many zero values: (5968,)
This column: PAY_AMT4, has this many zero values: (6408,)
This column: PAY_AMT5, has this many zero values: (6703,)
This column: PAY_AMT6, has this many zero values: (7173,)
This column: next_pmt_default, has this many zero values: (23364,)


credit_data.isna().sum()

LIMIT_BAL           0
SEX                 0
EDUCATION           0
MARRIAGE            0
AGE                 0
PAY_0               0
PAY_2               0
PAY_3               0
PAY_4               0
PAY_5               0
PAY_6               0
BILL_AMT1           0
BILL_AMT2           0
BILL_AMT3           0
BILL_AMT4           0
BILL_AMT5           0
BILL_AMT6           0
PAY_AMT1            0
PAY_AMT2            0
PAY_AMT3            0
PAY_AMT4            0
PAY_AMT5            0
PAY_AMT6            0
next_pmt_default    0
dtype: int64


print(credit_data[credit_data.EDUCATION == 0].shape[0])
print(credit_data[credit_data.EDUCATION == 0].index.tolist())
drop_list1 = credit_data[credit_data.EDUCATION == 0].index.tolist()

14
[3770, 5946, 6877, 14632, 15108, 16882, 16897, 17415, 19921, 20031, 23235, 24138, 27156, 27271]


print(credit_data[credit_data.MARRIAGE == 0].shape[0])
print(credit_data[credit_data.MARRIAGE == 0].index.tolist())
drop_list2 = credit_data[credit_data.MARRIAGE == 0].index.tolist()

54
[219, 810, 821, 1020, 1444, 2147, 2555, 3057, 4471, 5006, 5346, 6390, 7941, 7956, 8887, 9089, 9974, 10209, 11753, 11926, 12051, 12079, 12733, 13826, 16582, 17286, 17530, 17577, 18307, 18536, 18949, 19343, 19387, 20120, 20450, 21560, 22591, 23030, 23104, 23136, 23361, 24217, 24444, 24722, 24985, 25309, 25703, 26251, 28458, 28603, 28604, 28767, 29079, 29112]


drop_list_final = drop_list1 + drop_list2

credit_data = credit_data.drop(index=drop_list_final)


credit_data.shape

(29932, 24)


corrMatrix = credit_data.corr()


plt.figure(figsize=(20,20))
sns.heatmap(corrMatrix, annot=True)
plt.show()


from sklearn.pipeline import Pipeline            #to store the steps of transformers and estimators
from sklearn.compose import ColumnTransformer    #for data that is formatted into panda column format
from sklearn.preprocessing import StandardScaler #may perform badly if data is not normally distributed
from sklearn.preprocessing import MinMaxScaler   #
from sklearn.preprocessing import RobustScaler   #if data contains many outliers this can handle outliers better then others
from sklearn.preprocessing import MaxAbsScaler   #preserves zero entries in sparse data, specifically designed for scaling sparse data


features = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']


X2 = credit_data[features]

y2 = credit_data[['next_pmt_default']].values

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=random_seed)

print(X_train2.shape, X_test2.shape, y_train2.shape, y_test2.shape)

(23945, 23) (5987, 23) (23945, 1) (5987, 1)


numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, features),])

model_rfc2 = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier_rfc', rfc(n_estimators=10))
                            ])
model_knc2 = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier_knc', knc(n_neighbors=10))
                            ])


#RandomForestClassifier 

ml_rfc2 = model_rfc2.fit(X_train2, np.ravel(y_train2))
y_pred2 = ml_rfc2.predict(X_test2)


#KNeighborsClassifier

ml_knc2 = model_knc2.fit(X_train2, np.ravel(y_train2))
y_pred3 = ml_knc2.predict(X_test2)


print ('Pipeline parameters for the RandomForestClassifier pipeline are:', ml_rfc2.get_params(deep=True))

Pipeline parameters for the RandomForestClassifier pipeline are: {'memory': None, 'steps': [('preprocessor', ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE',
                                  'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
                                  'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
                                  'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
                                  'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
                                  'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
                                  'PAY_AMT6'])])), ('classifier_rfc', RandomForestClassifier(n_estimators=10))], 'verbose': False, 'preprocessor': ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE',
                                  'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
                                  'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
                                  'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
                                  'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
                                  'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
                                  'PAY_AMT6'])]), 'classifier_rfc': RandomForestClassifier(n_estimators=10), 'preprocessor__n_jobs': None, 'preprocessor__remainder': 'drop', 'preprocessor__sparse_threshold': 0.3, 'preprocessor__transformer_weights': None, 'preprocessor__transformers': [('num', Pipeline(steps=[('scaler', StandardScaler())]), ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'])], 'preprocessor__verbose': False, 'preprocessor__num': Pipeline(steps=[('scaler', StandardScaler())]), 'preprocessor__num__memory': None, 'preprocessor__num__steps': [('scaler', StandardScaler())], 'preprocessor__num__verbose': False, 'preprocessor__num__scaler': StandardScaler(), 'preprocessor__num__scaler__copy': True, 'preprocessor__num__scaler__with_mean': True, 'preprocessor__num__scaler__with_std': True, 'classifier_rfc__bootstrap': True, 'classifier_rfc__ccp_alpha': 0.0, 'classifier_rfc__class_weight': None, 'classifier_rfc__criterion': 'gini', 'classifier_rfc__max_depth': None, 'classifier_rfc__max_features': 'auto', 'classifier_rfc__max_leaf_nodes': None, 'classifier_rfc__max_samples': None, 'classifier_rfc__min_impurity_decrease': 0.0, 'classifier_rfc__min_impurity_split': None, 'classifier_rfc__min_samples_leaf': 1, 'classifier_rfc__min_samples_split': 2, 'classifier_rfc__min_weight_fraction_leaf': 0.0, 'classifier_rfc__n_estimators': 10, 'classifier_rfc__n_jobs': None, 'classifier_rfc__oob_score': False, 'classifier_rfc__random_state': None, 'classifier_rfc__verbose': 0, 'classifier_rfc__warm_start': False}


print ('Pipeline parameters for the KNeighborsClassifer pipeline are:', ml_knc2.get_params(deep=True))

Pipeline parameters for the KNeighborsClassifer pipeline are: {'memory': None, 'steps': [('preprocessor', ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE',
                                  'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
                                  'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
                                  'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
                                  'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
                                  'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
                                  'PAY_AMT6'])])), ('classifier_knc', KNeighborsClassifier(n_neighbors=10))], 'verbose': False, 'preprocessor': ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE',
                                  'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
                                  'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
                                  'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
                                  'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
                                  'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
                                  'PAY_AMT6'])]), 'classifier_knc': KNeighborsClassifier(n_neighbors=10), 'preprocessor__n_jobs': None, 'preprocessor__remainder': 'drop', 'preprocessor__sparse_threshold': 0.3, 'preprocessor__transformer_weights': None, 'preprocessor__transformers': [('num', Pipeline(steps=[('scaler', StandardScaler())]), ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'])], 'preprocessor__verbose': False, 'preprocessor__num': Pipeline(steps=[('scaler', StandardScaler())]), 'preprocessor__num__memory': None, 'preprocessor__num__steps': [('scaler', StandardScaler())], 'preprocessor__num__verbose': False, 'preprocessor__num__scaler': StandardScaler(), 'preprocessor__num__scaler__copy': True, 'preprocessor__num__scaler__with_mean': True, 'preprocessor__num__scaler__with_std': True, 'classifier_knc__algorithm': 'auto', 'classifier_knc__leaf_size': 30, 'classifier_knc__metric': 'minkowski', 'classifier_knc__metric_params': None, 'classifier_knc__n_jobs': None, 'classifier_knc__n_neighbors': 10, 'classifier_knc__p': 2, 'classifier_knc__weights': 'uniform'}


from sklearn.model_selection import GridSearchCV


params = {'classifier_rfc__n_estimators': [4, 5, 10, 20, 50]}

gs_rfc2 = GridSearchCV(estimator = ml_rfc2, param_grid=params,scoring='roc_auc',cv=5,refit=True,n_jobs=-1,verbose=3)


params = {'classifier_knc__n_neighbors': [3, 5, 10, 20]}

gs_knc2 = GridSearchCV(estimator = ml_knc2, param_grid=params,scoring='roc_auc', verbose=3, cv=5, refit=True, n_jobs=-1)


gs_rfc2.fit(X=X2, y=np.ravel(y2))

Fitting 5 folds for each of 5 candidates, totalling 25 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   10.7s finished

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         ['LIMIT_BAL',
                                                                          'SEX',
                                                                          'EDUCATION',
                                                                          'MARRIAGE',
                                                                          'AGE',
                                                                          'PAY_0',
                                                                          'PAY_2',
                                                                          'PAY_3',
                                                                          'PAY_4',
                                                                          'PAY_5',
                                                                          'PAY_6',
                                                                          'BILL_AMT1',
                                                                          'BILL_AMT2',
                                                                          'BILL_AMT3',
                                                                          'BILL_AMT4',
                                                                          'BILL_AMT5',
                                                                          'BILL_AMT6',
                                                                          'PAY_AMT1',
                                                                          'PAY_AMT2',
                                                                          'PAY_AMT3',
                                                                          'PAY_AMT4',
                                                                          'PAY_AMT5',
                                                                          'PAY_AMT6'])])),
                                       ('classifier_rfc',
                                        RandomForestClassifier(n_estimators=10))]),
             n_jobs=-1,
             param_grid={'classifier_rfc__n_estimators': [4, 5, 10, 20, 50]},
             scoring='roc_auc', verbose=3)


gs_knc2.fit(X=X2, y=np.ravel(y2))

Fitting 5 folds for each of 4 candidates, totalling 20 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   54.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   54.4s finished

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         ['LIMIT_BAL',
                                                                          'SEX',
                                                                          'EDUCATION',
                                                                          'MARRIAGE',
                                                                          'AGE',
                                                                          'PAY_0',
                                                                          'PAY_2',
                                                                          'PAY_3',
                                                                          'PAY_4',
                                                                          'PAY_5',
                                                                          'PAY_6',
                                                                          'BILL_AMT1',
                                                                          'BILL_AMT2',
                                                                          'BILL_AMT3',
                                                                          'BILL_AMT4',
                                                                          'BILL_AMT5',
                                                                          'BILL_AMT6',
                                                                          'PAY_AMT1',
                                                                          'PAY_AMT2',
                                                                          'PAY_AMT3',
                                                                          'PAY_AMT4',
                                                                          'PAY_AMT5',
                                                                          'PAY_AMT6'])])),
                                       ('classifier_knc',
                                        KNeighborsClassifier(n_neighbors=10))]),
             n_jobs=-1,
             param_grid={'classifier_knc__n_neighbors': [3, 5, 10, 20]},
             scoring='roc_auc', verbose=3)


print(gs_rfc2.best_params_)
print("\n",gs_rfc2.best_estimator_)

{'classifier_rfc__n_estimators': 50}

 Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['LIMIT_BAL', 'SEX',
                                                   'EDUCATION', 'MARRIAGE',
                                                   'AGE', 'PAY_0', 'PAY_2',
                                                   'PAY_3', 'PAY_4', 'PAY_5',
                                                   'PAY_6', 'BILL_AMT1',
                                                   'BILL_AMT2', 'BILL_AMT3',
                                                   'BILL_AMT4', 'BILL_AMT5',
                                                   'BILL_AMT6', 'PAY_AMT1',
                                                   'PAY_AMT2', 'PAY_AMT3',
                                                   'PAY_AMT4', 'PAY_AMT5',
                                                   'PAY_AMT6'])])),
                ('classifier_rfc', RandomForestClassifier(n_estimators=50))])


print(gs_knc2.best_params_)
print("\n",gs_knc2.best_estimator_)

{'classifier_knc__n_neighbors': 20}

 Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['LIMIT_BAL', 'SEX',
                                                   'EDUCATION', 'MARRIAGE',
                                                   'AGE', 'PAY_0', 'PAY_2',
                                                   'PAY_3', 'PAY_4', 'PAY_5',
                                                   'PAY_6', 'BILL_AMT1',
                                                   'BILL_AMT2', 'BILL_AMT3',
                                                   'BILL_AMT4', 'BILL_AMT5',
                                                   'BILL_AMT6', 'PAY_AMT1',
                                                   'PAY_AMT2', 'PAY_AMT3',
                                                   'PAY_AMT4', 'PAY_AMT5',
                                                   'PAY_AMT6'])])),
                ('classifier_knc', KNeighborsClassifier(n_neighbors=20))])


rfc2_final_model = gs_rfc2.best_estimator_
rfc2_final_model.score(X_test2, y_test2)

0.9989978286286955


knc2_final_model = gs_knc2.best_estimator_
knc2_final_model.score(X_test2, y_test2)

0.8264573242024387


Cross Validation


classifiers = [
    rfc2_final_model,
    knc2_final_model,
]
classifier_names = [
    'Random Forest Classifier 1 using StandardScaler',
    'KNeighbors Classifier 1 using StandardScaler',
]
for clf, clf_name in zip(classifiers, classifier_names):
    cv_scores = cross_val_score(clf, X_test2, np.ravel(y_test2), cv=5)
    
    print(clf_name, ' mean accuracy: ', round(cv_scores.mean()*100, 3), '% std: ', round(cv_scores.var()*100, 3),'%')

Random Forest Classifier 1 using StandardScaler  mean accuracy:  81.961 % std:  0.007 %
KNeighbors Classifier 1 using StandardScaler  mean accuracy:  81.46 % std:  0.003 %


from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve
from sklearn import metrics
from sklearn.metrics import plot_roc_curve


rfc2_final_predict = rfc2_final_model.predict(X_test2)

print("Accuracy of RandomForestClassifier: {}%".format(round(accuracy_score(y_test2, rfc2_final_predict)*100, 2)))
plt.title('Confusion matrix for RandomForestClassifier using StandardScaler')
sns.heatmap(confusion_matrix(y_test2, rfc2_final_predict), annot=True, cmap="Blues", fmt='g')
plt.xlabel('Predicted classes')
plt.ylabel('True Classes')
plt.show()
print(classification_report(y_test2, rfc2_final_predict,digits=6))

Accuracy of RandomForestClassifier: 99.9%

              precision    recall  f1-score   support

           0   0.998930  0.999786  0.999358      4671
           1   0.999238  0.996201  0.997717      1316

    accuracy                       0.998998      5987
   macro avg   0.999084  0.997993  0.998537      5987
weighted avg   0.998998  0.998998  0.998997      5987


knc2_final_predict = knc2_final_model.predict(X_test2)

print("Accuracy of KNeighborsClassifier: {}%".format(round(accuracy_score(y_test2, knc2_final_predict)*100, 2)))
plt.title('Confusion matrix for KNeighborsClassifier using StandardScaler')
sns.heatmap(confusion_matrix(y_test2, knc2_final_predict), annot=True, cmap="Blues", fmt='g')
plt.xlabel('Predicted classes')
plt.ylabel('True Classes')
plt.show()
print(classification_report(y_test2, knc2_final_predict,digits=6))

Accuracy of KNeighborsClassifier: 82.65%

              precision    recall  f1-score   support

           0   0.839820  0.960822  0.896256      4671
           1   0.715397  0.349544  0.469627      1316

    accuracy                       0.826457      5987
   macro avg   0.777608  0.655183  0.682941      5987
weighted avg   0.812471  0.826457  0.802479      5987


rfc2_final_model.score(X_test2, y_test2)

0.9989978286286955


knc2_final_model.score(X_test2, y_test2)

0.8264573242024387


plt.figure(figsize=(10,5))
ax = plt.gca()
ax.set_title('Receiver Operating Curve for KNeighborsClassifier & RandomForest using StandardScaler')
plot_roc_curve(rfc2_final_model, X_test2, y_test2, ax=ax, label="RandomForestClassifier")
plot_roc_curve(knc2_final_model, X_test2, y_test2, ax=ax, label="KNeighborsClassifier")
plt.show()


metrics.plot_roc_curve(rfc2_final_model, X_test2, y_test2)
metrics.plot_roc_curve(knc2_final_model, X_test2, y_test2)
plt.title('RandomForestClassifier using StandardScaler')
#metrics.plot_roc_curve(final_model_knnc, x_train, y_train)
plt.show()


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve


disp = plot_precision_recall_curve(rfc2_final_model, X_test2, y_test2)
disp

<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x211979da670>


disp = plot_precision_recall_curve(knc2_final_model, X_test2, y_test2)
disp

<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x21197260ee0>


#Testing area below for differant configurations of classification models


from sklearn.preprocessing import MinMaxScaler   #
from sklearn.preprocessing import RobustScaler   #if data contains many outliers this can handle outliers better then others
from sklearn.preprocessing import MaxAbsScaler


numeric_transformer = Pipeline(steps=[('scaler', MaxAbsScaler())])

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, features),])

model_rfc2 = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier_rfc', rfc(n_estimators=10))
                            ])
model_knc2 = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier_knc', knc(n_neighbors=10))
                            ])


#RandomForestClassifier 

ml_rfc2 = model_rfc2.fit(X_train2, np.ravel(y_train2))
y_pred2 = ml_rfc2.predict(X_test2)


#KNeighborsClassifier

ml_knc2 = model_knc2.fit(X_train2, np.ravel(y_train2))
y_pred3 = ml_knc2.predict(X_test2)


params = {'classifier_rfc__n_estimators': [4, 5, 10, 20, 50]}

gs_rfc2 = GridSearchCV(estimator = ml_rfc2, param_grid=params,scoring='roc_auc',cv=5,refit=True,n_jobs=-1,verbose=3)


params = {'classifier_knc__n_neighbors': [3, 5, 10, 20]}

gs_knc2 = GridSearchCV(estimator = ml_knc2, param_grid=params,scoring='roc_auc', verbose=3, cv=5, refit=True, n_jobs=-1)


gs_rfc2.fit(X=X2, y=np.ravel(y2))

Fitting 5 folds for each of 5 candidates, totalling 25 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    8.7s finished

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          MaxAbsScaler())]),
                                                                         ['LIMIT_BAL',
                                                                          'SEX',
                                                                          'EDUCATION',
                                                                          'MARRIAGE',
                                                                          'AGE',
                                                                          'PAY_0',
                                                                          'PAY_2',
                                                                          'PAY_3',
                                                                          'PAY_4',
                                                                          'PAY_5',
                                                                          'PAY_6',
                                                                          'BILL_AMT1',
                                                                          'BILL_AMT2',
                                                                          'BILL_AMT3',
                                                                          'BILL_AMT4',
                                                                          'BILL_AMT5',
                                                                          'BILL_AMT6',
                                                                          'PAY_AMT1',
                                                                          'PAY_AMT2',
                                                                          'PAY_AMT3',
                                                                          'PAY_AMT4',
                                                                          'PAY_AMT5',
                                                                          'PAY_AMT6'])])),
                                       ('classifier_rfc',
                                        RandomForestClassifier(n_estimators=10))]),
             n_jobs=-1,
             param_grid={'classifier_rfc__n_estimators': [4, 5, 10, 20, 50]},
             scoring='roc_auc', verbose=3)


gs_knc2.fit(X=X2, y=np.ravel(y2))

Fitting 5 folds for each of 4 candidates, totalling 20 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   22.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   22.3s finished

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          MaxAbsScaler())]),
                                                                         ['LIMIT_BAL',
                                                                          'SEX',
                                                                          'EDUCATION',
                                                                          'MARRIAGE',
                                                                          'AGE',
                                                                          'PAY_0',
                                                                          'PAY_2',
                                                                          'PAY_3',
                                                                          'PAY_4',
                                                                          'PAY_5',
                                                                          'PAY_6',
                                                                          'BILL_AMT1',
                                                                          'BILL_AMT2',
                                                                          'BILL_AMT3',
                                                                          'BILL_AMT4',
                                                                          'BILL_AMT5',
                                                                          'BILL_AMT6',
                                                                          'PAY_AMT1',
                                                                          'PAY_AMT2',
                                                                          'PAY_AMT3',
                                                                          'PAY_AMT4',
                                                                          'PAY_AMT5',
                                                                          'PAY_AMT6'])])),
                                       ('classifier_knc',
                                        KNeighborsClassifier(n_neighbors=10))]),
             n_jobs=-1,
             param_grid={'classifier_knc__n_neighbors': [3, 5, 10, 20]},
             scoring='roc_auc', verbose=3)


print(gs_rfc2.best_params_)
print("\n",gs_rfc2.best_estimator_)

{'classifier_rfc__n_estimators': 50}

 Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   MaxAbsScaler())]),
                                                  ['LIMIT_BAL', 'SEX',
                                                   'EDUCATION', 'MARRIAGE',
                                                   'AGE', 'PAY_0', 'PAY_2',
                                                   'PAY_3', 'PAY_4', 'PAY_5',
                                                   'PAY_6', 'BILL_AMT1',
                                                   'BILL_AMT2', 'BILL_AMT3',
                                                   'BILL_AMT4', 'BILL_AMT5',
                                                   'BILL_AMT6', 'PAY_AMT1',
                                                   'PAY_AMT2', 'PAY_AMT3',
                                                   'PAY_AMT4', 'PAY_AMT5',
                                                   'PAY_AMT6'])])),
                ('classifier_rfc', RandomForestClassifier(n_estimators=50))])


print(gs_knc2.best_params_)
print("\n",gs_knc2.best_estimator_)

{'classifier_knc__n_neighbors': 20}

 Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   MaxAbsScaler())]),
                                                  ['LIMIT_BAL', 'SEX',
                                                   'EDUCATION', 'MARRIAGE',
                                                   'AGE', 'PAY_0', 'PAY_2',
                                                   'PAY_3', 'PAY_4', 'PAY_5',
                                                   'PAY_6', 'BILL_AMT1',
                                                   'BILL_AMT2', 'BILL_AMT3',
                                                   'BILL_AMT4', 'BILL_AMT5',
                                                   'BILL_AMT6', 'PAY_AMT1',
                                                   'PAY_AMT2', 'PAY_AMT3',
                                                   'PAY_AMT4', 'PAY_AMT5',
                                                   'PAY_AMT6'])])),
                ('classifier_knc', KNeighborsClassifier(n_neighbors=20))])


rfc2_final_model = gs_rfc2.best_estimator_
rfc2_final_model.score(X_test2, y_test2)

0.9986637715049274


knc2_final_model = gs_knc2.best_estimator_
knc2_final_model.score(X_test2, y_test2)

0.8277935526975113


classifiers = [
    rfc2_final_model,
    knc2_final_model,
]
classifier_names = [
    'Random Forest Classifier 1 using StandardScaler',
    'KNeighbors Classifier 1 using StandardScaler',
]
for clf, clf_name in zip(classifiers, classifier_names):
    cv_scores = cross_val_score(clf, X_test2, np.ravel(y_test2), cv=5)
    
    print(clf_name, ' mean accuracy: ', round(cv_scores.mean()*100, 3), '% std: ', round(cv_scores.var()*100, 3),'%')

Random Forest Classifier 1 using StandardScaler  mean accuracy:  82.195 % std:  0.011 %
KNeighbors Classifier 1 using StandardScaler  mean accuracy:  81.259 % std:  0.005 %


rfc2_final_predict = rfc2_final_model.predict(X_test2)

print("Accuracy of RandomForestClassifier: {}%".format(round(accuracy_score(y_test2, rfc2_final_predict)*100, 2)))
plt.title('Confusion matrix for RandomForestClassifier using StandardScaler')
sns.heatmap(confusion_matrix(y_test2, rfc2_final_predict), annot=True, cmap="Blues", fmt='g')
plt.xlabel('Predicted classes')
plt.ylabel('True Classes')
plt.show()
print(classification_report(y_test2, rfc2_final_predict,digits=6))

Accuracy of RandomForestClassifier: 99.87%

              precision    recall  f1-score   support

           0   0.998503  0.999786  0.999144      4671
           1   0.999237  0.994681  0.996954      1316

    accuracy                       0.998664      5987
   macro avg   0.998870  0.997233  0.998049      5987
weighted avg   0.998665  0.998664  0.998663      5987


knc2_final_predict = knc2_final_model.predict(X_test2)

print("Accuracy of KNeighborsClassifier: {}%".format(round(accuracy_score(y_test2, knc2_final_predict)*100, 2)))
plt.title('Confusion matrix for KNeighborsClassifier using StandardScaler')
sns.heatmap(confusion_matrix(y_test2, knc2_final_predict), annot=True, cmap="Blues", fmt='g')
plt.xlabel('Predicted classes')
plt.ylabel('True Classes')
plt.show()
print(classification_report(y_test2, knc2_final_predict,digits=6))

Accuracy of KNeighborsClassifier: 82.78%

              precision    recall  f1-score   support

           0   0.840824  0.961250  0.897013      4671
           1   0.720247  0.354103  0.474783      1316

    accuracy                       0.827794      5987
   macro avg   0.780536  0.657677  0.685898      5987
weighted avg   0.814320  0.827794  0.804203      5987

	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_0	PAY_2	PAY_3	PAY_4	PAY_5	...	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	next_pmt_default
ID
1	20000	2	2	1	24	2	2	-1	-1	-2	...	0	0	0	0	689	0	0	0	0	1
2	120000	2	2	2	26	-1	2	0	0	0	...	3272	3455	3261	0	1000	1000	1000	0	2000	1
3	90000	2	2	2	34	0	0	0	0	0	...	14331	14948	15549	1518	1500	1000	1000	1000	5000	0
4	50000	2	2	1	37	0	0	0	0	0	...	28314	28959	29547	2000	2019	1200	1100	1069	1000	0
5	50000	1	2	1	57	-1	0	-1	0	0	...	20940	19146	19131	2000	36681	10000	9000	689	679	0

	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_0	PAY_2	PAY_3	PAY_4	PAY_5	...	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	next_pmt_default
count	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	30000.000000	...	30000.000000	30000.000000	30000.000000	30000.000000	3.000000e+04	30000.00000	30000.000000	30000.000000	30000.000000	30000.000000
mean	167484.322667	1.603733	1.853133	1.551867	35.485500	-0.016700	-0.133767	-0.166200	-0.220667	-0.266200	...	43262.948967	40311.400967	38871.760400	5663.580500	5.921163e+03	5225.68150	4826.076867	4799.387633	5215.502567	0.221200
std	129747.661567	0.489129	0.790349	0.521970	9.217904	1.123802	1.197186	1.196868	1.169139	1.133187	...	64332.856134	60797.155770	59554.107537	16563.280354	2.304087e+04	17606.96147	15666.159744	15278.305679	17777.465775	0.415062
min	10000.000000	1.000000	0.000000	0.000000	21.000000	-2.000000	-2.000000	-2.000000	-2.000000	-2.000000	...	-170000.000000	-81334.000000	-339603.000000	0.000000	0.000000e+00	0.00000	0.000000	0.000000	0.000000	0.000000
25%	50000.000000	1.000000	1.000000	1.000000	28.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	...	2326.750000	1763.000000	1256.000000	1000.000000	8.330000e+02	390.00000	296.000000	252.500000	117.750000	0.000000
50%	140000.000000	2.000000	2.000000	2.000000	34.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	19052.000000	18104.500000	17071.000000	2100.000000	2.009000e+03	1800.00000	1500.000000	1500.000000	1500.000000	0.000000
75%	240000.000000	2.000000	2.000000	2.000000	41.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	54506.000000	50190.500000	49198.250000	5006.000000	5.000000e+03	4505.00000	4013.250000	4031.500000	4000.000000	0.000000
max	1000000.000000	2.000000	6.000000	3.000000	79.000000	8.000000	8.000000	8.000000	8.000000	8.000000	...	891586.000000	927171.000000	961664.000000	873552.000000	1.684259e+06	896040.00000	621000.000000	426529.000000	528666.000000	1.000000