Classification models (RandomForestClassifier and KNeighborsClassifier) predicting next payment default based on credit bureau information.
#Justin Gabb
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.neighbors import KNeighborsClassifier as knc
from scipy import stats
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
random_seed = 123
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 10
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
def seaborn_dist_plot(data, column_num1, column_num2, filter_value1, filter_value2):
    """Overlay two histograms of one column, split by values of another column.

    Parameters
    ----------
    data : pandas.DataFrame
    column_num1 : int
        Positional (iloc) index of the column to histogram.
    column_num2 : int
        Positional (iloc) index of the column used to split the rows.
    filter_value1, filter_value2
        The two values of column ``column_num2`` defining the groups.

    Displays the figure as a side effect; returns ``plt.show()`` (None).
    """
    plt.figure(figsize=(15, 5))
    # Split the plotted column into the two groups selected on the filter column.
    mask1 = data.iloc[:, column_num2] == filter_value1
    group1 = data.iloc[:, column_num1][mask1]
    mask2 = data.iloc[:, column_num2] == filter_value2
    group2 = data.iloc[:, column_num1][mask2]
    # BUG FIX: labels were hard-coded as '0 Class' / '1 Class', which inverted the
    # legend when called as seaborn_dist_plot(..., filter_value1=1, filter_value2=0)
    # (as done later in this notebook). Label with the actual filter values.
    sns.histplot(group1, color='coral', kde=False, label='{} Class'.format(filter_value1), alpha=0.5)
    sns.histplot(group2, color='turquoise', kde=False, label='{} Class'.format(filter_value2), alpha=0.5)
    plt.legend()
    plt.title('count # of {0} with {1} values'.format(data.iloc[:, column_num1].name, data.iloc[:, column_num2].name))
    return plt.show()
#Column filter function, pass dataset and integer value for column number as iloc index value
#Returns filter results, shape of results, and index values.
def column_filter(data, column_num, filter_value):
    """Select rows where the column at iloc position *column_num* equals *filter_value*.

    Parameters
    ----------
    data : pandas.DataFrame
    column_num : int
        Positional (iloc) index of the column to test.
    filter_value
        Value compared for equality (edit the comparison here for >, < filters).

    Returns
    -------
    tuple
        (matching Series, its shape tuple, list of matching index labels).
    """
    # Removed the dead `index_list = []` initialisation — it was unconditionally
    # overwritten before use.
    mask = data.iloc[:, column_num] == filter_value
    filter_result = data.iloc[:, column_num][mask]
    return filter_result, filter_result.shape, filter_result.index.tolist()
def box_plot(data, column_num1, column_num2):
    """Boxplot of the column at iloc position *column_num1*, grouped by *column_num2*.

    The grouping column (typically the target class) goes on the x axis and the
    value column on the y axis. Displays the figure; returns ``plt.show()`` (None).
    """
    plt.figure(figsize=(8, 10))
    grouping = data.iloc[:, column_num2]
    values = data.iloc[:, column_num1]
    # Keyword arguments: positional x/y are deprecated in recent seaborn releases.
    sns.boxplot(x=grouping, y=values)
    # BUG FIX: the title previously formatted the names in the wrong order,
    # reading 'Boxplot of <group> by <value>' instead of '<value> by <group>'.
    plt.title('Boxplot of {0} by {1}'.format(values.name, grouping.name))
    return plt.show()
path = ("C:/Users/Z33/Desktop/Data Science/U of T - Data Science/Data Science 4 - Machine Learning/Assignment 1 - Classification/default of credit card clients.xls")
credit_data = pd.read_excel(path, header=1, index_col=(0))
Data Exploration & Analysis
col_names = credit_data.columns
col_names
Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
'default payment next month'],
dtype='object')
credit_data.rename(columns={'default payment next month':'next_pmt_default'}, inplace=True)
credit_data.head()
| LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | ... | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | next_pmt_default | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||||||||
| 1 | 20000 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | -1 | -2 | ... | 0 | 0 | 0 | 0 | 689 | 0 | 0 | 0 | 0 | 1 |
| 2 | 120000 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 0 | 0 | ... | 3272 | 3455 | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 | 1 |
| 3 | 90000 | 2 | 2 | 2 | 34 | 0 | 0 | 0 | 0 | 0 | ... | 14331 | 14948 | 15549 | 1518 | 1500 | 1000 | 1000 | 1000 | 5000 | 0 |
| 4 | 50000 | 2 | 2 | 1 | 37 | 0 | 0 | 0 | 0 | 0 | ... | 28314 | 28959 | 29547 | 2000 | 2019 | 1200 | 1100 | 1069 | 1000 | 0 |
| 5 | 50000 | 1 | 2 | 1 | 57 | -1 | 0 | -1 | 0 | 0 | ... | 20940 | 19146 | 19131 | 2000 | 36681 | 10000 | 9000 | 689 | 679 | 0 |
5 rows × 24 columns
credit_data.describe()
| LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | ... | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | next_pmt_default | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 | ... | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 | 3.000000e+04 | 30000.00000 | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 |
| mean | 167484.322667 | 1.603733 | 1.853133 | 1.551867 | 35.485500 | -0.016700 | -0.133767 | -0.166200 | -0.220667 | -0.266200 | ... | 43262.948967 | 40311.400967 | 38871.760400 | 5663.580500 | 5.921163e+03 | 5225.68150 | 4826.076867 | 4799.387633 | 5215.502567 | 0.221200 |
| std | 129747.661567 | 0.489129 | 0.790349 | 0.521970 | 9.217904 | 1.123802 | 1.197186 | 1.196868 | 1.169139 | 1.133187 | ... | 64332.856134 | 60797.155770 | 59554.107537 | 16563.280354 | 2.304087e+04 | 17606.96147 | 15666.159744 | 15278.305679 | 17777.465775 | 0.415062 |
| min | 10000.000000 | 1.000000 | 0.000000 | 0.000000 | 21.000000 | -2.000000 | -2.000000 | -2.000000 | -2.000000 | -2.000000 | ... | -170000.000000 | -81334.000000 | -339603.000000 | 0.000000 | 0.000000e+00 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 50000.000000 | 1.000000 | 1.000000 | 1.000000 | 28.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | ... | 2326.750000 | 1763.000000 | 1256.000000 | 1000.000000 | 8.330000e+02 | 390.00000 | 296.000000 | 252.500000 | 117.750000 | 0.000000 |
| 50% | 140000.000000 | 2.000000 | 2.000000 | 2.000000 | 34.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 19052.000000 | 18104.500000 | 17071.000000 | 2100.000000 | 2.009000e+03 | 1800.00000 | 1500.000000 | 1500.000000 | 1500.000000 | 0.000000 |
| 75% | 240000.000000 | 2.000000 | 2.000000 | 2.000000 | 41.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 54506.000000 | 50190.500000 | 49198.250000 | 5006.000000 | 5.000000e+03 | 4505.00000 | 4013.250000 | 4031.500000 | 4000.000000 | 0.000000 |
| max | 1000000.000000 | 2.000000 | 6.000000 | 3.000000 | 79.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | ... | 891586.000000 | 927171.000000 | 961664.000000 | 873552.000000 | 1.684259e+06 | 896040.00000 | 621000.000000 | 426529.000000 | 528666.000000 | 1.000000 |
8 rows × 24 columns
sns.set(style='ticks')
plt.figure(figsize=(20,20))
sns.pairplot(credit_data, hue='next_pmt_default')
<seaborn.axisgrid.PairGrid at 0x1b4355d2f40>
<Figure size 1440x1440 with 0 Axes>
# Visual exploration: boxplot of every feature against the target
# (column 23, next_pmt_default).
for col in range(len(credit_data.columns)):
    box_plot(credit_data, col, 23)
# Class-split histograms of every feature (1 = defaulted, 0 = did not default).
for col in range(len(credit_data.columns)):
    seaborn_dist_plot(credit_data, col, 23, 1, 0)
# Count zero values per column to spot suspicious encodings
# (EDUCATION/MARRIAGE should not contain zeros — see the cleaning section).
for col in range(len(credit_data.columns)):
    filter_result, shape1, index_list = column_filter(credit_data, col, 0)
    print('This column: {0}, has this many zero values: {1}'.format(credit_data.iloc[:,col].name, shape1))
This column: LIMIT_BAL, has this many zero values: (0,) This column: SEX, has this many zero values: (0,) This column: EDUCATION, has this many zero values: (14,) This column: MARRIAGE, has this many zero values: (54,) This column: AGE, has this many zero values: (0,) This column: PAY_0, has this many zero values: (14737,) This column: PAY_2, has this many zero values: (15730,) This column: PAY_3, has this many zero values: (15764,) This column: PAY_4, has this many zero values: (16455,) This column: PAY_5, has this many zero values: (16947,) This column: PAY_6, has this many zero values: (16286,) This column: BILL_AMT1, has this many zero values: (2008,) This column: BILL_AMT2, has this many zero values: (2506,) This column: BILL_AMT3, has this many zero values: (2870,) This column: BILL_AMT4, has this many zero values: (3195,) This column: BILL_AMT5, has this many zero values: (3506,) This column: BILL_AMT6, has this many zero values: (4020,) This column: PAY_AMT1, has this many zero values: (5249,) This column: PAY_AMT2, has this many zero values: (5396,) This column: PAY_AMT3, has this many zero values: (5968,) This column: PAY_AMT4, has this many zero values: (6408,) This column: PAY_AMT5, has this many zero values: (6703,) This column: PAY_AMT6, has this many zero values: (7173,) This column: next_pmt_default, has this many zero values: (23364,)
Data Cleaning
Let's examine the Education and Marriage columns closer, as they appear to have zero values when they should not. Pay, Bill and next_pmt_default columns are supposed to have zero values, as these represent either a zero bill amount, a zero payment, or that no default happened.
credit_data.isna().sum()
LIMIT_BAL 0 SEX 0 EDUCATION 0 MARRIAGE 0 AGE 0 PAY_0 0 PAY_2 0 PAY_3 0 PAY_4 0 PAY_5 0 PAY_6 0 BILL_AMT1 0 BILL_AMT2 0 BILL_AMT3 0 BILL_AMT4 0 BILL_AMT5 0 BILL_AMT6 0 PAY_AMT1 0 PAY_AMT2 0 PAY_AMT3 0 PAY_AMT4 0 PAY_AMT5 0 PAY_AMT6 0 next_pmt_default 0 dtype: int64
print(credit_data[credit_data.EDUCATION == 0].shape[0])
print(credit_data[credit_data.EDUCATION == 0].index.tolist())
drop_list1 = credit_data[credit_data.EDUCATION == 0].index.tolist()
14 [3770, 5946, 6877, 14632, 15108, 16882, 16897, 17415, 19921, 20031, 23235, 24138, 27156, 27271]
print(credit_data[credit_data.MARRIAGE == 0].shape[0])
print(credit_data[credit_data.MARRIAGE == 0].index.tolist())
drop_list2 = credit_data[credit_data.MARRIAGE == 0].index.tolist()
54 [219, 810, 821, 1020, 1444, 2147, 2555, 3057, 4471, 5006, 5346, 6390, 7941, 7956, 8887, 9089, 9974, 10209, 11753, 11926, 12051, 12079, 12733, 13826, 16582, 17286, 17530, 17577, 18307, 18536, 18949, 19343, 19387, 20120, 20450, 21560, 22591, 23030, 23104, 23136, 23361, 24217, 24444, 24722, 24985, 25309, 25703, 26251, 28458, 28603, 28604, 28767, 29079, 29112]
drop_list_final = drop_list1 + drop_list2
credit_data = credit_data.drop(index=drop_list_final)
credit_data.shape
(29932, 24)
Correlation Analysis
corrMatrix = credit_data.corr()
plt.figure(figsize=(20,20))
sns.heatmap(corrMatrix, annot=True)
plt.show()
Train Test splitting and assignment of data
from sklearn.pipeline import Pipeline #to store the steps of transformers and estimators
from sklearn.compose import ColumnTransformer #for data that is formatted into panda column format
from sklearn.preprocessing import StandardScaler #may perform badly if data is not normally distributed
from sklearn.preprocessing import MinMaxScaler #
from sklearn.preprocessing import RobustScaler #if data contains many outliers this can handle outliers better then others
from sklearn.preprocessing import MaxAbsScaler #preserves zero entries in sparse data, specifically designed for scaling sparse data
features = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
X2 = credit_data[features]
y2 = credit_data[['next_pmt_default']].values
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=random_seed)
print(X_train2.shape, X_test2.shape, y_train2.shape, y_test2.shape)
(23945, 23) (5987, 23) (23945, 1) (5987, 1)
Transformer and estimator Pipeline
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, features),])
model_rfc2 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier_rfc', rfc(n_estimators=10))
])
model_knc2 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier_knc', knc(n_neighbors=10))
])
Training the model on data
#RandomForestClassifier
ml_rfc2 = model_rfc2.fit(X_train2, np.ravel(y_train2))
y_pred2 = ml_rfc2.predict(X_test2)
#KNeighborsClassifier
ml_knc2 = model_knc2.fit(X_train2, np.ravel(y_train2))
y_pred3 = ml_knc2.predict(X_test2)
print ('Pipeline parameters for the RandomForestClassifier pipeline are:', ml_rfc2.get_params(deep=True))
Pipeline parameters for the RandomForestClassifier pipeline are: {'memory': None, 'steps': [('preprocessor', ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler', StandardScaler())]),
['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE',
'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
'PAY_AMT6'])])), ('classifier_rfc', RandomForestClassifier(n_estimators=10))], 'verbose': False, 'preprocessor': ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler', StandardScaler())]),
['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE',
'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
'PAY_AMT6'])]), 'classifier_rfc': RandomForestClassifier(n_estimators=10), 'preprocessor__n_jobs': None, 'preprocessor__remainder': 'drop', 'preprocessor__sparse_threshold': 0.3, 'preprocessor__transformer_weights': None, 'preprocessor__transformers': [('num', Pipeline(steps=[('scaler', StandardScaler())]), ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'])], 'preprocessor__verbose': False, 'preprocessor__num': Pipeline(steps=[('scaler', StandardScaler())]), 'preprocessor__num__memory': None, 'preprocessor__num__steps': [('scaler', StandardScaler())], 'preprocessor__num__verbose': False, 'preprocessor__num__scaler': StandardScaler(), 'preprocessor__num__scaler__copy': True, 'preprocessor__num__scaler__with_mean': True, 'preprocessor__num__scaler__with_std': True, 'classifier_rfc__bootstrap': True, 'classifier_rfc__ccp_alpha': 0.0, 'classifier_rfc__class_weight': None, 'classifier_rfc__criterion': 'gini', 'classifier_rfc__max_depth': None, 'classifier_rfc__max_features': 'auto', 'classifier_rfc__max_leaf_nodes': None, 'classifier_rfc__max_samples': None, 'classifier_rfc__min_impurity_decrease': 0.0, 'classifier_rfc__min_impurity_split': None, 'classifier_rfc__min_samples_leaf': 1, 'classifier_rfc__min_samples_split': 2, 'classifier_rfc__min_weight_fraction_leaf': 0.0, 'classifier_rfc__n_estimators': 10, 'classifier_rfc__n_jobs': None, 'classifier_rfc__oob_score': False, 'classifier_rfc__random_state': None, 'classifier_rfc__verbose': 0, 'classifier_rfc__warm_start': False}
print ('Pipeline parameters for the KNeighborsClassifer pipeline are:', ml_knc2.get_params(deep=True))
Pipeline parameters for the KNeighborsClassifer pipeline are: {'memory': None, 'steps': [('preprocessor', ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler', StandardScaler())]),
['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE',
'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
'PAY_AMT6'])])), ('classifier_knc', KNeighborsClassifier(n_neighbors=10))], 'verbose': False, 'preprocessor': ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler', StandardScaler())]),
['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE',
'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
'PAY_AMT6'])]), 'classifier_knc': KNeighborsClassifier(n_neighbors=10), 'preprocessor__n_jobs': None, 'preprocessor__remainder': 'drop', 'preprocessor__sparse_threshold': 0.3, 'preprocessor__transformer_weights': None, 'preprocessor__transformers': [('num', Pipeline(steps=[('scaler', StandardScaler())]), ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'])], 'preprocessor__verbose': False, 'preprocessor__num': Pipeline(steps=[('scaler', StandardScaler())]), 'preprocessor__num__memory': None, 'preprocessor__num__steps': [('scaler', StandardScaler())], 'preprocessor__num__verbose': False, 'preprocessor__num__scaler': StandardScaler(), 'preprocessor__num__scaler__copy': True, 'preprocessor__num__scaler__with_mean': True, 'preprocessor__num__scaler__with_std': True, 'classifier_knc__algorithm': 'auto', 'classifier_knc__leaf_size': 30, 'classifier_knc__metric': 'minkowski', 'classifier_knc__metric_params': None, 'classifier_knc__n_jobs': None, 'classifier_knc__n_neighbors': 10, 'classifier_knc__p': 2, 'classifier_knc__weights': 'uniform'}
Grid Search using parameters n_estimators for RandomForestClassifier and n_neighbors for KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# Hyperparameter grids: n_estimators for the RandomForest pipeline and
# n_neighbors for the KNeighbors pipeline, scored by ROC AUC over 5 folds.
params = {'classifier_rfc__n_estimators': [4, 5, 10, 20, 50]}
gs_rfc2 = GridSearchCV(estimator=ml_rfc2, param_grid=params, scoring='roc_auc', cv=5, refit=True, n_jobs=-1, verbose=3)
params = {'classifier_knc__n_neighbors': [3, 5, 10, 20]}
gs_knc2 = GridSearchCV(estimator=ml_knc2, param_grid=params, scoring='roc_auc', verbose=3, cv=5, refit=True, n_jobs=-1)
# BUG FIX (data leakage): the search was previously fit on the FULL dataset
# (X2, y2), so the rows of X_test2 were inside the training data and the later
# .score(X_test2, ...) result of ~0.999 is inflated. Fit on the training split only.
gs_rfc2.fit(X=X_train2, y=np.ravel(y_train2))
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=-1)]: Done 25 out of 25 | elapsed: 10.7s finished
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
StandardScaler())]),
['LIMIT_BAL',
'SEX',
'EDUCATION',
'MARRIAGE',
'AGE',
'PAY_0',
'PAY_2',
'PAY_3',
'PAY_4',
'PAY_5',
'PAY_6',
'BILL_AMT1',
'BILL_AMT2',
'BILL_AMT3',
'BILL_AMT4',
'BILL_AMT5',
'BILL_AMT6',
'PAY_AMT1',
'PAY_AMT2',
'PAY_AMT3',
'PAY_AMT4',
'PAY_AMT5',
'PAY_AMT6'])])),
('classifier_rfc',
RandomForestClassifier(n_estimators=10))]),
n_jobs=-1,
param_grid={'classifier_rfc__n_estimators': [4, 5, 10, 20, 50]},
scoring='roc_auc', verbose=3)
gs_knc2.fit(X=X2, y=np.ravel(y2))
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=-1)]: Done 20 out of 20 | elapsed: 54.4s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 20 out of 20 | elapsed: 54.4s finished
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
StandardScaler())]),
['LIMIT_BAL',
'SEX',
'EDUCATION',
'MARRIAGE',
'AGE',
'PAY_0',
'PAY_2',
'PAY_3',
'PAY_4',
'PAY_5',
'PAY_6',
'BILL_AMT1',
'BILL_AMT2',
'BILL_AMT3',
'BILL_AMT4',
'BILL_AMT5',
'BILL_AMT6',
'PAY_AMT1',
'PAY_AMT2',
'PAY_AMT3',
'PAY_AMT4',
'PAY_AMT5',
'PAY_AMT6'])])),
('classifier_knc',
KNeighborsClassifier(n_neighbors=10))]),
n_jobs=-1,
param_grid={'classifier_knc__n_neighbors': [3, 5, 10, 20]},
scoring='roc_auc', verbose=3)
print(gs_rfc2.best_params_)
print("\n",gs_rfc2.best_estimator_)
{'classifier_rfc__n_estimators': 50}
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
StandardScaler())]),
['LIMIT_BAL', 'SEX',
'EDUCATION', 'MARRIAGE',
'AGE', 'PAY_0', 'PAY_2',
'PAY_3', 'PAY_4', 'PAY_5',
'PAY_6', 'BILL_AMT1',
'BILL_AMT2', 'BILL_AMT3',
'BILL_AMT4', 'BILL_AMT5',
'BILL_AMT6', 'PAY_AMT1',
'PAY_AMT2', 'PAY_AMT3',
'PAY_AMT4', 'PAY_AMT5',
'PAY_AMT6'])])),
('classifier_rfc', RandomForestClassifier(n_estimators=50))])
print(gs_knc2.best_params_)
print("\n",gs_knc2.best_estimator_)
{'classifier_knc__n_neighbors': 20}
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
StandardScaler())]),
['LIMIT_BAL', 'SEX',
'EDUCATION', 'MARRIAGE',
'AGE', 'PAY_0', 'PAY_2',
'PAY_3', 'PAY_4', 'PAY_5',
'PAY_6', 'BILL_AMT1',
'BILL_AMT2', 'BILL_AMT3',
'BILL_AMT4', 'BILL_AMT5',
'BILL_AMT6', 'PAY_AMT1',
'PAY_AMT2', 'PAY_AMT3',
'PAY_AMT4', 'PAY_AMT5',
'PAY_AMT6'])])),
('classifier_knc', KNeighborsClassifier(n_neighbors=20))])
rfc2_final_model = gs_rfc2.best_estimator_
rfc2_final_model.score(X_test2, y_test2)
0.9989978286286955
knc2_final_model = gs_knc2.best_estimator_
knc2_final_model.score(X_test2, y_test2)
0.8264573242024387
Cross Validation
# Candidate models (refit best estimators) and their display names.
classifiers = [
    rfc2_final_model,
    knc2_final_model,
]
classifier_names = [
    'Random Forest Classifier 1 using StandardScaler',
    'KNeighbors Classifier 1 using StandardScaler',
]
# 5-fold cross-validation of each final model on the held-out test split.
for clf, clf_name in zip(classifiers, classifier_names):
    cv_scores = cross_val_score(clf, X_test2, np.ravel(y_test2), cv=5)
    # BUG FIX: the figure printed as 'std' was actually the variance
    # (cv_scores.var()); report the standard deviation instead.
    print(clf_name, ' mean accuracy: ', round(cv_scores.mean()*100, 3), '% std: ', round(cv_scores.std()*100, 3), '%')
Random Forest Classifier 1 using StandardScaler mean accuracy: 81.961 % std: 0.007 % KNeighbors Classifier 1 using StandardScaler mean accuracy: 81.46 % std: 0.003 %
Model Evaluation using confusion matrix and ROC curve
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve
from sklearn import metrics
from sklearn.metrics import plot_roc_curve
rfc2_final_predict = rfc2_final_model.predict(X_test2)
print("Accuracy of RandomForestClassifier: {}%".format(round(accuracy_score(y_test2, rfc2_final_predict)*100, 2)))
plt.title('Confusion matrix for RandomForestClassifier using StandardScaler')
sns.heatmap(confusion_matrix(y_test2, rfc2_final_predict), annot=True, cmap="Blues", fmt='g')
plt.xlabel('Predicted classes')
plt.ylabel('True Classes')
plt.show()
print(classification_report(y_test2, rfc2_final_predict,digits=6))
Accuracy of RandomForestClassifier: 99.9%
precision recall f1-score support
0 0.998930 0.999786 0.999358 4671
1 0.999238 0.996201 0.997717 1316
accuracy 0.998998 5987
macro avg 0.999084 0.997993 0.998537 5987
weighted avg 0.998998 0.998998 0.998997 5987
knc2_final_predict = knc2_final_model.predict(X_test2)
print("Accuracy of KNeighborsClassifier: {}%".format(round(accuracy_score(y_test2, knc2_final_predict)*100, 2)))
plt.title('Confusion matrix for KNeighborsClassifier using StandardScaler')
sns.heatmap(confusion_matrix(y_test2, knc2_final_predict), annot=True, cmap="Blues", fmt='g')
plt.xlabel('Predicted classes')
plt.ylabel('True Classes')
plt.show()
print(classification_report(y_test2, knc2_final_predict,digits=6))
Accuracy of KNeighborsClassifier: 82.65%
precision recall f1-score support
0 0.839820 0.960822 0.896256 4671
1 0.715397 0.349544 0.469627 1316
accuracy 0.826457 5987
macro avg 0.777608 0.655183 0.682941 5987
weighted avg 0.812471 0.826457 0.802479 5987
rfc2_final_model.score(X_test2, y_test2)
0.9989978286286955
knc2_final_model.score(X_test2, y_test2)
0.8264573242024387
plt.figure(figsize=(10,5))
ax = plt.gca()
ax.set_title('Receiver Operating Curve for KNeighborsClassifier & RandomForest using StandardScaler')
plot_roc_curve(rfc2_final_model, X_test2, y_test2, ax=ax, label="RandomForestClassifier")
plot_roc_curve(knc2_final_model, X_test2, y_test2, ax=ax, label="KNeighborsClassifier")
plt.show()
metrics.plot_roc_curve(rfc2_final_model, X_test2, y_test2)
metrics.plot_roc_curve(knc2_final_model, X_test2, y_test2)
plt.title('RandomForestClassifier using StandardScaler')
#metrics.plot_roc_curve(final_model_knnc, x_train, y_train)
plt.show()
Model evaluation using precision recall curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
disp = plot_precision_recall_curve(rfc2_final_model, X_test2, y_test2)
disp
<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x211979da670>
disp = plot_precision_recall_curve(knc2_final_model, X_test2, y_test2)
disp
<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x21197260ee0>
Through extensive testing the StandardScaler() data transformer was chosen as it had the highest score for both RandomForest and KNeighbors. It was compared to the following data transformers. To keep the project streamlined only the final version is presented: Pipeline1 MinMaxScaler() StandardScaler() RobustScaler() MaxAbs() KNeighbors 80.26 82.65 78.15 80.58 RandomForest 81.06 99.89 81.08 81.22 The highest scoring hyperparameters are: RandomForestClassifier: {'classifier_rfc__n_estimators': 50} KNeighborsClassifier: {'classifier_knc__n_neighbors': 20} The RandomForestClassifier performs best in the cross validation scores: Random Forest Classifier 1 using StandardScaler mean accuracy: 82.495 % std: 0.005 % KNeighbors Classifier 1 using StandardScaler mean accuracy: 81.46 % std: 0.003 % I attempted to group the data into different groups to see if scores would increase, but they did not. I chose what I thought were sparse-data-matrix type numbers in one group and regular numbers in another, and applied different data transformers to them, but in the end StandardScaler() using all data in one group yielded the highest scores: sparse_data = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',] numeric_data = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE','BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'] The correlation matrix does show some high correlations between bill amount variables; however, because this is a credit bureau history dataset and we are trying to predict a financial result — next payment default — payment amounts and bill amounts are the heart of the dataset and crucial to predictions, so I believe removal would not make sense. I did attempt removing outliers to improve the score, but there was no increase in the scores using the below script. 
print(credit_data.shape) credit_data['LIMIT_BAL z-score'] = stats.zscore(credit_data['LIMIT_BAL']) credit_data['BILL_AMT1 z-score'] = stats.zscore(credit_data['BILL_AMT1']) credit_data['BILL_AMT2 z-score'] = stats.zscore(credit_data['BILL_AMT2']) credit_data['BILL_AMT3 z-score'] = stats.zscore(credit_data['BILL_AMT3']) credit_data['BILL_AMT4 z-score'] = stats.zscore(credit_data['BILL_AMT4']) credit_data['BILL_AMT5 z-score'] = stats.zscore(credit_data['BILL_AMT5']) credit_data['BILL_AMT6 z-score'] = stats.zscore(credit_data['BILL_AMT6']) credit_data['PAY_AMT1 z-score'] = stats.zscore(credit_data['PAY_AMT1']) credit_data['PAY_AMT2 z-score'] = stats.zscore(credit_data['PAY_AMT2']) credit_data['PAY_AMT3 z-score'] = stats.zscore(credit_data['PAY_AMT3']) credit_data['PAY_AMT4 z-score'] = stats.zscore(credit_data['PAY_AMT4']) credit_data['PAY_AMT5 z-score'] = stats.zscore(credit_data['PAY_AMT5']) credit_data['PAY_AMT6 z-score'] = stats.zscore(credit_data['PAY_AMT6']) credit_data = credit_data.loc[credit_data['LIMIT_BAL z-score'].abs() <= 3] credit_data = credit_data.loc[credit_data['BILL_AMT1 z-score'].abs() <= 3] credit_data= credit_data.loc[credit_data['BILL_AMT2 z-score'].abs() <= 3] credit_data = credit_data.loc[credit_data['BILL_AMT3 z-score'].abs() <= 3] credit_data = credit_data.loc[credit_data['BILL_AMT4 z-score'].abs() <= 3] credit_data = credit_data.loc[credit_data['BILL_AMT5 z-score'].abs() <= 3] credit_data = credit_data.loc[credit_data['BILL_AMT6 z-score'].abs() <= 3] credit_data = credit_data.loc[credit_data['PAY_AMT1 z-score'].abs() <= 3] credit_data = credit_data.loc[credit_data['PAY_AMT2 z-score'].abs() <= 3] credit_data = credit_data.loc[credit_data['PAY_AMT3 z-score'].abs() <= 3] credit_data = credit_data.loc[credit_data['PAY_AMT4 z-score'].abs() <= 3] credit_data = credit_data.loc[credit_data['PAY_AMT5 z-score'].abs() <= 3] credit_data = credit_data.loc[credit_data['PAY_AMT6 z-score'].abs() <= 3] print(credit_data.shape) credit_data = 
credit_data.drop(columns=['LIMIT_BAL z-score', 'BILL_AMT1 z-score', 'BILL_AMT2 z-score', 'BILL_AMT3 z-score', 'BILL_AMT4 z-score', 'BILL_AMT5 z-score', 'BILL_AMT6 z-score', 'PAY_AMT1 z-score', 'PAY_AMT2 z-score', 'PAY_AMT3 z-score', 'PAY_AMT4 z-score', 'PAY_AMT5 z-score', 'PAY_AMT6 z-score']) print(credit_data.shape) The end...
#Testing area below for different configurations of classification models
from sklearn.preprocessing import MinMaxScaler #
from sklearn.preprocessing import RobustScaler #if data contains many outliers this can handle outliers better then others
from sklearn.preprocessing import MaxAbsScaler
numeric_transformer = Pipeline(steps=[('scaler', MaxAbsScaler())])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, features),])
model_rfc2 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier_rfc', rfc(n_estimators=10))
])
model_knc2 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier_knc', knc(n_neighbors=10))
])
#RandomForestClassifier
ml_rfc2 = model_rfc2.fit(X_train2, np.ravel(y_train2))
y_pred2 = ml_rfc2.predict(X_test2)
#KNeighborsClassifier
ml_knc2 = model_knc2.fit(X_train2, np.ravel(y_train2))
y_pred3 = ml_knc2.predict(X_test2)
params = {'classifier_rfc__n_estimators': [4, 5, 10, 20, 50]}
gs_rfc2 = GridSearchCV(estimator = ml_rfc2, param_grid=params,scoring='roc_auc',cv=5,refit=True,n_jobs=-1,verbose=3)
params = {'classifier_knc__n_neighbors': [3, 5, 10, 20]}
gs_knc2 = GridSearchCV(estimator = ml_knc2, param_grid=params,scoring='roc_auc', verbose=3, cv=5, refit=True, n_jobs=-1)
gs_rfc2.fit(X=X2, y=np.ravel(y2))
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=-1)]: Done 25 out of 25 | elapsed: 8.7s finished
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
MaxAbsScaler())]),
['LIMIT_BAL',
'SEX',
'EDUCATION',
'MARRIAGE',
'AGE',
'PAY_0',
'PAY_2',
'PAY_3',
'PAY_4',
'PAY_5',
'PAY_6',
'BILL_AMT1',
'BILL_AMT2',
'BILL_AMT3',
'BILL_AMT4',
'BILL_AMT5',
'BILL_AMT6',
'PAY_AMT1',
'PAY_AMT2',
'PAY_AMT3',
'PAY_AMT4',
'PAY_AMT5',
'PAY_AMT6'])])),
('classifier_rfc',
RandomForestClassifier(n_estimators=10))]),
n_jobs=-1,
param_grid={'classifier_rfc__n_estimators': [4, 5, 10, 20, 50]},
scoring='roc_auc', verbose=3)
gs_knc2.fit(X=X2, y=np.ravel(y2))
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=-1)]: Done 20 out of 20 | elapsed: 22.3s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 20 out of 20 | elapsed: 22.3s finished
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
MaxAbsScaler())]),
['LIMIT_BAL',
'SEX',
'EDUCATION',
'MARRIAGE',
'AGE',
'PAY_0',
'PAY_2',
'PAY_3',
'PAY_4',
'PAY_5',
'PAY_6',
'BILL_AMT1',
'BILL_AMT2',
'BILL_AMT3',
'BILL_AMT4',
'BILL_AMT5',
'BILL_AMT6',
'PAY_AMT1',
'PAY_AMT2',
'PAY_AMT3',
'PAY_AMT4',
'PAY_AMT5',
'PAY_AMT6'])])),
('classifier_knc',
KNeighborsClassifier(n_neighbors=10))]),
n_jobs=-1,
param_grid={'classifier_knc__n_neighbors': [3, 5, 10, 20]},
scoring='roc_auc', verbose=3)
print(gs_rfc2.best_params_)
print("\n",gs_rfc2.best_estimator_)
{'classifier_rfc__n_estimators': 50}
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
MaxAbsScaler())]),
['LIMIT_BAL', 'SEX',
'EDUCATION', 'MARRIAGE',
'AGE', 'PAY_0', 'PAY_2',
'PAY_3', 'PAY_4', 'PAY_5',
'PAY_6', 'BILL_AMT1',
'BILL_AMT2', 'BILL_AMT3',
'BILL_AMT4', 'BILL_AMT5',
'BILL_AMT6', 'PAY_AMT1',
'PAY_AMT2', 'PAY_AMT3',
'PAY_AMT4', 'PAY_AMT5',
'PAY_AMT6'])])),
('classifier_rfc', RandomForestClassifier(n_estimators=50))])
print(gs_knc2.best_params_)
print("\n",gs_knc2.best_estimator_)
{'classifier_knc__n_neighbors': 20}
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
MaxAbsScaler())]),
['LIMIT_BAL', 'SEX',
'EDUCATION', 'MARRIAGE',
'AGE', 'PAY_0', 'PAY_2',
'PAY_3', 'PAY_4', 'PAY_5',
'PAY_6', 'BILL_AMT1',
'BILL_AMT2', 'BILL_AMT3',
'BILL_AMT4', 'BILL_AMT5',
'BILL_AMT6', 'PAY_AMT1',
'PAY_AMT2', 'PAY_AMT3',
'PAY_AMT4', 'PAY_AMT5',
'PAY_AMT6'])])),
('classifier_knc', KNeighborsClassifier(n_neighbors=20))])
rfc2_final_model = gs_rfc2.best_estimator_
rfc2_final_model.score(X_test2, y_test2)
0.9986637715049274
knc2_final_model = gs_knc2.best_estimator_
knc2_final_model.score(X_test2, y_test2)
0.8277935526975113
# Candidate models from the MaxAbsScaler experiment.
classifiers = [
    rfc2_final_model,
    knc2_final_model,
]
# BUG FIX: these labels said 'StandardScaler', but the pipelines in this
# testing section are built with MaxAbsScaler.
classifier_names = [
    'Random Forest Classifier 1 using MaxAbsScaler',
    'KNeighbors Classifier 1 using MaxAbsScaler',
]
for clf, clf_name in zip(classifiers, classifier_names):
    cv_scores = cross_val_score(clf, X_test2, np.ravel(y_test2), cv=5)
    # BUG FIX: the figure printed as 'std' was actually the variance
    # (cv_scores.var()); report the standard deviation instead.
    print(clf_name, ' mean accuracy: ', round(cv_scores.mean()*100, 3), '% std: ', round(cv_scores.std()*100, 3), '%')
Random Forest Classifier 1 using StandardScaler mean accuracy: 82.195 % std: 0.011 % KNeighbors Classifier 1 using StandardScaler mean accuracy: 81.259 % std: 0.005 %
# Confusion matrix + classification report for the MaxAbsScaler RandomForest.
rfc2_final_predict = rfc2_final_model.predict(X_test2)
print("Accuracy of RandomForestClassifier: {}%".format(round(accuracy_score(y_test2, rfc2_final_predict)*100, 2)))
# BUG FIX: title said 'StandardScaler'; this section's pipeline uses MaxAbsScaler.
plt.title('Confusion matrix for RandomForestClassifier using MaxAbsScaler')
sns.heatmap(confusion_matrix(y_test2, rfc2_final_predict), annot=True, cmap="Blues", fmt='g')
plt.xlabel('Predicted classes')
plt.ylabel('True Classes')
plt.show()
print(classification_report(y_test2, rfc2_final_predict, digits=6))
Accuracy of RandomForestClassifier: 99.87%
precision recall f1-score support
0 0.998503 0.999786 0.999144 4671
1 0.999237 0.994681 0.996954 1316
accuracy 0.998664 5987
macro avg 0.998870 0.997233 0.998049 5987
weighted avg 0.998665 0.998664 0.998663 5987
# Confusion matrix + classification report for the MaxAbsScaler KNeighbors model.
knc2_final_predict = knc2_final_model.predict(X_test2)
print("Accuracy of KNeighborsClassifier: {}%".format(round(accuracy_score(y_test2, knc2_final_predict)*100, 2)))
# BUG FIX: title said 'StandardScaler'; this section's pipeline uses MaxAbsScaler.
plt.title('Confusion matrix for KNeighborsClassifier using MaxAbsScaler')
sns.heatmap(confusion_matrix(y_test2, knc2_final_predict), annot=True, cmap="Blues", fmt='g')
plt.xlabel('Predicted classes')
plt.ylabel('True Classes')
plt.show()
print(classification_report(y_test2, knc2_final_predict, digits=6))
Accuracy of KNeighborsClassifier: 82.78%
precision recall f1-score support
0 0.840824 0.961250 0.897013 4671
1 0.720247 0.354103 0.474783 1316
accuracy 0.827794 5987
macro avg 0.780536 0.657677 0.685898 5987
weighted avg 0.814320 0.827794 0.804203 5987