#Third-party data handling and plotting imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
#Sklearn imports
from scipy import stats
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
#Sklearn Scoring imports
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
random_seed=123
#Rendering imports
from sklearn.tree import export_graphviz
from pydotplus import graph_from_dot_data
from IPython.display import SVG
from graphviz import Source
# Running results table: starts empty with one column per reported metric and
# is extended by logistic_scoring each time a model is evaluated.
score_keeper = pd.DataFrame(columns=['model','accuracy','precision','recall','F1', 'ROC'])
score_keeper
| model | accuracy | precision | recall | F1 | ROC |
|---|
#Function to keep a record of all the models scoring.
def logistic_scoring(y_test, y_pred, classifier_name, score_keeper):
    """Score one model's predictions and record them in the results table.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluation rows.
    y_pred : array-like
        Predicted labels for the same rows.
    classifier_name : str
        Text stored in the 'model' column. It is also the key used to decide
        whether this model has already been recorded.
    score_keeper : pandas.DataFrame
        Results table with the columns 'model', 'accuracy', 'precision',
        'recall', 'F1' and 'ROC'.

    Returns
    -------
    pandas.DataFrame
        ``score_keeper`` itself when ``classifier_name`` is already present
        in its 'model' column, otherwise a new frame with one extra row.
        The input frame is never modified in place.

    Notes
    -----
    Every metric is stored as text holding a percentage rounded to two
    decimals (for example ``'83.21'``).
    NOTE(review): ROC AUC is computed from ``y_pred`` as passed in; callers in
    this file pass hard class predictions rather than probabilities.
    """
    def _pct(metric):
        # Percentage with two decimals, kept as a string like the other cells.
        return format(round(metric(y_test, y_pred) * 100, 2))

    results = {
        'model': classifier_name,
        'accuracy': _pct(accuracy_score),
        'precision': _pct(precision_score),
        'recall': _pct(recall_score),
        'F1': _pct(f1_score),
        'ROC': _pct(roc_auc_score),
    }

    # Only the model name identifies a duplicate. Comparing metric values as
    # well would drop a new model whenever one of its scores happened to equal
    # any value already stored in the table.
    already_recorded = (
        'model' in score_keeper.columns
        and (score_keeper['model'] == classifier_name).any()
    )
    if already_recorded:
        return score_keeper

    # DataFrame.append was removed in pandas 2.0; concat works on old and new
    # pandas versions alike.
    new_row = pd.DataFrame([results])
    return pd.concat([score_keeper, new_row], ignore_index=True)
#path = ("C:/Users/Lenovo\Desktop\Data Science/U of T - Data Science\Data Science 4 - Machine Learning/Assignment 1 - Classification/default of credit card clients.xls")
# Absolute location of the credit-card default workbook on the author's
# machine; change this to wherever the .xls file lives locally.
path = ("C:/Users/Lenovo/Desktop/Data Science/U of T - Data Science/Data Science 4 - Machine Learning/Assignment 4 - Decision Trees/default of credit card clients.xls")
credit_data = pd.read_excel(path)
# The first data row is used as the column labels (LIMIT_BAL, SEX, ...).
credit_data.columns = credit_data.iloc[0]
# Remove the ID column so it is not part of the feature set.
credit_data.drop(labels='ID', inplace=True, axis=1)
# Remove the row that was just promoted to the header.
credit_data.drop(index=credit_data.index[0], axis=0,inplace=True)
# Shorter, identifier-friendly name for the target column.
credit_data.rename(columns={'default payment next month':'next_pmt_default'}, inplace=True)
credit_data.head(2)
| LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | ... | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | next_pmt_default | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 20000 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | -1 | -2 | ... | 0 | 0 | 0 | 0 | 689 | 0 | 0 | 0 | 0 | 1 |
| 2 | 120000 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 0 | 0 | ... | 3272 | 3455 | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 | 1 |
2 rows × 24 columns
credit_data = credit_data.astype(np.int64)
credit_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 30000 entries, 1 to 30000 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 LIMIT_BAL 30000 non-null int64 1 SEX 30000 non-null int64 2 EDUCATION 30000 non-null int64 3 MARRIAGE 30000 non-null int64 4 AGE 30000 non-null int64 5 PAY_0 30000 non-null int64 6 PAY_2 30000 non-null int64 7 PAY_3 30000 non-null int64 8 PAY_4 30000 non-null int64 9 PAY_5 30000 non-null int64 10 PAY_6 30000 non-null int64 11 BILL_AMT1 30000 non-null int64 12 BILL_AMT2 30000 non-null int64 13 BILL_AMT3 30000 non-null int64 14 BILL_AMT4 30000 non-null int64 15 BILL_AMT5 30000 non-null int64 16 BILL_AMT6 30000 non-null int64 17 PAY_AMT1 30000 non-null int64 18 PAY_AMT2 30000 non-null int64 19 PAY_AMT3 30000 non-null int64 20 PAY_AMT4 30000 non-null int64 21 PAY_AMT5 30000 non-null int64 22 PAY_AMT6 30000 non-null int64 23 next_pmt_default 30000 non-null int64 dtypes: int64(24) memory usage: 5.7 MB
# Call describe() so the cell shows summary statistics; without the
# parentheses it only displays the bound-method repr.
credit_data.describe()
<bound method NDFrame.describe of 0 LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 \ 1 20000 2 2 1 24 2 2 -1 -1 2 120000 2 2 2 26 -1 2 0 0 3 90000 2 2 2 34 0 0 0 0 4 50000 2 2 1 37 0 0 0 0 5 50000 1 2 1 57 -1 0 -1 0 ... ... ... ... ... ... ... ... ... ... 29996 220000 1 3 1 39 0 0 0 0 29997 150000 1 3 2 43 -1 -1 -1 -1 29998 30000 1 2 2 37 4 3 2 -1 29999 80000 1 3 1 41 1 -1 0 0 30000 50000 1 2 1 46 0 0 0 0 0 PAY_5 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 \ 1 -2 ... 0 0 0 0 689 2 0 ... 3272 3455 3261 0 1000 3 0 ... 14331 14948 15549 1518 1500 4 0 ... 28314 28959 29547 2000 2019 5 0 ... 20940 19146 19131 2000 36681 ... ... ... ... ... ... ... ... 29996 0 ... 88004 31237 15980 8500 20000 29997 0 ... 8979 5190 0 1837 3526 29998 0 ... 20878 20582 19357 0 0 29999 0 ... 52774 11855 48944 85900 3409 30000 0 ... 36535 32428 15313 2078 1800 0 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 next_pmt_default 1 0 0 0 0 1 2 1000 1000 0 2000 1 3 1000 1000 1000 5000 0 4 1200 1100 1069 1000 0 5 10000 9000 689 679 0 ... ... ... ... ... ... 29996 5003 3047 5000 1000 0 29997 8998 129 0 0 0 29998 22000 4200 2000 3100 1 29999 1178 1926 52964 1804 1 30000 1430 1000 1000 1000 1 [30000 rows x 24 columns]>
credit_data.isna().sum()
0 LIMIT_BAL 0 SEX 0 EDUCATION 0 MARRIAGE 0 AGE 0 PAY_0 0 PAY_2 0 PAY_3 0 PAY_4 0 PAY_5 0 PAY_6 0 BILL_AMT1 0 BILL_AMT2 0 BILL_AMT3 0 BILL_AMT4 0 BILL_AMT5 0 BILL_AMT6 0 PAY_AMT1 0 PAY_AMT2 0 PAY_AMT3 0 PAY_AMT4 0 PAY_AMT5 0 PAY_AMT6 0 next_pmt_default 0 dtype: int64
# Remove every row whose EDUCATION or MARRIAGE code is 0.
# The index labels are collected per column first, then dropped in one call.
education_is_zero = credit_data['EDUCATION'] == 0
marriage_is_zero = credit_data['MARRIAGE'] == 0
drop_list1 = credit_data.index[education_is_zero].tolist()
drop_list2 = credit_data.index[marriage_is_zero].tolist()
drop_list_final = [*drop_list1, *drop_list2]
credit_data = credit_data.drop(index=drop_list_final)
credit_data.shape
(29932, 24)
features = list(credit_data.iloc[:,0:23].columns.values)
features
['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
target_cols = credit_data.iloc[:, -1:].columns.values
target_cols
array(['next_pmt_default'], dtype=object)
# X keeps the feature columns as a DataFrame (the ColumnTransformer below
# selects columns by name). y is taken with double brackets, so .values is a
# 2-D (n, 1) array; that is why later fit calls flatten it with np.ravel.
X, y = credit_data[features], credit_data[['next_pmt_default']].values
# Hold out 20% of the rows for testing; random_seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
X_train
| LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | ... | BILL_AMT3 | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8848 | 200000 | 1 | 1 | 2 | 33 | -1 | -1 | -2 | -1 | -1 | ... | -60 | 1740 | 0 | 0 | 0 | 0 | 1800 | 0 | 0 | 0 |
| 22959 | 210000 | 2 | 2 | 1 | 46 | 0 | 0 | 0 | 0 | 0 | ... | 111335 | 114037 | 67224 | 69177 | 6149 | 4100 | 4547 | 2500 | 3000 | 2500 |
| 5881 | 190000 | 1 | 1 | 2 | 35 | 0 | 0 | 0 | 0 | 0 | ... | 22117 | 23433 | 22173 | 22598 | 1807 | 2000 | 2010 | 1003 | 2007 | 3019 |
| 8575 | 150000 | 2 | 1 | 1 | 32 | 1 | -2 | -2 | -2 | -2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 28250 | 150000 | 2 | 1 | 1 | 33 | 1 | -1 | -1 | -1 | 0 | ... | 0 | 863 | 863 | 0 | 353 | 0 | 863 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28702 | 140000 | 2 | 2 | 2 | 35 | 0 | 0 | 0 | 0 | 0 | ... | 84235 | 82432 | 81465 | 83160 | 4000 | 4010 | 3800 | 3200 | 3100 | 3000 |
| 17767 | 130000 | 2 | 3 | 2 | 55 | 0 | 0 | 0 | 0 | 0 | ... | 128243 | 115204 | 101158 | 95895 | 10063 | 20000 | 10009 | 10000 | 5000 | 5000 |
| 28093 | 130000 | 2 | 3 | 2 | 46 | 0 | 0 | 0 | 0 | 0 | ... | 132927 | 131866 | 130707 | 129174 | 5000 | 5000 | 4600 | 5000 | 4614 | 5139 |
| 15755 | 200000 | 2 | 3 | 2 | 25 | 0 | 0 | 0 | 0 | -1 | ... | 9853 | 5334 | 5481 | 11344 | 1343 | 5033 | 3516 | 5497 | 11378 | 12774 |
| 20009 | 200000 | 2 | 1 | 2 | 31 | 0 | 0 | 0 | 0 | 0 | ... | 104382 | 91888 | 88267 | 85471 | 2768 | 48617 | 3243 | 3000 | 3233 | 3000 |
23945 rows × 23 columns
#Imports for pipeline construction and data transformation
from sklearn.pipeline import Pipeline #to store the steps of transformers and estimators
from sklearn.compose import ColumnTransformer #for data that is formatted into panda column format
from sklearn.preprocessing import StandardScaler #may perform badly if data is not normally distributed
from sklearn.preprocessing import MinMaxScaler #
from sklearn.preprocessing import RobustScaler #if data contains many outliers this can handle outliers better then others
from sklearn.preprocessing import MaxAbsScaler #preserves zero entries in sparse data, specifically designed for scaling sparse data
# Baseline model: standard-scale every feature column, then fit a decision
# tree with default hyperparameters.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
# Apply the scaler to all columns listed in `features`.
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, features),])
# The step names below become the prefixes of the hyperparameter keys
# (e.g. 'decision_tree_classifier__max_depth') used by the searches later on.
tree_clf = Pipeline(steps=[('preprocessor', preprocessor),
('decision_tree_classifier', dtc())
])
# y_train is 2-D, so flatten it to the 1-D shape the estimator expects.
tree_clf.fit(X_train, np.ravel(y_train))
y_pred = tree_clf.predict(X_test)
# Record the untuned scores as the reference row of the results table.
score_keeper = logistic_scoring(y_test, y_pred, 'DecisionTreeClassifer before hyperparameter tuning', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
print("Accuracy of tree classifier before hyperparameter tuning: {}%".format(round(accuracy_score(y_test, y_pred)*100, 2)))
Accuracy of tree classifier before hyperparameter tuning: 73.81%
tree_clf.get_params().keys()
dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'decision_tree_classifier', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__scaler', 'preprocessor__num__scaler__copy', 'preprocessor__num__scaler__with_mean', 'preprocessor__num__scaler__with_std', 'decision_tree_classifier__ccp_alpha', 'decision_tree_classifier__class_weight', 'decision_tree_classifier__criterion', 'decision_tree_classifier__max_depth', 'decision_tree_classifier__max_features', 'decision_tree_classifier__max_leaf_nodes', 'decision_tree_classifier__min_impurity_decrease', 'decision_tree_classifier__min_impurity_split', 'decision_tree_classifier__min_samples_leaf', 'decision_tree_classifier__min_samples_split', 'decision_tree_classifier__min_weight_fraction_leaf', 'decision_tree_classifier__presort', 'decision_tree_classifier__random_state', 'decision_tree_classifier__splitter'])
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Exhaustive search space for the tree step of tree_clf:
# 6 depths x 6 leaf sizes x 6 split sizes x 2 criteria = 432 candidates.
parameters = {'decision_tree_classifier__max_depth':[2,4,6,8,10,12],
'decision_tree_classifier__min_samples_leaf':[2,4,6,8,10,12],
'decision_tree_classifier__min_samples_split':[2,4,6,8,10,12],
'decision_tree_classifier__criterion':['gini','entropy'],
}
# 10-fold cross-validation on every candidate (432 x 10 = 4320 fits), ranked
# by accuracy. refit=True retrains the best candidate on the whole training
# set; n_jobs=-1 uses all available cores.
grid_search_tree = GridSearchCV(estimator=tree_clf,
param_grid=parameters,
scoring = 'accuracy',
cv=10,
verbose=3,
refit=True,
n_jobs=-1)
grid_search_tree.fit(X=X_train,y=np.ravel(y_train))
Fitting 10 folds for each of 432 candidates, totalling 4320 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=-1)]: Done 24 tasks | elapsed: 1.1s [Parallel(n_jobs=-1)]: Done 120 tasks | elapsed: 5.5s [Parallel(n_jobs=-1)]: Done 280 tasks | elapsed: 12.1s [Parallel(n_jobs=-1)]: Done 504 tasks | elapsed: 25.9s [Parallel(n_jobs=-1)]: Done 792 tasks | elapsed: 45.8s [Parallel(n_jobs=-1)]: Done 1144 tasks | elapsed: 1.3min [Parallel(n_jobs=-1)]: Done 1560 tasks | elapsed: 2.1min [Parallel(n_jobs=-1)]: Done 2040 tasks | elapsed: 3.2min [Parallel(n_jobs=-1)]: Done 2584 tasks | elapsed: 3.9min [Parallel(n_jobs=-1)]: Done 3192 tasks | elapsed: 4.9min [Parallel(n_jobs=-1)]: Done 3864 tasks | elapsed: 6.8min [Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed: 8.4min finished
GridSearchCV(cv=10,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
StandardScaler())]),
['LIMIT_BAL',
'SEX',
'EDUCATION',
'MARRIAGE',
'AGE',
'PAY_0',
'PAY_2',
'PAY_3',
'PAY_4',
'PAY_5',
'PAY_6',
'BILL_AMT1',
'BILL_AMT2',
'BILL_AMT3',
'BILL_AMT4',
'BILL_AMT5',
'BILL_AMT6',
'PAY_AMT1',
'PAY_AMT2',
'PAY_AMT3',
'PA...
'PAY_AMT6'])])),
('decision_tree_classifier',
DecisionTreeClassifier())]),
n_jobs=-1,
param_grid={'decision_tree_classifier__criterion': ['gini',
'entropy'],
'decision_tree_classifier__max_depth': [2, 4, 6, 8, 10,
12],
'decision_tree_classifier__min_samples_leaf': [2, 4, 6,
8, 10,
12],
'decision_tree_classifier__min_samples_split': [2, 4,
6, 8,
10,
12]},
scoring='accuracy', verbose=3)grid_search_tree.best_params_
{'decision_tree_classifier__criterion': 'entropy',
'decision_tree_classifier__max_depth': 4,
'decision_tree_classifier__min_samples_leaf': 12,
'decision_tree_classifier__min_samples_split': 2}final_model_tree = grid_search_tree.best_estimator_
final_model_tree.fit(X=X_train,y=np.ravel(y_train))
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
StandardScaler())]),
['LIMIT_BAL', 'SEX',
'EDUCATION', 'MARRIAGE',
'AGE', 'PAY_0', 'PAY_2',
'PAY_3', 'PAY_4', 'PAY_5',
'PAY_6', 'BILL_AMT1',
'BILL_AMT2', 'BILL_AMT3',
'BILL_AMT4', 'BILL_AMT5',
'BILL_AMT6', 'PAY_AMT1',
'PAY_AMT2', 'PAY_AMT3',
'PAY_AMT4', 'PAY_AMT5',
'PAY_AMT6'])])),
('decision_tree_classifier',
DecisionTreeClassifier(criterion='entropy', max_depth=4,
min_samples_leaf=12))])final_model_tree.score(X_test, y_test)
0.8321362953064975
y_pred = final_model_tree.predict(X_test)
classifiers = [
final_model_tree,
]
classifier_names = [
'Decision Tree Classifier using StandardScaler',
]
# Report the mean and standard deviation of the cross-validated accuracy for
# each classifier.
# NOTE(review): this cross-validates on X_test/y_test (clones are re-fitted on
# folds of the held-out split) with 100 folds -- confirm that is intended.
for clf, clf_name in zip(classifiers, classifier_names):
    cv_scores = cross_val_score(clf, X_test, np.ravel(y_test), cv=100)
    # The printed label says "std", so report the standard deviation rather
    # than the variance.
    print(clf_name, ' mean accuracy: ', round(cv_scores.mean()*100, 3), '% std: ', round(cv_scores.std()*100, 3),'%')
Decision Tree Classifier using StandardScaler mean accuracy: 82.875 % std: 0.151 %
score_keeper = logistic_scoring(y_test, y_pred, 'DecisionTreeClassifer after hyperparameter tuning', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
| 1 | DecisionTreeClassifer after hyperparameter tuning | 83.21 | 72.31 | 38.3 | 50.07 | 67.08 |
# Evaluation report for the predictions currently held in y_pred.
print("Accuracy of DecisionTreeClassifier: {}%".format(round(accuracy_score(y_test, y_pred)*100, 2)))
# Confusion matrix drawn as an annotated heatmap; fmt='g' prints the counts
# as plain numbers instead of scientific notation.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, linewidths=1, cmap="Blues", fmt='g')
plt.xlabel('Predicted classes')
plt.ylabel('True Classes')
plt.show()
# Per-class precision / recall / F1 printed with six decimals.
print(classification_report(y_test, y_pred,digits=6))
Accuracy of DecisionTreeClassifier: 83.21%
precision recall f1-score support
0 0.846503 0.958681 0.899107 4671
1 0.723099 0.382979 0.500745 1316
accuracy 0.832136 5987
macro avg 0.784801 0.670830 0.699926 5987
weighted avg 0.819377 0.832136 0.811543 5987
from sklearn.metrics import plot_precision_recall_curve
disp = plot_precision_recall_curve(final_model_tree, X_test, y_test)
disp
<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x20f53ef1f70>
Try to build the decision tree which you built for the previous question, but this time by RandomizedSearchCV over hyper-parameters. Compare the results.
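The scores are nearly the same (83.13% accuracy with RandomizedSearchCV versus 83.21% with GridSearchCV); the major difference is the run time. RandomizedSearchCV takes on the order of ten seconds to complete (9.9 seconds when timed; 14.1 seconds in the run logged below) versus 8.4 minutes for the exhaustive GridSearchCV, because it fits only 10 sampled candidates instead of all 432. So there is a major processing-time difference for essentially the same scoring results.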
# Same search space as the exhaustive grid search, but only a random sample
# of the combinations is evaluated (RandomizedSearchCV's default n_iter=10).
param_dist = {'decision_tree_classifier__max_depth':[2,4,6,8,10,12],
'decision_tree_classifier__min_samples_leaf':[2,4,6,8,10,12],
'decision_tree_classifier__min_samples_split':[2,4,6,8,10,12],
'decision_tree_classifier__criterion':['gini','entropy']
}
# random_state pins which candidates are sampled, so the chosen parameters
# and the reported scores are reproducible between runs (the same seed is
# already used for the train/test split).
clf = RandomizedSearchCV(tree_clf, param_dist, refit=True, n_jobs=-1, verbose=3, cv=10,
                         random_state=random_seed)
clf.fit(X=X_train,y=np.ravel(y_train))
Fitting 10 folds for each of 10 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=-1)]: Done 24 tasks | elapsed: 4.3s [Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 14.1s finished
RandomizedSearchCV(cv=10,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
StandardScaler())]),
['LIMIT_BAL',
'SEX',
'EDUCATION',
'MARRIAGE',
'AGE',
'PAY_0',
'PAY_2',
'PAY_3',
'PAY_4',
'PAY_5',
'PAY_6',
'BILL_AMT1',
'BILL_AMT2',
'BILL_AMT3',
'BILL_AMT4',
'BILL_AMT5',
'BILL_AMT6',
'PAY_AMT1',
'PAY_AMT2',
'PAY_AMT...
'PAY_AMT5',
'PAY_AMT6'])])),
('decision_tree_classifier',
DecisionTreeClassifier())]),
n_jobs=-1,
param_distributions={'decision_tree_classifier__criterion': ['gini',
'entropy'],
'decision_tree_classifier__max_depth': [2,
4,
6,
8,
10,
12],
'decision_tree_classifier__min_samples_leaf': [2,
4,
6,
8,
10,
12],
'decision_tree_classifier__min_samples_split': [2,
4,
6,
8,
10,
12]},
verbose=3)clf.best_params_
{'decision_tree_classifier__min_samples_split': 6,
'decision_tree_classifier__min_samples_leaf': 4,
'decision_tree_classifier__max_depth': 4,
'decision_tree_classifier__criterion': 'entropy'}final_model_rand = clf.best_estimator_
rand_predict = final_model_rand.predict(X_test)
score_keeper = logistic_scoring(y_test, rand_predict, 'DecisionTreeClassifer using RandomGridSearchCV', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
| 1 | DecisionTreeClassifer after hyperparameter tuning | 83.21 | 72.31 | 38.3 | 50.07 | 67.08 |
| 2 | DecisionTreeClassifer using RandomGridSearchCV | 83.13 | 71.49 | 38.68 | 50.2 | 67.17 |
Part 1: Try to build the same classifier by using following ensemble models.
Part 2: For each of these models calculate accuracy and at least for two in the list below, plot the learning curves.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
credit_data.shape
(29932, 24)
X, y = credit_data[features].values, credit_data[['next_pmt_default']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
X_train
array([[200000, 1, 1, ..., 0, 0, 0],
[210000, 2, 2, ..., 2500, 3000, 2500],
[190000, 1, 1, ..., 1003, 2007, 3019],
...,
[130000, 2, 3, ..., 5000, 4614, 5139],
[200000, 2, 3, ..., 5497, 11378, 12774],
[200000, 2, 1, ..., 3000, 3233, 3000]], dtype=int64)numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
# Scaled decision tree limited to depth 10.
# The pipeline gets its own name: binding it to `dtc` would overwrite the
# DecisionTreeClassifier alias, and the later `dtc(max_depth=10)` calls would
# then fail because a Pipeline instance is not callable.
dtc_pipe = Pipeline(steps=[('scaler', numeric_transformer), ('dtc_classifier', dtc(max_depth=10))])
dtc_pipe.fit(X_train, np.ravel(y_train))
y_pred = dtc_pipe.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.8196091531651912
score_keeper = logistic_scoring(y_test, y_pred, 'DecisionTreeClassifer with StandardScaler', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
| 1 | DecisionTreeClassifer after hyperparameter tuning | 83.21 | 72.31 | 38.3 | 50.07 | 67.08 |
| 2 | DecisionTreeClassifer using RandomGridSearchCV | 83.13 | 71.49 | 38.68 | 50.2 | 67.17 |
| 3 | DecisionTreeClassifer with StandardScaler | 81.96 | 64.75 | 39.36 | 48.96 | 66.66 |
# Random forest with default hyperparameters on standard-scaled features.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
rfc = Pipeline(steps=[('scaler', numeric_transformer), ('RandomForestClassifier', RandomForestClassifier())])
# y_train is 2-D; flatten it before fitting.
rfc.fit(X_train, np.ravel(y_train))
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.8257892099549022
score_keeper = logistic_scoring(y_test, y_pred, 'RandomForestClassifer with StandardScaler', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
| 1 | DecisionTreeClassifer after hyperparameter tuning | 83.21 | 72.31 | 38.3 | 50.07 | 67.08 |
| 2 | DecisionTreeClassifer using RandomGridSearchCV | 83.13 | 71.49 | 38.68 | 50.2 | 67.17 |
| 3 | DecisionTreeClassifer with StandardScaler | 81.96 | 64.75 | 39.36 | 48.96 | 66.66 |
| 4 | RandomForestClassifer with StandardScaler | 82.58 | 67.8 | 39.51 | 49.93 | 67.11 |
# Bagging ensemble of 200 depth-10 decision trees on standard-scaled features,
# each trained on a bootstrap sample.
# NOTE(review): an integer max_samples is an absolute row count per estimator
# in scikit-learn, so every tree is trained on only 90 rows -- confirm that a
# fraction such as 0.9 was not intended.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
bag_clf = Pipeline(steps=[('scaler', numeric_transformer),
('bag_classifier', BaggingClassifier(base_estimator=dtc(max_depth=10),\
n_estimators=200, max_samples=90, bootstrap=True, n_jobs=-1))])
bag_clf.fit(X_train, np.ravel(y_train))
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.8262902956405546
score_keeper = logistic_scoring(y_test, y_pred, 'Bagging Classifier with decisionTreeClassifer StandardScaler', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
| 1 | DecisionTreeClassifer after hyperparameter tuning | 83.21 | 72.31 | 38.3 | 50.07 | 67.08 |
| 2 | DecisionTreeClassifer using RandomGridSearchCV | 83.13 | 71.49 | 38.68 | 50.2 | 67.17 |
| 3 | DecisionTreeClassifer with StandardScaler | 81.96 | 64.75 | 39.36 | 48.96 | 66.66 |
| 4 | RandomForestClassifer with StandardScaler | 82.58 | 67.8 | 39.51 | 49.93 | 67.11 |
| 5 | Bagging Classifier with decisionTreeClassifer ... | 82.63 | 71.56 | 34.8 | 46.83 | 65.45 |
# Bagging ensemble whose base learner is itself a random forest
# (200 bagged forests of 50 trees, each tree capped at 16 leaf nodes).
# NOTE(review): max_samples=90 is an absolute row count, so each forest is
# trained on only 90 rows; both the outer and inner estimators also request
# n_jobs=-1 -- confirm both settings are intended.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
bag_rfc_clf = Pipeline(steps=[('scaler_rfc', numeric_transformer),
('bag_classifier_rfc', BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1),\
n_estimators=200, max_samples=90, bootstrap=True, n_jobs=-1))])
bag_rfc_clf.fit(X_train, np.ravel(y_train))
y_pred = bag_rfc_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.8107566393853349
score_keeper = logistic_scoring(y_test, y_pred, 'Bagging Classifier with RandomForestClassifer StandardScaler', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
| 1 | DecisionTreeClassifer after hyperparameter tuning | 83.21 | 72.31 | 38.3 | 50.07 | 67.08 |
| 2 | DecisionTreeClassifer using RandomGridSearchCV | 83.13 | 71.49 | 38.68 | 50.2 | 67.17 |
| 3 | DecisionTreeClassifer with StandardScaler | 81.96 | 64.75 | 39.36 | 48.96 | 66.66 |
| 4 | RandomForestClassifer with StandardScaler | 82.58 | 67.8 | 39.51 | 49.93 | 67.11 |
| 5 | Bagging Classifier with decisionTreeClassifer ... | 82.63 | 71.56 | 34.8 | 46.83 | 65.45 |
| 6 | Bagging Classifier with RandomForestClassifer ... | 81.08 | 74.53 | 21.12 | 32.92 | 59.55 |
# AdaBoost (SAMME.R, learning rate 0.9) over up to 200 depth-10 decision trees
# on standard-scaled features.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
ada_clf = Pipeline(steps=[('scaler', numeric_transformer),
('adaboost_classifier', AdaBoostClassifier(dtc(max_depth=10),n_estimators=200, algorithm="SAMME.R", learning_rate=0.9))])
ada_clf.fit(X_train, np.ravel(y_train))
y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.8019041256054785
score_keeper = logistic_scoring(y_test, y_pred, 'Adaboost Classifier with DecisionTreeClassifier StandardScaler', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
| 1 | DecisionTreeClassifer after hyperparameter tuning | 83.21 | 72.31 | 38.3 | 50.07 | 67.08 |
| 2 | DecisionTreeClassifer using RandomGridSearchCV | 83.13 | 71.49 | 38.68 | 50.2 | 67.17 |
| 3 | DecisionTreeClassifer with StandardScaler | 81.96 | 64.75 | 39.36 | 48.96 | 66.66 |
| 4 | RandomForestClassifer with StandardScaler | 82.58 | 67.8 | 39.51 | 49.93 | 67.11 |
| 5 | Bagging Classifier with decisionTreeClassifer ... | 82.63 | 71.56 | 34.8 | 46.83 | 65.45 |
| 6 | Bagging Classifier with RandomForestClassifer ... | 81.08 | 74.53 | 21.12 | 32.92 | 59.55 |
| 7 | Adaboost Classifier with DecisionTreeClassifie... | 80.19 | 58.33 | 34.57 | 43.42 | 63.81 |
# AdaBoost (SAMME.R, learning rate 0.9) whose base learner is a random forest
# of 50 trees capped at 16 leaf nodes each; up to 200 boosting rounds.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
ada_rfc_clf = Pipeline(steps=[('scaler', numeric_transformer),('adaboost_classifier', AdaBoostClassifier(RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1),n_estimators=200, algorithm="SAMME.R", learning_rate=0.9))])
ada_rfc_clf.fit(X_train, np.ravel(y_train))
y_pred = ada_rfc_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.8229497244028728
score_keeper = logistic_scoring(y_test, y_pred, 'AdaboostClassifier with RandomForestClassifer StandardScaler', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
| 1 | DecisionTreeClassifer after hyperparameter tuning | 83.21 | 72.31 | 38.3 | 50.07 | 67.08 |
| 2 | DecisionTreeClassifer using RandomGridSearchCV | 83.13 | 71.49 | 38.68 | 50.2 | 67.17 |
| 3 | DecisionTreeClassifer with StandardScaler | 81.96 | 64.75 | 39.36 | 48.96 | 66.66 |
| 4 | RandomForestClassifer with StandardScaler | 82.58 | 67.8 | 39.51 | 49.93 | 67.11 |
| 5 | Bagging Classifier with decisionTreeClassifer ... | 82.63 | 71.56 | 34.8 | 46.83 | 65.45 |
| 6 | Bagging Classifier with RandomForestClassifer ... | 81.08 | 74.53 | 21.12 | 32.92 | 59.55 |
| 7 | Adaboost Classifier with DecisionTreeClassifie... | 80.19 | 58.33 | 34.57 | 43.42 | 63.81 |
| 8 | AdaboostClassifier with RandomForestClassifer ... | 82.29 | 67.58 | 37.39 | 48.14 | 66.17 |
# Extra-trees ensemble of 10 trees fitted directly on the unscaled features
# (no preprocessing pipeline for this model).
forest = ExtraTreesClassifier(n_estimators=10)
forest.fit(X_train, np.ravel(y_train))
y_pred = forest.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.8182729246701186
score_keeper = logistic_scoring(y_test, y_pred, 'ExtraTreesClassifier alone no transformers', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
| 1 | DecisionTreeClassifer after hyperparameter tuning | 83.21 | 72.31 | 38.3 | 50.07 | 67.08 |
| 2 | DecisionTreeClassifer using RandomGridSearchCV | 83.13 | 71.49 | 38.68 | 50.2 | 67.17 |
| 3 | DecisionTreeClassifer with StandardScaler | 81.96 | 64.75 | 39.36 | 48.96 | 66.66 |
| 4 | RandomForestClassifer with StandardScaler | 82.58 | 67.8 | 39.51 | 49.93 | 67.11 |
| 5 | Bagging Classifier with decisionTreeClassifer ... | 82.63 | 71.56 | 34.8 | 46.83 | 65.45 |
| 6 | Bagging Classifier with RandomForestClassifer ... | 81.08 | 74.53 | 21.12 | 32.92 | 59.55 |
| 7 | Adaboost Classifier with DecisionTreeClassifie... | 80.19 | 58.33 | 34.57 | 43.42 | 63.81 |
| 8 | AdaboostClassifier with RandomForestClassifer ... | 82.29 | 67.58 | 37.39 | 48.14 | 66.17 |
| 9 | ExtraTreesClassifier alone no transformers | 81.83 | 65.79 | 36.09 | 46.61 | 65.4 |
# Random forest of 50 trees, each capped at 16 leaf nodes, fitted directly on
# the unscaled features; n_jobs=-1 trains the trees on all available cores.
rnd_clf = RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, np.ravel(y_train))
y_pred = rnd_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.8259562385167863
score_keeper = logistic_scoring(y_test, y_pred, 'RandomForestClassifier alone no transformers', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
| 1 | DecisionTreeClassifer after hyperparameter tuning | 83.21 | 72.31 | 38.3 | 50.07 | 67.08 |
| 2 | DecisionTreeClassifer using RandomGridSearchCV | 83.13 | 71.49 | 38.68 | 50.2 | 67.17 |
| 3 | DecisionTreeClassifer with StandardScaler | 81.96 | 64.75 | 39.36 | 48.96 | 66.66 |
| 4 | RandomForestClassifer with StandardScaler | 82.58 | 67.8 | 39.51 | 49.93 | 67.11 |
| 5 | Bagging Classifier with decisionTreeClassifer ... | 82.63 | 71.56 | 34.8 | 46.83 | 65.45 |
| 6 | Bagging Classifier with RandomForestClassifer ... | 81.08 | 74.53 | 21.12 | 32.92 | 59.55 |
| 7 | Adaboost Classifier with DecisionTreeClassifie... | 80.19 | 58.33 | 34.57 | 43.42 | 63.81 |
| 8 | AdaboostClassifier with RandomForestClassifer ... | 82.29 | 67.58 | 37.39 | 48.14 | 66.17 |
| 9 | ExtraTreesClassifier alone no transformers | 81.83 | 65.79 | 36.09 | 46.61 | 65.4 |
| 10 | RandomForestClassifier alone no transformers | 82.6 | 73.22 | 32.83 | 45.33 | 64.72 |
# Gradient boosting on standard-scaled features: 200 stages of depth-10 trees,
# 10 candidate features per split, learning rate 1, fixed random_state=42.
# NOTE(review): learning_rate=1 combined with max_depth=10 is an aggressive
# setting -- confirm it is deliberate.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
gbc = Pipeline(steps=[('scaler', numeric_transformer), ('GradientBoostingClassifier',
GradientBoostingClassifier(n_estimators=200, learning_rate=1, max_features=10, max_depth=10, random_state=42))])
gbc.fit(X_train, np.ravel(y_train))
y_pred = gbc.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.8039084683480875
score_keeper = logistic_scoring(y_test, y_pred, 'GradientBoosting w StandardScaler', score_keeper)
score_keeper
| model | accuracy | precision | recall | F1 | ROC | |
|---|---|---|---|---|---|---|
| 0 | DecisionTreeClassifer before hyperparameter tu... | 73.81 | 40.9 | 43.01 | 41.93 | 62.75 |
| 1 | DecisionTreeClassifer after hyperparameter tuning | 83.21 | 72.31 | 38.3 | 50.07 | 67.08 |
| 2 | DecisionTreeClassifer using RandomGridSearchCV | 83.13 | 71.49 | 38.68 | 50.2 | 67.17 |
| 3 | DecisionTreeClassifer with StandardScaler | 81.96 | 64.75 | 39.36 | 48.96 | 66.66 |
| 4 | RandomForestClassifer with StandardScaler | 82.58 | 67.8 | 39.51 | 49.93 | 67.11 |
| 5 | Bagging Classifier with decisionTreeClassifer ... | 82.63 | 71.56 | 34.8 | 46.83 | 65.45 |
| 6 | Bagging Classifier with RandomForestClassifer ... | 81.08 | 74.53 | 21.12 | 32.92 | 59.55 |
| 7 | Adaboost Classifier with DecisionTreeClassifie... | 80.19 | 58.33 | 34.57 | 43.42 | 63.81 |
| 8 | AdaboostClassifier with RandomForestClassifer ... | 82.29 | 67.58 | 37.39 | 48.14 | 66.17 |
| 9 | ExtraTreesClassifier alone no transformers | 81.83 | 65.79 | 36.09 | 46.61 | 65.4 |
| 10 | RandomForestClassifier alone no transformers | 82.6 | 73.22 | 32.83 | 45.33 | 64.72 |
| 11 | GradientBoosting w StandardScaler | 80.39 | 57.89 | 39.59 | 47.02 | 65.74 |
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Rebind X and y to the full feature matrix and target as NumPy arrays.
# Presumably these are handed to the learning-curve plotting further down,
# which does its own splitting -- verify against the calls that follow.
X, y = credit_data[features].values, credit_data[['next_pmt_default']].values
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
# 1) Unfitted bagging pipeline: 200 depth-10 decision trees, max_samples=90.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
bag_pipe = Pipeline(steps=[('scaler', numeric_transformer),
('bag_classifier', BaggingClassifier(base_estimator=dtc(max_depth=10),\
n_estimators=200, max_samples=90, bootstrap=True, n_jobs=-1))])
# 2) Unfitted random-forest pipeline with default hyperparameters.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
rfc_pipe = Pipeline(steps=[('scaler', numeric_transformer), ('RandomForestClassifier', RandomForestClassifier())])
# Plot Learning Curve Function Courtesy of the SKLEARN Documentation
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw three diagnostic plots for an estimator's learning behaviour.

    The three panels are: score vs. number of training examples, fit time vs.
    number of training examples, and score vs. fit time.

    Parameters
    ----------
    estimator : object
        Estimator passed straight through to ``learning_curve``.
    title : str
        Title of the first (learning curve) panel.
    X, y : array-like
        Data and target passed straight through to ``learning_curve``.
    axes : sequence of 3 matplotlib axes, optional
        Axes to draw on. When None, a new 1x3 figure of size (20, 5) is made.
    ylim : tuple, optional
        ``(ymin, ymax)`` applied to the first panel only.
    cv : optional
        Cross-validation strategy forwarded to ``learning_curve``.
    n_jobs : int, optional
        Parallelism setting forwarded to ``learning_curve``.
    train_sizes : array-like
        Training-set sizes forwarded to ``learning_curve``; the default is
        five evenly spaced values from 0.1 to 1.0.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so callers can chain ``.show()``.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    # return_times=True adds the fit/score timings to the returned tuple; the
    # score times are not used here.
    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)

    # Aggregate over axis 1 so there is one mean/std per training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt


# One column of three panels per model.
fig, axes = plt.subplots(3, 2, figsize=(10, 15))

#X, y = load_digits(return_X_y=True)

#BaggingClassifier with DescisionTreeClassifier and StandardScaler Pipeline
title = "Learning Curves (BaggingClassifier w DecisionTree)"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=42)
estimator = bag_pipe
# np.ravel gives the classifiers a 1-D target even if y is a column vector.
plot_learning_curve(estimator, title, X, np.ravel(y), axes=axes[:, 0], ylim=(0.7, 1.01),
                    cv=cv, n_jobs=-1)

#RandomForestClassifier and StandardScaler Pipeline
title = r"Learning Curves (RandomForestClassifier)"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=42)
estimator = rfc_pipe
plot_learning_curve(estimator, title, X, np.ravel(y), axes=axes[:, 1], ylim=(0.7, 1.01),
                    cv=cv, n_jobs=-1)

plt.show()

# Discuss and compare the results for all of the past three questions.
Part 1: Tuning the hyperparameters will usually lead to better scores at the cost of longer processing times. In question 1, the hyperparameter search increased the accuracy of the DecisionTreeClassifier by roughly 9.4 percentage points, from 73.81 to 83.21. The randomized search is significantly faster than the standard grid search, with very similar scores (83.13 vs. 83.21 accuracy).
Part 2: Certain models perform better or worse depending on the data fed into them. Preprocessing the data properly is very important: removing zero values, proper scaling, and normalization all contribute to these models' successful predictions. Some models are more sensitive to outliers, some are more sensitive to non-normalized data, etc. Building proper data transformation pipelines tailored to each feature's data type is crucial.
Part 3: The random forest classifier is known for performing better than some of the classical algorithms and didn't disappoint in these cases, as it had one of the highest scores without any data transformations, boosters, or baggers. The decision tree classifier performed very poorly without hyperparameter tuning, which implies that it is very dependent on hyperparameter tuning.
# Rebuild the feature frame and target, then redo the train/test split.
X, y = credit_data[features], credit_data[['next_pmt_default']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

# A depth-3 entropy tree, presumably kept shallow so the diagram stays readable.
# It is fitted on the full X, y rather than on the training split.
dtc_clf = dtc(max_depth=3, criterion='entropy')
dtc_clf.fit(X, y)
# Output: DecisionTreeClassifier(criterion='entropy', max_depth=3)

# Matplotlib rendering of the fitted tree.
tree.plot_tree(dtc_clf);

# Graphviz rendering of the same tree, displayed inline as SVG.
#dot_data = export_graphviz(tree_clf, filled=True, rounded=True, class_names=[],feature_names=feat_cols, out_file=None)
graph = Source(tree.export_graphviz(dtc_clf, filled = True))
display(SVG(graph.pipe(format='svg')))
Welcome to JGAnalytics, we offer services for hire for Data Analytics, Data Engineering, and Machine Learning.
Copyright © JGAnalytics 2024