course4_assignment4_completed

Project # 4 - Decision Trees, Ensemble Methods and Grid Searches¶

Justin Gabb¶

In [344]:

#Standard library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

#Sklearn imports
from scipy import stats
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

#Sklearn Scoring imports
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
random_seed=123

#Rendering imports
from sklearn.tree import export_graphviz
from pydotplus import graph_from_dot_data
from IPython.display import SVG
from graphviz import Source

In [345]:

score_keeper = pd.DataFrame(columns=['model','accuracy','precision','recall','F1', 'ROC'])
score_keeper

Out[345]:

	model	accuracy	precision	recall	F1	ROC

In [237]:

#Function to keep a record of all the models scoring.

def logistic_scoring(y_test, y_pred, classifier_name, score_keeper):
    roc = format(round(roc_auc_score(y_test, y_pred)*100, 2))
    acc = format(round(accuracy_score(y_test, y_pred)*100, 2))
    prec = format(round(precision_score(y_test, y_pred)*100, 2))
    rec = format(round(recall_score(y_test, y_pred)*100, 2))
    f1 = format(round(f1_score(y_test, y_pred)*100, 2))
    results = {'model':classifier_name, 'accuracy':acc, 'precision':prec, 'recall':rec, 'F1':f1, 'ROC':roc}
    res = score_keeper.isin([classifier_name, acc, prec, rec, f1, roc]).any()
    if res.any():
        pass
    else:
        score_keeper = score_keeper.append(results, ignore_index=True)
    return score_keeper

In [268]:

#path = ("C:/Users/Lenovo\Desktop\Data Science/U of T - Data Science\Data Science 4 - Machine Learning/Assignment 1 - Classification/default of credit card clients.xls")
path = ("C:/Users/Lenovo/Desktop/Data Science/U of T - Data Science/Data Science 4 - Machine Learning/Assignment 4 - Decision Trees/default of credit card clients.xls")
credit_data = pd.read_excel(path)

Questions (15 points total)¶

Question 1 (2 pts)¶

Build a classifier by using decision tree and calculate the confusion matrix. Try different hyper-parameters (at least two) and discuss the result.

Answer - See Below:¶

Data Preperation and Cleaning¶

In [269]:

credit_data.columns = credit_data.iloc[0]

credit_data.drop(labels='ID', inplace=True, axis=1)

credit_data.drop(index=credit_data.index[0], axis=0,inplace=True)

credit_data.rename(columns={'default payment next month':'next_pmt_default'}, inplace=True)

credit_data.head(2)

Out[269]:

	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_0	PAY_2	PAY_3	PAY_4	PAY_5	...	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	next_pmt_default
1	20000	2	2	1	24	2	2	-1	-1	-2	...	0	0	0	0	689	0	0	0	0	1
2	120000	2	2	2	26	-1	2	0	0	0	...	3272	3455	3261	0	1000	1000	1000	0	2000	1

2 rows × 24 columns

In [270]:

credit_data = credit_data.astype(np.int64)

In [271]:

credit_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 1 to 30000
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   LIMIT_BAL         30000 non-null  int64
 1   SEX               30000 non-null  int64
 2   EDUCATION         30000 non-null  int64
 3   MARRIAGE          30000 non-null  int64
 4   AGE               30000 non-null  int64
 5   PAY_0             30000 non-null  int64
 6   PAY_2             30000 non-null  int64
 7   PAY_3             30000 non-null  int64
 8   PAY_4             30000 non-null  int64
 9   PAY_5             30000 non-null  int64
 10  PAY_6             30000 non-null  int64
 11  BILL_AMT1         30000 non-null  int64
 12  BILL_AMT2         30000 non-null  int64
 13  BILL_AMT3         30000 non-null  int64
 14  BILL_AMT4         30000 non-null  int64
 15  BILL_AMT5         30000 non-null  int64
 16  BILL_AMT6         30000 non-null  int64
 17  PAY_AMT1          30000 non-null  int64
 18  PAY_AMT2          30000 non-null  int64
 19  PAY_AMT3          30000 non-null  int64
 20  PAY_AMT4          30000 non-null  int64
 21  PAY_AMT5          30000 non-null  int64
 22  PAY_AMT6          30000 non-null  int64
 23  next_pmt_default  30000 non-null  int64
dtypes: int64(24)
memory usage: 5.7 MB

In [272]:

credit_data.describe

Out[272]:

<bound method NDFrame.describe of 0      LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
1          20000    2          2         1   24      2      2     -1     -1   
2         120000    2          2         2   26     -1      2      0      0   
3          90000    2          2         2   34      0      0      0      0   
4          50000    2          2         1   37      0      0      0      0   
5          50000    1          2         1   57     -1      0     -1      0   
...          ...  ...        ...       ...  ...    ...    ...    ...    ...   
29996     220000    1          3         1   39      0      0      0      0   
29997     150000    1          3         2   43     -1     -1     -1     -1   
29998      30000    1          2         2   37      4      3      2     -1   
29999      80000    1          3         1   41      1     -1      0      0   
30000      50000    1          2         1   46      0      0      0      0   

0      PAY_5  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  \
1         -2  ...          0          0          0         0       689   
2          0  ...       3272       3455       3261         0      1000   
3          0  ...      14331      14948      15549      1518      1500   
4          0  ...      28314      28959      29547      2000      2019   
5          0  ...      20940      19146      19131      2000     36681   
...      ...  ...        ...        ...        ...       ...       ...   
29996      0  ...      88004      31237      15980      8500     20000   
29997      0  ...       8979       5190          0      1837      3526   
29998      0  ...      20878      20582      19357         0         0   
29999      0  ...      52774      11855      48944     85900      3409   
30000      0  ...      36535      32428      15313      2078      1800   

0      PAY_AMT3  PAY_AMT4  PAY_AMT5  PAY_AMT6  next_pmt_default  
1             0         0         0         0                 1  
2          1000      1000         0      2000                 1  
3          1000      1000      1000      5000                 0  
4          1200      1100      1069      1000                 0  
5         10000      9000       689       679                 0  
...         ...       ...       ...       ...               ...  
29996      5003      3047      5000      1000                 0  
29997      8998       129         0         0                 0  
29998     22000      4200      2000      3100                 1  
29999      1178      1926     52964      1804                 1  
30000      1430      1000      1000      1000                 1  

[30000 rows x 24 columns]>

In [273]:

credit_data.isna().sum()

Out[273]:

0
LIMIT_BAL           0
SEX                 0
EDUCATION           0
MARRIAGE            0
AGE                 0
PAY_0               0
PAY_2               0
PAY_3               0
PAY_4               0
PAY_5               0
PAY_6               0
BILL_AMT1           0
BILL_AMT2           0
BILL_AMT3           0
BILL_AMT4           0
BILL_AMT5           0
BILL_AMT6           0
PAY_AMT1            0
PAY_AMT2            0
PAY_AMT3            0
PAY_AMT4            0
PAY_AMT5            0
PAY_AMT6            0
next_pmt_default    0
dtype: int64

In [274]:

credit_data[credit_data.EDUCATION == 0].shape[0]
credit_data[credit_data.EDUCATION == 0].index.tolist()
drop_list1 = credit_data[credit_data.EDUCATION == 0].index.tolist()

credit_data[credit_data.MARRIAGE == 0].shape[0]
credit_data[credit_data.MARRIAGE == 0].index.tolist()
drop_list2 = credit_data[credit_data.MARRIAGE == 0].index.tolist()

drop_list_final = drop_list1 + drop_list2

credit_data = credit_data.drop(index=drop_list_final)

In [275]:

credit_data.shape

Out[275]:

(29932, 24)

Data Splitting and Assignment¶

In [276]:

features = list(credit_data.iloc[:,0:23].columns.values)
features

Out[276]:

['LIMIT_BAL',
 'SEX',
 'EDUCATION',
 'MARRIAGE',
 'AGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6']

In [277]:

target_cols = credit_data.iloc[:, -1:].columns.values
target_cols

Out[277]:

array(['next_pmt_default'], dtype=object)

In [278]:

X, y = credit_data[features], credit_data[['next_pmt_default']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

In [279]:

X_train

Out[279]:

	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_0	PAY_2	PAY_3	PAY_4	PAY_5	...	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6
8848	200000	1	1	2	33	-1	-1	-2	-1	-1	...	-60	1740	0	0	0	0	1800	0	0	0
22959	210000	2	2	1	46	0	0	0	0	0	...	111335	114037	67224	69177	6149	4100	4547	2500	3000	2500
5881	190000	1	1	2	35	0	0	0	0	0	...	22117	23433	22173	22598	1807	2000	2010	1003	2007	3019
8575	150000	2	1	1	32	1	-2	-2	-2	-2	...	0	0	0	0	0	0	0	0	0	0
28250	150000	2	1	1	33	1	-1	-1	-1	0	...	0	863	863	0	353	0	863	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
28702	140000	2	2	2	35	0	0	0	0	0	...	84235	82432	81465	83160	4000	4010	3800	3200	3100	3000
17767	130000	2	3	2	55	0	0	0	0	0	...	128243	115204	101158	95895	10063	20000	10009	10000	5000	5000
28093	130000	2	3	2	46	0	0	0	0	0	...	132927	131866	130707	129174	5000	5000	4600	5000	4614	5139
15755	200000	2	3	2	25	0	0	0	0	-1	...	9853	5334	5481	11344	1343	5033	3516	5497	11378	12774
20009	200000	2	1	2	31	0	0	0	0	0	...	104382	91888	88267	85471	2768	48617	3243	3000	3233	3000

23945 rows × 23 columns

In [280]:

#Imports for pipeline construction and data transformation

from sklearn.pipeline import Pipeline            #to store the steps of transformers and estimators
from sklearn.compose import ColumnTransformer    #for data that is formatted into panda column format
from sklearn.preprocessing import StandardScaler #may perform badly if data is not normally distributed
from sklearn.preprocessing import MinMaxScaler   #
from sklearn.preprocessing import RobustScaler   #if data contains many outliers this can handle outliers better then others
from sklearn.preprocessing import MaxAbsScaler   #preserves zero entries in sparse data, specifically designed for scaling sparse data

Pipeline for data transformation using standard scaler, pandas column transformer, and DecisionTreeClassifier¶

In [281]:

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, features),])

tree_clf = Pipeline(steps=[('preprocessor', preprocessor),
                            ('decision_tree_classifier', dtc())
                            ])

Model training and fitting¶

In [252]:

tree_clf.fit(X_train, np.ravel(y_train))
y_pred = tree_clf.predict(X_test)

In [253]:

score_keeper = logistic_scoring(y_test, y_pred, 'DecisionTreeClassifer before hyperparameter tuning', score_keeper)
score_keeper

Out[253]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75

In [254]:

print("Accuracy of tree classifier before hyperparameter tuning: {}%".format(round(accuracy_score(y_test, y_pred)*100, 2)))

Accuracy of tree classifier before hyperparameter tuning: 73.81%

In [255]:

tree_clf.get_params().keys()

Out[255]:

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'decision_tree_classifier', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__scaler', 'preprocessor__num__scaler__copy', 'preprocessor__num__scaler__with_mean', 'preprocessor__num__scaler__with_std', 'decision_tree_classifier__ccp_alpha', 'decision_tree_classifier__class_weight', 'decision_tree_classifier__criterion', 'decision_tree_classifier__max_depth', 'decision_tree_classifier__max_features', 'decision_tree_classifier__max_leaf_nodes', 'decision_tree_classifier__min_impurity_decrease', 'decision_tree_classifier__min_impurity_split', 'decision_tree_classifier__min_samples_leaf', 'decision_tree_classifier__min_samples_split', 'decision_tree_classifier__min_weight_fraction_leaf', 'decision_tree_classifier__presort', 'decision_tree_classifier__random_state', 'decision_tree_classifier__splitter'])

In [256]:

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

parameters = {'decision_tree_classifier__max_depth':[2,4,6,8,10,12],
              'decision_tree_classifier__min_samples_leaf':[2,4,6,8,10,12], 
              'decision_tree_classifier__min_samples_split':[2,4,6,8,10,12],
              'decision_tree_classifier__criterion':['gini','entropy'],
             }

grid_search_tree = GridSearchCV(estimator=tree_clf,
                           param_grid=parameters,
                           scoring = 'accuracy',
                           cv=10,
                           verbose=3,
                           refit=True, 
                           n_jobs=-1)

In [257]:

grid_search_tree.fit(X=X_train,y=np.ravel(y_train))

Fitting 10 folds for each of 432 candidates, totalling 4320 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   45.8s
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2040 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 3864 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed:  8.4min finished

Out[257]:

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         ['LIMIT_BAL',
                                                                          'SEX',
                                                                          'EDUCATION',
                                                                          'MARRIAGE',
                                                                          'AGE',
                                                                          'PAY_0',
                                                                          'PAY_2',
                                                                          'PAY_3',
                                                                          'PAY_4',
                                                                          'PAY_5',
                                                                          'PAY_6',
                                                                          'BILL_AMT1',
                                                                          'BILL_AMT2',
                                                                          'BILL_AMT3',
                                                                          'BILL_AMT4',
                                                                          'BILL_AMT5',
                                                                          'BILL_AMT6',
                                                                          'PAY_AMT1',
                                                                          'PAY_AMT2',
                                                                          'PAY_AMT3',
                                                                          'PA...
                                                                          'PAY_AMT6'])])),
                                       ('decision_tree_classifier',
                                        DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'decision_tree_classifier__criterion': ['gini',
                                                                 'entropy'],
                         'decision_tree_classifier__max_depth': [2, 4, 6, 8, 10,
                                                                 12],
                         'decision_tree_classifier__min_samples_leaf': [2, 4, 6,
                                                                        8, 10,
                                                                        12],
                         'decision_tree_classifier__min_samples_split': [2, 4,
                                                                         6, 8,
                                                                         10,
                                                                         12]},
             scoring='accuracy', verbose=3)

In [258]:

grid_search_tree.best_params_

Out[258]:

{'decision_tree_classifier__criterion': 'entropy',
 'decision_tree_classifier__max_depth': 4,
 'decision_tree_classifier__min_samples_leaf': 12,
 'decision_tree_classifier__min_samples_split': 2}

In [259]:

final_model_tree = grid_search_tree.best_estimator_

In [260]:

final_model_tree.fit(X=X_train,y=np.ravel(y_train))

Out[260]:

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['LIMIT_BAL', 'SEX',
                                                   'EDUCATION', 'MARRIAGE',
                                                   'AGE', 'PAY_0', 'PAY_2',
                                                   'PAY_3', 'PAY_4', 'PAY_5',
                                                   'PAY_6', 'BILL_AMT1',
                                                   'BILL_AMT2', 'BILL_AMT3',
                                                   'BILL_AMT4', 'BILL_AMT5',
                                                   'BILL_AMT6', 'PAY_AMT1',
                                                   'PAY_AMT2', 'PAY_AMT3',
                                                   'PAY_AMT4', 'PAY_AMT5',
                                                   'PAY_AMT6'])])),
                ('decision_tree_classifier',
                 DecisionTreeClassifier(criterion='entropy', max_depth=4,
                                        min_samples_leaf=12))])

In [261]:

final_model_tree.score(X_test, y_test)

Out[261]:

0.8321362953064975

In [262]:

y_pred = final_model_tree.predict(X_test)

Cross Validation¶

In [263]:

classifiers = [
    final_model_tree,
]
classifier_names = [
    'Decision Tree Classifier using StandardScaler',
]
for clf, clf_name in zip(classifiers, classifier_names):
    cv_scores = cross_val_score(clf, X_test, np.ravel(y_test), cv=100)
    
    print(clf_name, ' mean accuracy: ', round(cv_scores.mean()*100, 3), '% std: ', round(cv_scores.var()*100, 3),'%')

Decision Tree Classifier using StandardScaler  mean accuracy:  82.875 % std:  0.151 %

Model Evaluation¶

In [264]:

score_keeper = logistic_scoring(y_test, y_pred, 'DecisionTreeClassifer after hyperparameter tuning', score_keeper)
score_keeper

Out[264]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75
1	DecisionTreeClassifer after hyperparameter tuning	83.21	72.31	38.3	50.07	67.08

In [265]:

print("Accuracy of DecisionTreeClassifier: {}%".format(round(accuracy_score(y_test, y_pred)*100, 2)))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, linewidths=1, cmap="Blues", fmt='g')
plt.xlabel('Predicted classes')
plt.ylabel('True Classes')
plt.show()
print(classification_report(y_test, y_pred,digits=6))

Accuracy of DecisionTreeClassifier: 83.21%

              precision    recall  f1-score   support

           0   0.846503  0.958681  0.899107      4671
           1   0.723099  0.382979  0.500745      1316

    accuracy                       0.832136      5987
   macro avg   0.784801  0.670830  0.699926      5987
weighted avg   0.819377  0.832136  0.811543      5987

In [266]:

from sklearn.metrics import plot_precision_recall_curve
disp = plot_precision_recall_curve(final_model_tree, X_test, y_test)
disp

Out[266]:

<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x20f53ef1f70>

Question 2 (4 pts)¶

Try to build the decision tree which you built for the previous question, but this time by RandomizedSearchCV over hyper-parameters. Compare the results.

Answer:¶

The scores are exactly the same, the major difference being the timing. RandomzedGridSearchCV takes 9.9 seconds to complete versus 8.4 minutes for the standard GridSearchCV. So there is a major processing time differance for the same scoring results.

Randomized parameter search using RandomizedSearchCV¶

In [282]:

param_dist = {'decision_tree_classifier__max_depth':[2,4,6,8,10,12],
              'decision_tree_classifier__min_samples_leaf':[2,4,6,8,10,12], 
              'decision_tree_classifier__min_samples_split':[2,4,6,8,10,12],
              'decision_tree_classifier__criterion':['gini','entropy']
             }


clf = RandomizedSearchCV(tree_clf, param_dist, refit=True, n_jobs=-1, verbose=3, cv=10)

In [283]:

clf.fit(X=X_train,y=np.ravel(y_train))

Fitting 10 folds for each of 10 candidates, totalling 100 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.1s finished

Out[283]:

RandomizedSearchCV(cv=10,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('scaler',
                                                                                                StandardScaler())]),
                                                                               ['LIMIT_BAL',
                                                                                'SEX',
                                                                                'EDUCATION',
                                                                                'MARRIAGE',
                                                                                'AGE',
                                                                                'PAY_0',
                                                                                'PAY_2',
                                                                                'PAY_3',
                                                                                'PAY_4',
                                                                                'PAY_5',
                                                                                'PAY_6',
                                                                                'BILL_AMT1',
                                                                                'BILL_AMT2',
                                                                                'BILL_AMT3',
                                                                                'BILL_AMT4',
                                                                                'BILL_AMT5',
                                                                                'BILL_AMT6',
                                                                                'PAY_AMT1',
                                                                                'PAY_AMT2',
                                                                                'PAY_AMT...
                                                                                'PAY_AMT5',
                                                                                'PAY_AMT6'])])),
                                             ('decision_tree_classifier',
                                              DecisionTreeClassifier())]),
                   n_jobs=-1,
                   param_distributions={'decision_tree_classifier__criterion': ['gini',
                                                                                'entropy'],
                                        'decision_tree_classifier__max_depth': [2,
                                                                                4,
                                                                                6,
                                                                                8,
                                                                                10,
                                                                                12],
                                        'decision_tree_classifier__min_samples_leaf': [2,
                                                                                       4,
                                                                                       6,
                                                                                       8,
                                                                                       10,
                                                                                       12],
                                        'decision_tree_classifier__min_samples_split': [2,
                                                                                        4,
                                                                                        6,
                                                                                        8,
                                                                                        10,
                                                                                        12]},
                   verbose=3)

In [284]:

clf.best_params_

Out[284]:

{'decision_tree_classifier__min_samples_split': 6,
 'decision_tree_classifier__min_samples_leaf': 4,
 'decision_tree_classifier__max_depth': 4,
 'decision_tree_classifier__criterion': 'entropy'}

In [285]:

final_model_rand = clf.best_estimator_
rand_predict = final_model_rand.predict(X_test)

RandomizedGridSearchCV Scoring¶

In [286]:

score_keeper = logistic_scoring(y_test, rand_predict, 'DecisionTreeClassifer using RandomGridSearchCV', score_keeper)
score_keeper

Out[286]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75
1	DecisionTreeClassifer after hyperparameter tuning	83.21	72.31	38.3	50.07	67.08
2	DecisionTreeClassifer using RandomGridSearchCV	83.13	71.49	38.68	50.2	67.17

Question 3 (6 pts)¶

Part 1: Try to build the same classifier by using following ensemble models.

Random Forest
AdaBoost
Extra Trees Classifier
Gradient Boosted Trees

Part 2: For each of these models calculate accuracy and at least for two in the list below, plot the learning curves.

In [333]:

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [334]:

credit_data.shape

Out[334]:

(29932, 24)

In [335]:

X, y = credit_data[features].values, credit_data[['next_pmt_default']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
X_train

Out[335]:

array([[200000,      1,      1, ...,      0,      0,      0],
       [210000,      2,      2, ...,   2500,   3000,   2500],
       [190000,      1,      1, ...,   1003,   2007,   3019],
       ...,
       [130000,      2,      3, ...,   5000,   4614,   5139],
       [200000,      2,      3, ...,   5497,  11378,  12774],
       [200000,      2,      1, ...,   3000,   3233,   3000]], dtype=int64)

DecisionTreeClassifier with StandardScaler¶

In [290]:

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
dtc = Pipeline(steps=[('scaler', numeric_transformer), ('dtc_classifier', dtc(max_depth=10))])

In [291]:

dtc.fit(X_train, np.ravel(y_train))
y_pred = dtc.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8196091531651912

In [292]:

score_keeper = logistic_scoring(y_test, y_pred, 'DecisionTreeClassifer with StandardScaler', score_keeper)
score_keeper

Out[292]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75
1	DecisionTreeClassifer after hyperparameter tuning	83.21	72.31	38.3	50.07	67.08
2	DecisionTreeClassifer using RandomGridSearchCV	83.13	71.49	38.68	50.2	67.17
3	DecisionTreeClassifer with StandardScaler	81.96	64.75	39.36	48.96	66.66

RandomForestClassifier with StandardScaler¶

In [296]:

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
rfc = Pipeline(steps=[('scaler', numeric_transformer), ('RandomForestClassifier', RandomForestClassifier())])

In [297]:

rfc.fit(X_train, np.ravel(y_train))
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8257892099549022

In [298]:

score_keeper = logistic_scoring(y_test, y_pred, 'RandomForestClassifer with StandardScaler', score_keeper)
score_keeper

Out[298]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75
1	DecisionTreeClassifer after hyperparameter tuning	83.21	72.31	38.3	50.07	67.08
2	DecisionTreeClassifer using RandomGridSearchCV	83.13	71.49	38.68	50.2	67.17
3	DecisionTreeClassifer with StandardScaler	81.96	64.75	39.36	48.96	66.66
4	RandomForestClassifer with StandardScaler	82.58	67.8	39.51	49.93	67.11

Bagging Classifier with decisionTreeClassifer and StandardScaler¶

In [302]:

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
bag_clf = Pipeline(steps=[('scaler', numeric_transformer),
                          ('bag_classifier', BaggingClassifier(base_estimator=dtc(max_depth=10),\
                                                               n_estimators=200, max_samples=90, bootstrap=True, n_jobs=-1))])

In [303]:

bag_clf.fit(X_train, np.ravel(y_train))
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8262902956405546

In [304]:

score_keeper = logistic_scoring(y_test, y_pred, 'Bagging Classifier with decisionTreeClassifer StandardScaler', score_keeper)
score_keeper

Out[304]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75
1	DecisionTreeClassifer after hyperparameter tuning	83.21	72.31	38.3	50.07	67.08
2	DecisionTreeClassifer using RandomGridSearchCV	83.13	71.49	38.68	50.2	67.17
3	DecisionTreeClassifer with StandardScaler	81.96	64.75	39.36	48.96	66.66
4	RandomForestClassifer with StandardScaler	82.58	67.8	39.51	49.93	67.11
5	Bagging Classifier with decisionTreeClassifer ...	82.63	71.56	34.8	46.83	65.45

Bagging Classifier with RandomForestClassifier and StandardScaler¶

In [308]:

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
bag_rfc_clf = Pipeline(steps=[('scaler_rfc', numeric_transformer), 
                              ('bag_classifier_rfc', BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1),\
                                                                       n_estimators=200, max_samples=90, bootstrap=True, n_jobs=-1))])

In [309]:

bag_rfc_clf.fit(X_train, np.ravel(y_train))
y_pred = bag_rfc_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8107566393853349

In [310]:

score_keeper = logistic_scoring(y_test, y_pred, 'Bagging Classifier with RandomForestClassifer StandardScaler', score_keeper)
score_keeper

Out[310]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75
1	DecisionTreeClassifer after hyperparameter tuning	83.21	72.31	38.3	50.07	67.08
2	DecisionTreeClassifer using RandomGridSearchCV	83.13	71.49	38.68	50.2	67.17
3	DecisionTreeClassifer with StandardScaler	81.96	64.75	39.36	48.96	66.66
4	RandomForestClassifer with StandardScaler	82.58	67.8	39.51	49.93	67.11
5	Bagging Classifier with decisionTreeClassifer ...	82.63	71.56	34.8	46.83	65.45
6	Bagging Classifier with RandomForestClassifer ...	81.08	74.53	21.12	32.92	59.55

Adaboost Classifier with DecisionTreeClassifier and StandardScaler¶

In [314]:

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
ada_clf = Pipeline(steps=[('scaler', numeric_transformer),
                          ('adaboost_classifier', AdaBoostClassifier(dtc(max_depth=10),n_estimators=200, algorithm="SAMME.R", learning_rate=0.9))])

In [315]:

ada_clf.fit(X_train, np.ravel(y_train))
y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8019041256054785

In [316]:

score_keeper = logistic_scoring(y_test, y_pred, 'Adaboost Classifier with DecisionTreeClassifier StandardScaler', score_keeper)
score_keeper

Out[316]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75
1	DecisionTreeClassifer after hyperparameter tuning	83.21	72.31	38.3	50.07	67.08
2	DecisionTreeClassifer using RandomGridSearchCV	83.13	71.49	38.68	50.2	67.17
3	DecisionTreeClassifer with StandardScaler	81.96	64.75	39.36	48.96	66.66
4	RandomForestClassifer with StandardScaler	82.58	67.8	39.51	49.93	67.11
5	Bagging Classifier with decisionTreeClassifer ...	82.63	71.56	34.8	46.83	65.45
6	Bagging Classifier with RandomForestClassifer ...	81.08	74.53	21.12	32.92	59.55
7	Adaboost Classifier with DecisionTreeClassifie...	80.19	58.33	34.57	43.42	63.81

Adaboost Classifier with RandomForestClassifier and StandardScaler¶

In [320]:

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
ada_rfc_clf = Pipeline(steps=[('scaler', numeric_transformer),('adaboost_classifier', AdaBoostClassifier(RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1),n_estimators=200, algorithm="SAMME.R", learning_rate=0.9))])

In [321]:

ada_rfc_clf.fit(X_train, np.ravel(y_train))
y_pred = ada_rfc_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8229497244028728

In [322]:

score_keeper = logistic_scoring(y_test, y_pred, 'AdaboostClassifier with RandomForestClassifer StandardScaler', score_keeper)
score_keeper

Out[322]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75
1	DecisionTreeClassifer after hyperparameter tuning	83.21	72.31	38.3	50.07	67.08
2	DecisionTreeClassifer using RandomGridSearchCV	83.13	71.49	38.68	50.2	67.17
3	DecisionTreeClassifer with StandardScaler	81.96	64.75	39.36	48.96	66.66
4	RandomForestClassifer with StandardScaler	82.58	67.8	39.51	49.93	67.11
5	Bagging Classifier with decisionTreeClassifer ...	82.63	71.56	34.8	46.83	65.45
6	Bagging Classifier with RandomForestClassifer ...	81.08	74.53	21.12	32.92	59.55
7	Adaboost Classifier with DecisionTreeClassifie...	80.19	58.33	34.57	43.42	63.81
8	AdaboostClassifier with RandomForestClassifer ...	82.29	67.58	37.39	48.14	66.17

Extratrees Classifier alone no pipeline or transformers¶

In [326]:

forest = ExtraTreesClassifier(n_estimators=10)
forest.fit(X_train, np.ravel(y_train))
y_pred = forest.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8182729246701186

In [327]:

score_keeper = logistic_scoring(y_test, y_pred, 'ExtraTreesClassifier alone no transformers', score_keeper)
score_keeper

Out[327]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75
1	DecisionTreeClassifer after hyperparameter tuning	83.21	72.31	38.3	50.07	67.08
2	DecisionTreeClassifer using RandomGridSearchCV	83.13	71.49	38.68	50.2	67.17
3	DecisionTreeClassifer with StandardScaler	81.96	64.75	39.36	48.96	66.66
4	RandomForestClassifer with StandardScaler	82.58	67.8	39.51	49.93	67.11
5	Bagging Classifier with decisionTreeClassifer ...	82.63	71.56	34.8	46.83	65.45
6	Bagging Classifier with RandomForestClassifer ...	81.08	74.53	21.12	32.92	59.55
7	Adaboost Classifier with DecisionTreeClassifie...	80.19	58.33	34.57	43.42	63.81
8	AdaboostClassifier with RandomForestClassifer ...	82.29	67.58	37.39	48.14	66.17
9	ExtraTreesClassifier alone no transformers	81.83	65.79	36.09	46.61	65.4

RandomForestClassifer alone no pipeline or transformers¶

In [331]:

rnd_clf = RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, np.ravel(y_train))
y_pred = rnd_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8259562385167863

In [332]:

score_keeper = logistic_scoring(y_test, y_pred, 'RandomForestClassifier alone no transformers', score_keeper)
score_keeper

Out[332]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75
1	DecisionTreeClassifer after hyperparameter tuning	83.21	72.31	38.3	50.07	67.08
2	DecisionTreeClassifer using RandomGridSearchCV	83.13	71.49	38.68	50.2	67.17
3	DecisionTreeClassifer with StandardScaler	81.96	64.75	39.36	48.96	66.66
4	RandomForestClassifer with StandardScaler	82.58	67.8	39.51	49.93	67.11
5	Bagging Classifier with decisionTreeClassifer ...	82.63	71.56	34.8	46.83	65.45
6	Bagging Classifier with RandomForestClassifer ...	81.08	74.53	21.12	32.92	59.55
7	Adaboost Classifier with DecisionTreeClassifie...	80.19	58.33	34.57	43.42	63.81
8	AdaboostClassifier with RandomForestClassifer ...	82.29	67.58	37.39	48.14	66.17
9	ExtraTreesClassifier alone no transformers	81.83	65.79	36.09	46.61	65.4
10	RandomForestClassifier alone no transformers	82.6	73.22	32.83	45.33	64.72

GradientBoostingClassifier with StandardScaler¶

In [336]:

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
gbc = Pipeline(steps=[('scaler', numeric_transformer), ('GradientBoostingClassifier',
                                                        GradientBoostingClassifier(n_estimators=200, learning_rate=1, max_features=10, max_depth=10, random_state=42))])

In [337]:

gbc.fit(X_train, np.ravel(y_train))
y_pred = gbc.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8039084683480875

In [338]:

score_keeper = logistic_scoring(y_test, y_pred, 'GradientBoosting w StandardScaler', score_keeper)

Question 3 Part 1:¶

Answer:¶

See below for all specified models scoring metrics

In [340]:

score_keeper

Out[340]:

	model	accuracy	precision	recall	F1	ROC
0	DecisionTreeClassifer before hyperparameter tu...	73.81	40.9	43.01	41.93	62.75
1	DecisionTreeClassifer after hyperparameter tuning	83.21	72.31	38.3	50.07	67.08
2	DecisionTreeClassifer using RandomGridSearchCV	83.13	71.49	38.68	50.2	67.17
3	DecisionTreeClassifer with StandardScaler	81.96	64.75	39.36	48.96	66.66
4	RandomForestClassifer with StandardScaler	82.58	67.8	39.51	49.93	67.11
5	Bagging Classifier with decisionTreeClassifer ...	82.63	71.56	34.8	46.83	65.45
6	Bagging Classifier with RandomForestClassifer ...	81.08	74.53	21.12	32.92	59.55
7	Adaboost Classifier with DecisionTreeClassifie...	80.19	58.33	34.57	43.42	63.81
8	AdaboostClassifier with RandomForestClassifer ...	82.29	67.58	37.39	48.14	66.17
9	ExtraTreesClassifier alone no transformers	81.83	65.79	36.09	46.61	65.4
10	RandomForestClassifier alone no transformers	82.6	73.22	32.83	45.33	64.72
11	GradientBoosting w StandardScaler	80.39	57.89	39.59	47.02	65.74

Question 3 Part 2:¶

Answer:¶

See below for learning curves for the two models with some of the highest scores:

Bagging classifier with DecisionTreeClassifier and StandardScaler: 82.63
RandomForestClassifier with StandardScaler: 82.58

In [342]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [343]:

X, y = credit_data[features].values, credit_data[['next_pmt_default']].values
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

In [268]:

# 1)
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
bag_pipe = Pipeline(steps=[('scaler', numeric_transformer),
                          ('bag_classifier', BaggingClassifier(base_estimator=dtc(max_depth=10),\
                                                               n_estimators=200, max_samples=90, bootstrap=True, n_jobs=-1))])

In [269]:

# 2) 
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
rfc_pipe = Pipeline(steps=[('scaler', numeric_transformer), ('RandomForestClassifier', RandomForestClassifier())])

In [270]:

# Plot Learning Curve Function Courtesy of the SKLEARN Documentation def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)): if axes is None: _, axes = plt.subplots(1, 3, figsize=(20, 5)) axes[0].set_title(title) if ylim is not None: axes[0].set_ylim(*ylim) axes[0].set_xlabel("Training examples") axes[0].set_ylabel("Score") train_sizes, train_scores, test_scores, fit_times, _ = \ learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, return_times=True) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) fit_times_mean = np.mean(fit_times, axis=1) fit_times_std = np.std(fit_times, axis=1) # Plot learning curve axes[0].grid() axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r") axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g") axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score") axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score") axes[0].legend(loc="best") # Plot n_samples vs fit_times axes[1].grid() axes[1].plot(train_sizes, fit_times_mean, 'o-') axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std, fit_times_mean + fit_times_std, alpha=0.1) axes[1].set_xlabel("Training examples") axes[1].set_ylabel("fit_times") axes[1].set_title("Scalability of the model") # Plot fit_time vs score axes[2].grid() axes[2].plot(fit_times_mean, test_scores_mean, 'o-') axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1) axes[2].set_xlabel("fit_times") axes[2].set_ylabel("Score") axes[2].set_title("Performance of the model") return plt fig, axes = plt.subplots(3, 2, figsize=(10, 15)) #X, y = load_digits(return_X_y=True) #BaggingClassifier with DescisionTreeClassifier and StandardScaler Pipeline title = "Learning Curves (BaggingClassifier w DecisionTree)" cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=42) estimator = bag_pipe plot_learning_curve(estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01), cv=cv, n_jobs=-1) #RandomForestClassifier and StandardScaler Pipeline title = r"Learning Curves (RandomForestClassifier)" cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=42) estimator = rfc_pipe plot_learning_curve(estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01), cv=cv, n_jobs=-1) plt.show()

Question 4 (3 pts)¶

Discuss and compare the results for the all past three questions.

How does changing hyperparams effect model performance?
Why do you think certain models performed better/worse?
How does this performance line up with known strengths/weakness of these models?

Question 4 Answer:¶

Part 1: Changing the hyperparameters will usually lead to increased scores and increased processing times. In question 1 doing a hyperparameter search increased the accuracy score of the DecisionTreeClassifier by 10%, it went from 73.18 to 83.21. The Randomized grid search is significantly faster then the standard gridsearch with very similiar scoring.
Part 2: Certain models perform better or worse depending on the data fed into them. Preprocessing data properly is very important, removing zero values, proper scaling, and normalization all contribute to these models succesful predictions. Some models are more sensitive to having outliers, some are more sensitive to non normalised data etc. Building proper data transformation pipelines tailored to each features data type is crucial.
Part 3: The random forest classifier is know for performing better then some of the classical algorithms and didnt disappoint in these cases, as it had one of the highest scores without any data transformations, boosters, or baggers. The decision tree classifier performed very poorly without hyperparameter tuning so that would imply it is very hyper parameter tuning dependant.

Tree Graph Showing the Leafs - for fun¶

In [62]:

X, y = credit_data[features], credit_data[['next_pmt_default']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

In [67]:

dtc_clf = dtc(max_depth=3, criterion='entropy') dtc_clf.fit(X, y)

Out[67]:

DecisionTreeClassifier(criterion='entropy', max_depth=3)

In [68]:

tree.plot_tree(dtc_clf);

In [149]:

#dot_data = export_graphviz(tree_clf, filled=True, rounded=True, class_names=[],feature_names=feat_cols, out_file=None)

In [69]:

graph = Source(tree.export_graphviz(dtc_clf, filled = True)) display(SVG(graph.pipe(format='svg')))

In [ ]:

Project # 4 - Decision Trees, Ensemble Methods and Grid Searches¶

Justin Gabb¶

Questions (15 points total)¶

Question 1 (2 pts)¶

Answer - See Below:¶

Data Preperation and Cleaning¶

Data Splitting and Assignment¶

Pipeline for data transformation using standard scaler, pandas column transformer, and DecisionTreeClassifier¶

Model training and fitting¶

Cross Validation¶

Model Evaluation¶

Question 2 (4 pts)¶

Answer:¶

Randomized parameter search using RandomizedSearchCV¶

RandomizedGridSearchCV Scoring¶

Question 3 (6 pts)¶

DecisionTreeClassifier with StandardScaler¶

RandomForestClassifier with StandardScaler¶

Bagging Classifier with decisionTreeClassifer and StandardScaler¶

Bagging Classifier with RandomForestClassifier and StandardScaler¶

Adaboost Classifier with DecisionTreeClassifier and StandardScaler¶

Adaboost Classifier with RandomForestClassifier and StandardScaler¶

Extratrees Classifier alone no pipeline or transformers¶

RandomForestClassifer alone no pipeline or transformers¶

GradientBoostingClassifier with StandardScaler¶

Question 3 Part 1:¶

Answer:¶

Question 3 Part 2:¶

Answer:¶

Question 4 (3 pts)¶

Question 4 Answer:¶

Tree Graph Showing the Leafs - for fun¶

Links

Portfolio

Contact