Gradient Boosting Classifier

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import numpy as np
import collections

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
import seaborn as sns

import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
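# Note: plotly.plotly moved to the separate chart_studio package in plotly 4+.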
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

from warnings import filterwarnings
filterwarnings('ignore')

import os
os.chdir(r'D:\Data\Projects\Klassifikation\Diabetes')
In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, auc, roc_curve, make_scorer, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier
In [3]:
# load dataset
df = pd.read_csv('diabetes_clean.csv')
print(df.shape)
df.head()
(768, 9)
Out[3]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148.000 72.000 35.000 30.500 33.600 0.627 50 1
1 1 85.000 66.000 29.000 30.500 26.600 0.351 31 0
2 8 183.000 64.000 23.000 30.500 23.300 0.672 32 1
3 1 89.000 66.000 23.000 94.000 28.100 0.167 21 0
4 0 137.000 40.000 35.000 168.000 43.100 2.288 33 1

Train Test Split

In [4]:
train = df.drop('Outcome', axis=1)
labels = df.Outcome
In [5]:
x_train, x_test, y_train, y_test = train_test_split(train, labels, test_size=0.25, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape
Out[5]:
((576, 8), (192, 8), (576,), (192,))

Baseline

To compare the algorithms in a meaningful way, it is important to have a baseline. The baseline makes it possible to judge whether an algorithm actually delivers an improvement. Here, scikit-learn's DummyClassifier serves as the baseline.

In [6]:
# Makes predictions based on simple rules.
dummy = DummyClassifier(strategy='most_frequent', random_state=1)

# Train the model
dummy.fit(x_train, y_train)

# Predict
dummpred = dummy.predict(x_test)
print(classification_report(y_test, dummpred))
              precision    recall  f1-score   support

           0       0.64      1.00      0.78       123
           1       0.00      0.00      0.00        69

    accuracy                           0.64       192
   macro avg       0.32      0.50      0.39       192
weighted avg       0.41      0.64      0.50       192

The dummy classifier simply predicts the majority class, so its accuracy of 0.64 merely mirrors the class distribution; a useful model has to beat this.

Gradient Boosting Classifier Baseline

Fit with the default hyperparameters.

In [7]:
gb = GradientBoostingClassifier(random_state = 42)
gb.fit(x_train, y_train)
y_predb = gb.predict(x_test)
print(accuracy_score(y_test, y_predb))
print(classification_report(y_test, y_predb))
0.7395833333333334
              precision    recall  f1-score   support

           0       0.82      0.76      0.79       123
           1       0.62      0.71      0.66        69

    accuracy                           0.74       192
   macro avg       0.72      0.73      0.73       192
weighted avg       0.75      0.74      0.74       192

In [8]:
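# Note: the scores below are computed on the training data, so the AUC is optimistic.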
y_scores_gb = gb.decision_function(x_train)
fpr_gb, tpr_gb, _ = roc_curve(y_train, y_scores_gb)
roc_auc_gb = auc(fpr_gb, tpr_gb)

print("Area under ROC curve = {:0.2f}".format(roc_auc_gb))
Area under ROC curve = 0.99
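
For a less optimistic estimate, the same quantities can be computed on the held-out test set. A minimal sketch, reusing the fitted model and the metric imports above (not run here):

In [ ]:
# ROC AUC on the held-out test set, for comparison with the training AUC above
y_scores_test = gb.decision_function(x_test)
fpr_test, tpr_test, _ = roc_curve(y_test, y_scores_test)
print("Test AUC = {:0.2f}".format(auc(fpr_test, tpr_test)))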

Grid Search for the Best Hyperparameters

Two tools from the scikit-learn library are used for hyperparameter tuning: grid search and cross-validation. Grid search tests possible combinations of hyperparameters for the model. First, a grid of the parameters considered important for the case at hand is defined; then all possible combinations are evaluated via cross-validation. Procedure:

  1. Build the hyperparameter grid
  2. Create the GridSearchCV object
  3. Run the search with 10-fold cross-validation
In [9]:
parameters = {'learning_rate':[0.1, 0.05, 0.01, 0.005], 'n_estimators':[100, 500, 750], 
              'max_depth': [3, 5, 8], 'max_features': ['sqrt', None], 
              'subsample': [1.0, 0.8, 0.5], 'random_state': [42]}
In [10]:
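# Note: with the default shuffle=False, random_state has no effect here,
# and newer scikit-learn versions raise an error when it is set anyway.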
kfold = StratifiedKFold(n_splits=10, random_state=22)
scorer = make_scorer(f1_score, greater_is_better=True, average='weighted')
In [ ]:
grid_search = GridSearchCV(GradientBoostingClassifier(), parameters, cv=kfold, n_jobs=-1, 
                           scoring= scorer, verbose=2)
%time grid_search.fit(x_train, y_train)

print(grid_search.best_score_)
print(grid_search.best_params_)
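
To inspect how all combinations performed rather than just the best one, cv_results_ can be loaded into a DataFrame. A minimal sketch (not run here):

In [ ]:
# Overview of all tested parameter combinations, best first
results = pd.DataFrame(grid_search.cv_results_)
results[['params', 'mean_test_score', 'std_test_score']].sort_values(
    'mean_test_score', ascending=False).head(10)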

Testing the model with the best parameters

In [11]:
gbc = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 5, random_state=42,
                                 max_features = 'sqrt', n_estimators = 500, subsample = 0.5)
In [12]:
gbc.fit(x_train, y_train)
Out[12]:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=5,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=0.5, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
In [13]:
y_pred = gbc.predict(x_test)
In [14]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.82      0.78      0.80       123
           1       0.64      0.70      0.67        69

    accuracy                           0.75       192
   macro avg       0.73      0.74      0.73       192
weighted avg       0.76      0.75      0.75       192
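
Before dropping features by hand, the tuned model's impurity-based feature importances give a hint which ones matter; they motivate the reduced dataset used next. A minimal sketch using the fitted gbc (not run here):

In [ ]:
# Impurity-based feature importances of the tuned model
importances = pd.Series(gbc.feature_importances_, index=x_train.columns)
importances.sort_values().plot.barh(title='Feature Importances');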

Testing the model with dataset 2

Dataset 2: four features were removed during feature engineering.

In [15]:
x_train_sel = x_train.drop(['Insulin', 'BloodPressure', 'DiabetesPedigreeFunction', 'SkinThickness'], axis=1)
x_train_sel.head()
Out[15]:
Pregnancies Glucose BMI Age
357 13 129.000 39.900 44
73 4 129.000 35.100 23
352 3 61.000 34.400 46
497 2 81.000 30.100 25
145 0 102.000 32.000 21
In [16]:
x_test_sel = x_test.drop(['Insulin', 'BloodPressure', 'DiabetesPedigreeFunction', 'SkinThickness'], axis=1)
In [17]:
gbc.fit(x_train_sel, y_train)
Out[17]:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=5,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=0.5, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
In [18]:
y_pred_sel = gbc.predict(x_test_sel)
In [19]:
print(classification_report(y_test, y_pred_sel))
              precision    recall  f1-score   support

           0       0.81      0.76      0.78       123
           1       0.61      0.68      0.64        69

    accuracy                           0.73       192
   macro avg       0.71      0.72      0.71       192
weighted avg       0.74      0.73      0.73       192

Model with Recursive Feature Elimination

RFECV recursively removes the weakest features and selects the number of features that maximizes the cross-validated score.

In [20]:
from sklearn.feature_selection import RFECV

rfecv = RFECV(estimator=gbc, step=1, cv=kfold, scoring=scorer)
rfecv.fit(x_train, y_train)
Out[20]:
RFECV(cv=StratifiedKFold(n_splits=10, random_state=22, shuffle=False),
      estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                           learning_rate=0.01, loss='deviance',
                                           max_depth=5, max_features='sqrt',
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=500,
                                           n_iter_no_change=None,
                                           presort='auto', random_state=42,
                                           subsample=0.5, tol=0.0001,
                                           validation_fraction=0.1, verbose=0,
                                           warm_start=False),
      min_features_to_select=1, n_jobs=None,
      scoring=make_scorer(f1_score, average=weighted), step=1, verbose=0)
In [21]:
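# Note: grid_scores_ was removed in newer scikit-learn;
# rfecv.cv_results_['mean_test_score'] is the replacement there.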
plt.plot(rfecv.grid_scores_);
plt.xlabel('Number of Features');
plt.ylabel('Weighted F1 Score');  # the scorer uses average='weighted'
plt.title('Feature Selection Scores');
print('Number of Features selected: ', rfecv.n_features_)
Number of Features selected:  6
In [22]:
rankings = pd.DataFrame({'feature': list(x_train.columns), 
                         'rank': list(rfecv.ranking_)}).sort_values('rank')
rankings
Out[22]:
feature rank
0 Pregnancies 1
1 Glucose 1
4 Insulin 1
5 BMI 1
6 DiabetesPedigreeFunction 1
7 Age 1
2 BloodPressure 2
3 SkinThickness 3
In [23]:
train_selected = rfecv.transform(x_train)
test_selected = rfecv.transform(x_test)
In [24]:
selected_features = x_train.columns[rfecv.support_]
train_selected = pd.DataFrame(train_selected, columns = selected_features)
test_selected = pd.DataFrame(test_selected, columns = selected_features)
train_selected.head()
Out[24]:
Pregnancies Glucose Insulin BMI DiabetesPedigreeFunction Age
0 13.000 129.000 30.500 39.900 0.569 44.000
1 4.000 129.000 270.000 35.100 0.231 23.000
2 3.000 61.000 30.500 34.400 0.243 46.000
3 2.000 81.000 76.000 30.100 0.547 25.000
4 0.000 102.000 30.500 32.000 0.572 21.000
In [25]:
gbc.fit(train_selected, y_train)
Out[25]:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=5,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=0.5, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

Prediction with the dataset from RFE

In [26]:
y_pred_selected = gbc.predict(test_selected)
print(classification_report(y_test, y_pred_selected))
              precision    recall  f1-score   support

           0       0.81      0.78      0.80       123
           1       0.64      0.68      0.66        69

    accuracy                           0.74       192
   macro avg       0.72      0.73      0.73       192
weighted avg       0.75      0.74      0.75       192
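
To compare all four variants at a glance, the predictions already computed above can be summarized in one table. A minimal sketch reusing y_predb, y_pred, y_pred_sel and y_pred_selected (not run here):

In [ ]:
# Test accuracy of each model variant, from the predictions above
variants = {'default GBC': y_predb, 'tuned GBC': y_pred,
            'tuned, 4 features dropped': y_pred_sel,
            'tuned, RFE features': y_pred_selected}
pd.Series({name: accuracy_score(y_test, pred) for name, pred in variants.items()})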

Confusion Matrix

In [27]:
cm = confusion_matrix(y_test, y_pred_selected)
plt.rcParams['font.size'] = 20
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, cmap='Set2', fmt='d');  # annot=True annotates the cells with counts
# labels, title and ticks
ax.set_xlabel('Predicted labels', fontsize=18);
ax.set_ylabel('True labels', fontsize=18);
ax.set_title('Confusion Matrix');
ax.xaxis.set_ticklabels(['0', '1']);
ax.yaxis.set_ticklabels(['0', '1']);
plt.tight_layout()