Gradient Boosting Classifier
import pandas as pd
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import numpy as np
import collections
import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
import seaborn as sns
import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)
from warnings import filterwarnings
filterwarnings('ignore')
import os
os.chdir(r'D:\Data\Projects\Klassifikation\Diabetes')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, auc, roc_curve, make_scorer, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier
# load dataset
df = pd.read_csv('diabetes_clean.csv')
print(df.shape)
df.head()
train = df.drop('Outcome', axis=1)
labels = df.Outcome
x_train, x_test, y_train, y_test = train_test_split(train, labels, test_size=0.25, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape
To make a meaningful comparison between algorithms, it is important to have a baseline. The baseline makes it possible to judge whether an algorithm actually delivers an improvement. Here, scikit-learn's DummyClassifier is used as the baseline.
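Since the most_frequent strategy simply predicts the majority class, it helps to know how imbalanced the labels are. A minimal, purely illustrative check, assuming the split above has been run:
# Class distribution in the training and test labels
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))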
# Makes predictions based on simple rules.
dummy = DummyClassifier(strategy='most_frequent', random_state= 1)
# Train the model
dummy.fit(x_train, y_train)
# Prediction
dummpred = dummy.predict(x_test)
print(classification_report(y_test, dummpred))
Fitting with the default hyperparameters
gb = GradientBoostingClassifier(random_state = 42)
gb.fit(x_train, y_train)
y_predb = gb.predict(x_test)
print(accuracy_score(y_test, y_predb))
print(classification_report(y_test, y_predb))
y_scores_gb = gb.decision_function(x_train)
fpr_gb, tpr_gb, _ = roc_curve(y_train, y_scores_gb)
roc_auc_gb = auc(fpr_gb, tpr_gb)
print("Area under ROC curve = {:0.2f}".format(roc_auc_gb))
Two tools from the scikit-learn library are used for hyperparameter tuning: grid search and cross-validation. Grid search tests possible combinations of hyperparameters for the model. First, a grid of parameters considered important for the case at hand is defined; all possible combinations are then evaluated via cross-validation. Procedure:
parameters = {'learning_rate':[0.1, 0.05, 0.01, 0.005], 'n_estimators':[100, 500, 750],
'max_depth': [3, 5, 8], 'max_features': ['sqrt', None],
'subsample': [1.0, 0.8, 0.5], 'random_state': [42]}
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=22)
scorer = make_scorer(f1_score, greater_is_better=True, average = 'weighted')
grid_search = GridSearchCV(GradientBoostingClassifier(), parameters, cv=kfold, n_jobs=-1,
scoring= scorer, verbose=2)
%time grid_search.fit(x_train, y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)
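Beyond the single best combination, the full cross-validation results can be inspected as a DataFrame; a minimal sketch using GridSearchCV's cv_results_ attribute:
# Sketch: inspect the top parameter combinations from the grid search
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.sort_values('rank_test_score')[['params', 'mean_test_score', 'std_test_score']].head()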
gbc = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 5, random_state=42,
max_features = 'sqrt', n_estimators = 500, subsample = 0.5)
gbc.fit(x_train, y_train)
y_pred = gbc.predict(x_test)
print(classification_report(y_test, y_pred))
Dataset 2: four features were removed during feature engineering.
x_train_sel = x_train.drop(['Insulin', 'BloodPressure', 'DiabetesPedigreeFunction', 'SkinThickness'], axis=1)
x_train_sel.head()
x_test_sel = x_test.drop(['Insulin', 'BloodPressure', 'DiabetesPedigreeFunction', 'SkinThickness'], axis=1)
gbc.fit(x_train_sel, y_train)
y_pred_sel = gbc.predict(x_test_sel)
print(classification_report(y_test, y_pred_sel))
from sklearn.feature_selection import RFECV
rfecv = RFECV(estimator=gbc, step=1, cv=kfold, scoring=scorer)
rfecv.fit(x_train, y_train)
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_);
plt.xlabel('Number of Features');
plt.ylabel('Weighted F1 Score');
plt.title('Feature Selection Scores');
print('Number of Features selected: ', rfecv.n_features_)
rankings = pd.DataFrame({'feature': list(x_train.columns),
'rank': list(rfecv.ranking_)}).sort_values('rank')
rankings
train_selected = rfecv.transform(x_train)
test_selected = rfecv.transform(x_test)
selected_features = x_train.columns[np.where(rfecv.ranking_==1)]
train_selected = pd.DataFrame(train_selected, columns = selected_features)
test_selected = pd.DataFrame(test_selected, columns = selected_features)
train_selected.head()
gbc.fit(train_selected, y_train)
y_pred_selected = gbc.predict(test_selected)
print(classification_report(y_test, y_pred_selected))
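The impurity-based feature importances of the refitted model can also be inspected; a minimal sketch, assuming gbc is the model just fitted on train_selected:
# Sketch: impurity-based feature importances of the model fitted on the selected features
importances = pd.Series(gbc.feature_importances_, index=selected_features).sort_values()
importances.plot(kind='barh', title='Feature Importances');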
cm = confusion_matrix(y_test, y_pred_selected)
plt.rcParams['font.size'] = 20
ax= plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, cmap='Set2', fmt="d"); # annot=True annotates the cells with the raw counts
# labels, title and ticks
ax.set_xlabel('Predicted labels', fontsize=18);
ax.set_ylabel('True labels', fontsize=18);
ax.set_title('Confusion Matrix');
ax.xaxis.set_ticklabels(['0', '1']);
ax.yaxis.set_ticklabels(['0', '1']);
plt.tight_layout()
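For completeness, the test-set ROC AUC of the model trained on the selected features could be computed as a final check; a sketch, assuming gbc is still the model fitted on train_selected:
# Sketch: test-set ROC AUC for the model trained on the selected features
y_scores_sel = gbc.decision_function(test_selected)
print("Test ROC AUC = {:0.2f}".format(roc_auc_score(y_test, y_scores_sel)))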