import pandas as pd
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_row', 1000)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import numpy as np
import collections

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
import seaborn as sns

import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

from warnings import filterwarnings
filterwarnings('ignore')

import os
os.chdir('D:\Data\Projects\Klassifikation\Diabetes')

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

# ML 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

C:\Users\Leo\Anaconda3\lib\site-packages\sklearn\externals\six.py:31: DeprecationWarning:

The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).

# load dataset
df = pd.read_csv('diabetes_clean.csv')
print(df.shape)
df.head()

(768, 9)

Train Test Split¶

train = df.drop('Outcome', axis=1)
labels = df.Outcome

x_train, x_test, y_train, y_test = train_test_split(train, labels, test_size=0.25, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((576, 8), (192, 8), (576,), (192,))

Oversampling mit SMOTE¶

sm = SMOTE(random_state = 12)

x_trains, y_trains = sm.fit_sample(x_train, y_train)

print(y_train.value_counts())
print(collections.Counter(y_trains))

0    377
1    199
Name: Outcome, dtype: int64
Counter({1: 377, 0: 377})

Skalieren mit StandarScaler¶

# Scaler Object erstellen
sc = StandardScaler()

# Fitting auf den Trainingsdaten
sc.fit(x_trains)

# Transformieren der Trainings- und Testdaten
X_trains = sc.transform(x_trains)
X_tests = sc.transform(x_test)

# Skalieren des nicht oversampleten Satzes
sc = StandardScaler()
sc.fit(x_train)
X_train =sc.transform(x_train);
X_test =sc.transform(x_test);

Auswahl des Algorithmus¶

# Dataframe 
model_results = pd.DataFrame(columns = ['model', 'cv_mean', 'cv_std'])

# Falls ich ein Boxplot machen will
results = []
names = []

# Liste der Algorithmen
models = [RandomForestClassifier(),
          LogisticRegression(),
          KNeighborsClassifier(),
          SVC(),
          GradientBoostingClassifier()]

# Parameter Cross Validation
kfold = StratifiedKFold(n_splits=10, random_state=22)
scorer = make_scorer(f1_score, greater_is_better=True, average = 'micro')

# Jedes Model in Cross Validation testen
for model in models:
    model_name = model.__class__.__name__
    cv_scores = model_selection.cross_val_score(model, X_trains, y_trains, 
                                                cv = kfold, scoring=scorer, n_jobs = -1)
    
    # Falls ich ein Boxplot machen will
    results.append(cv_scores)
    names.append(model_name)
    
    # DataFrame mit Ergebnissen füllen
    model_results = model_results.append(pd.DataFrame({'model': model_name, 
                                                       'cv_mean': cv_scores.mean(), 
                                                       'cv_std': cv_scores.std()},
                                                        index = [0]), 
                                                        ignore_index = True)
    
    model_results = model_results.sort_values('cv_mean', ascending=False)


model_results

# Boxplot
fig = plt.figure(figsize=(10,10))
fig.suptitle('Vergleich der Algorithmen')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

Test ohne Oversampling¶

# Dataframe machen
model_results = pd.DataFrame(columns = ['model', 'cv_mean', 'cv_std'])

# Falls ich ein Boxplot machen will
results = []
names = []

# Liste der Algorithmen
models = [RandomForestClassifier(),
          LogisticRegression(),
          KNeighborsClassifier(),
          SVC(),
          GradientBoostingClassifier()]

# Parameter Cross Validation
kfold = StratifiedKFold(n_splits=10, random_state=22)
scorer = make_scorer(f1_score, greater_is_better=True, average = 'micro')

# Jedes Model in Cross Validation testen
for model in models:
    model_name = model.__class__.__name__
    cv_scores = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring=scorer, n_jobs = -1)
    
    # Falls ich ein Boxplot machen will
    results.append(cv_scores)
    names.append(model_name)
    
    # DataFrame mit Ergebnissen füllen
    model_results = model_results.append(pd.DataFrame({'model': model_name, 
                                                       'cv_mean': cv_scores.mean(), 
                                                       'cv_std': cv_scores.std()},
                                                        index = [0]), 
                                                        ignore_index = True)
    
    model_results = model_results.sort_values('cv_mean', ascending=False)


model_results

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148.000	72.000	35.000	30.500	33.600	0.627	50	1
1	1	85.000	66.000	29.000	30.500	26.600	0.351	31	0
2	8	183.000	64.000	23.000	30.500	23.300	0.672	32	1
3	1	89.000	66.000	23.000	94.000	28.100	0.167	21	0
4	0	137.000	40.000	35.000	168.000	43.100	2.288	33	1

	model	cv_mean	cv_std
4	GradientBoostingClassifier	0.824	0.050
0	SVC	0.816	0.046
1	KNeighborsClassifier	0.808	0.053
2	RandomForestClassifier	0.793	0.076
3	LogisticRegression	0.763	0.054

	model	cv_mean	cv_std
0	LogisticRegression	0.772	0.051
4	GradientBoostingClassifier	0.771	0.036
1	KNeighborsClassifier	0.762	0.049
2	SVC	0.759	0.046
3	RandomForestClassifier	0.748	0.053