In [1]:
import pandas as pd
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_row', 1000)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import numpy as np
import collections

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
import seaborn as sns

import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

from warnings import filterwarnings
filterwarnings('ignore')

import os
os.chdir('D:\Data\Projects\Klassifikation\Diabetes')
In [2]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

# ML 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
C:\Users\Leo\Anaconda3\lib\site-packages\sklearn\externals\six.py:31: DeprecationWarning:

The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).

In [3]:
# load dataset
df = pd.read_csv('diabetes_clean.csv')
print(df.shape)
df.head()
(768, 9)
Out[3]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148.000 72.000 35.000 30.500 33.600 0.627 50 1
1 1 85.000 66.000 29.000 30.500 26.600 0.351 31 0
2 8 183.000 64.000 23.000 30.500 23.300 0.672 32 1
3 1 89.000 66.000 23.000 94.000 28.100 0.167 21 0
4 0 137.000 40.000 35.000 168.000 43.100 2.288 33 1

Train Test Split

In [4]:
train = df.drop('Outcome', axis=1)
labels = df.Outcome
In [5]:
x_train, x_test, y_train, y_test = train_test_split(train, labels, test_size=0.25, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape
Out[5]:
((576, 8), (192, 8), (576,), (192,))

Oversampling mit SMOTE

In [6]:
sm = SMOTE(random_state = 12)

x_trains, y_trains = sm.fit_sample(x_train, y_train)

print(y_train.value_counts())
print(collections.Counter(y_trains))
0    377
1    199
Name: Outcome, dtype: int64
Counter({1: 377, 0: 377})

Skalieren mit StandarScaler

In [8]:
# Scaler Object erstellen
sc = StandardScaler()

# Fitting auf den Trainingsdaten
sc.fit(x_trains)

# Transformieren der Trainings- und Testdaten
X_trains = sc.transform(x_trains)
X_tests = sc.transform(x_test)
In [26]:
# Skalieren des nicht oversampleten Satzes
sc = StandardScaler()
sc.fit(x_train)
X_train =sc.transform(x_train);
X_test =sc.transform(x_test);

Auswahl des Algorithmus

In [17]:
# Dataframe 
model_results = pd.DataFrame(columns = ['model', 'cv_mean', 'cv_std'])

# Falls ich ein Boxplot machen will
results = []
names = []

# Liste der Algorithmen
models = [RandomForestClassifier(),
          LogisticRegression(),
          KNeighborsClassifier(),
          SVC(),
          GradientBoostingClassifier()]

# Parameter Cross Validation
kfold = StratifiedKFold(n_splits=10, random_state=22)
scorer = make_scorer(f1_score, greater_is_better=True, average = 'micro')

# Jedes Model in Cross Validation testen
for model in models:
    model_name = model.__class__.__name__
    cv_scores = model_selection.cross_val_score(model, X_trains, y_trains, 
                                                cv = kfold, scoring=scorer, n_jobs = -1)
    
    # Falls ich ein Boxplot machen will
    results.append(cv_scores)
    names.append(model_name)
    
    # DataFrame mit Ergebnissen füllen
    model_results = model_results.append(pd.DataFrame({'model': model_name, 
                                                       'cv_mean': cv_scores.mean(), 
                                                       'cv_std': cv_scores.std()},
                                                        index = [0]), 
                                                        ignore_index = True)
    
    model_results = model_results.sort_values('cv_mean', ascending=False)


model_results
Out[17]:
model cv_mean cv_std
4 GradientBoostingClassifier 0.824 0.050
0 SVC 0.816 0.046
1 KNeighborsClassifier 0.808 0.053
2 RandomForestClassifier 0.793 0.076
3 LogisticRegression 0.763 0.054
In [28]:
# Boxplot
fig = plt.figure(figsize=(10,10))
fig.suptitle('Vergleich der Algorithmen')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

Test ohne Oversampling

In [27]:
# Dataframe machen
model_results = pd.DataFrame(columns = ['model', 'cv_mean', 'cv_std'])

# Falls ich ein Boxplot machen will
results = []
names = []

# Liste der Algorithmen
models = [RandomForestClassifier(),
          LogisticRegression(),
          KNeighborsClassifier(),
          SVC(),
          GradientBoostingClassifier()]

# Parameter Cross Validation
kfold = StratifiedKFold(n_splits=10, random_state=22)
scorer = make_scorer(f1_score, greater_is_better=True, average = 'micro')

# Jedes Model in Cross Validation testen
for model in models:
    model_name = model.__class__.__name__
    cv_scores = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring=scorer, n_jobs = -1)
    
    # Falls ich ein Boxplot machen will
    results.append(cv_scores)
    names.append(model_name)
    
    # DataFrame mit Ergebnissen füllen
    model_results = model_results.append(pd.DataFrame({'model': model_name, 
                                                       'cv_mean': cv_scores.mean(), 
                                                       'cv_std': cv_scores.std()},
                                                        index = [0]), 
                                                        ignore_index = True)
    
    model_results = model_results.sort_values('cv_mean', ascending=False)


model_results
Out[27]:
model cv_mean cv_std
0 LogisticRegression 0.772 0.051
4 GradientBoostingClassifier 0.771 0.036
1 KNeighborsClassifier 0.762 0.049
2 SVC 0.759 0.046
3 RandomForestClassifier 0.748 0.053
In [ ]: