import pandas as pd
# Notebook display settings for pandas: show up to 150 columns and 1000 rows,
# truncate cell text at 50 characters, and print floats with three decimals.
# Option keys are spelled out in full; abbreviated keys such as
# 'display.max_row' only work through pandas' pattern matching and may stop
# resolving once a similarly named option is added.
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.float_format', '{:.3f}'.format)
import numpy as np
import collections
import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
import seaborn as sns
import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
# Initialise plotly's offline notebook mode so figures render inline.
init_notebook_mode(connected=True)
from warnings import filterwarnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings.
filterwarnings('ignore')
import os
# Work from the project directory so relative data paths resolve.
# Raw string: the backslashes of a Windows path must not be parsed as escape
# sequences ('\D', '\P', '\K' are invalid escapes and warn on modern Python).
os.chdir(r'D:\Data\Projects\Klassifikation\Diabetes')
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
# ML
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Read the dataset from the working directory and take a first look at it.
df = pd.read_csv('diabetes_clean.csv')
print(df.shape)
df.head()  # notebook cell output; has no effect when run as a script

# Separate the target column from the predictor columns.
labels = df['Outcome']
train = df.drop(columns='Outcome')

# Hold out 25% of the rows as a test set; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    labels,
    test_size=0.25,
    random_state=42,
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape  # notebook cell output
# Oversample the training split with SMOTE; the test split is left untouched.
# `fit_resample` replaces `fit_sample`, which has been removed from
# imbalanced-learn.
sm = SMOTE(random_state=12)
x_trains, y_trains = sm.fit_resample(x_train, y_train)
# Class counts before and after oversampling.
print(y_train.value_counts())
print(collections.Counter(y_trains))
# Standardise the oversampled training set: the scaler is fitted on the
# oversampled training data only and then applied to the test data.
sc = StandardScaler()
X_trains = sc.fit_transform(x_trains)
X_tests = sc.transform(x_test)

# Same procedure for the original (non-oversampled) training set, using a
# fresh scaler; `sc` refers to this second scaler from here on.
sc = StandardScaler()
X_train = sc.fit_transform(x_train)
X_test = sc.transform(x_test)
# Baseline comparison: 10-fold stratified cross-validation of several
# classifiers (library defaults) on the oversampled, scaled training set.
# NOTE(review): X_trains was oversampled and scaled before the folds are cut,
# so validation folds are not fully independent of the training folds and the
# scores are probably optimistic - consider resampling/scaling inside a
# pipeline per fold.

# Per-fold scores and the matching model names, kept for the boxplot.
results = []
names = []

# Algorithms to compare.
models = [
    RandomForestClassifier(),
    LogisticRegression(),
    KNeighborsClassifier(),
    SVC(),
    GradientBoostingClassifier(),
]

# Cross-validation parameters. `shuffle=True` is required for `random_state`
# to have any effect; current scikit-learn raises a ValueError when a seed is
# passed without shuffling. Fold composition therefore differs from runs made
# with the old, unshuffled splitter.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=22)
scorer = make_scorer(f1_score, greater_is_better=True, average='micro')

# Evaluate every model and collect one summary row per model. The rows are
# gathered in a list and turned into a DataFrame once, because
# `DataFrame.append` no longer exists in pandas 2.x.
summary_rows = []
for model in models:
    model_name = model.__class__.__name__
    cv_scores = model_selection.cross_val_score(
        model, X_trains, y_trains, cv=kfold, scoring=scorer, n_jobs=-1
    )
    results.append(cv_scores)
    names.append(model_name)
    summary_rows.append({
        'model': model_name,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
    })

# Result table, best mean score first.
model_results = pd.DataFrame(summary_rows, columns=['model', 'cv_mean', 'cv_std'])
model_results = model_results.sort_values('cv_mean', ascending=False)
model_results  # notebook cell output
# Boxplot of the per-fold scores, one box per algorithm.
fig, ax = plt.subplots(figsize=(10, 10))
fig.suptitle('Vergleich der Algorithmen')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Same comparison as before, but on the original (non-oversampled) scaled
# training set: 10-fold stratified cross-validation of several classifiers
# with library defaults.

# Per-fold scores and the matching model names, kept for an optional boxplot.
results = []
names = []

# Algorithms to compare.
models = [
    RandomForestClassifier(),
    LogisticRegression(),
    KNeighborsClassifier(),
    SVC(),
    GradientBoostingClassifier(),
]

# Cross-validation parameters. `shuffle=True` is required for `random_state`
# to have any effect; current scikit-learn raises a ValueError when a seed is
# passed without shuffling. Fold composition therefore differs from runs made
# with the old, unshuffled splitter.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=22)
scorer = make_scorer(f1_score, greater_is_better=True, average='micro')

# Evaluate every model and collect one summary row per model. The rows are
# gathered in a list and turned into a DataFrame once, because
# `DataFrame.append` no longer exists in pandas 2.x.
summary_rows = []
for model in models:
    model_name = model.__class__.__name__
    cv_scores = model_selection.cross_val_score(
        model, X_train, y_train, cv=kfold, scoring=scorer, n_jobs=-1
    )
    results.append(cv_scores)
    names.append(model_name)
    summary_rows.append({
        'model': model_name,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
    })

# Result table, best mean score first.
model_results = pd.DataFrame(summary_rows, columns=['model', 'cv_mean', 'cv_std'])
model_results = model_results.sort_values('cv_mean', ascending=False)
model_results  # notebook cell output