In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('Solarize_Light2')
import seaborn as sns
import altair as al

import os

pd.set_option('display.max_columns', None)
os.chdir('D:\Data\Projects\Klassifikation\mushrooms')
In [3]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

# ML 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
In [4]:
df = pd.read_csv('df_clean.csv')
In [5]:
x = df.drop('target', axis=1)
y = df.target
In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[6]:
((5686, 89), (2438, 89), (5686,), (2438,))
In [7]:
# Dataframe machen
model_results = pd.DataFrame(columns = ['model', 'cv_mean', 'cv_std'])

# Falls ich ein Boxplot machen will
results = []
names = []

# Liste der Algorithmen
models = [RandomForestClassifier(),
          LogisticRegression(),
          KNeighborsClassifier(),
          SVC(),
          GradientBoostingClassifier()]

# Parameter Cross Validation
kfold = StratifiedKFold(n_splits=3, random_state=42)
scorer = make_scorer(f1_score, greater_is_better=True, average = 'micro')

# Jedes Model in Cross Validation testen
for model in models:
    model_name = model.__class__.__name__
    cv_scores = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring=scorer, n_jobs = -1)
    
    # Falls ich ein Boxplot machen will
    results.append(cv_scores)
    names.append(model_name)
    
    # DataFrame mit Ergebnissen füllen
    model_results = model_results.append(pd.DataFrame({'model': model_name, 
                                                       'cv_mean': cv_scores.mean(), 
                                                       'cv_std': cv_scores.std()},
                                                        index = [0]), 
                                                        ignore_index = True)
    
    model_results = model_results.sort_values('cv_mean', ascending=False)


model_results
Out[7]:
model cv_mean cv_std
4 GradientBoostingClassifier 0.999472 0.000747
0 RandomForestClassifier 0.999297 0.000995
1 LogisticRegression 0.988042 0.003021
2 SVC 0.806893 0.003483
3 KNeighborsClassifier 0.794581 0.007642
In [8]:
# Boxplot machen aus results und names
fig = plt.figure(figsize=(15,15))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()