In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('Solarize_Light2')
import seaborn as sns
import altair as al

import os

pd.set_option('display.max_columns', None)
os.chdir('D:\Data\Projects\Klassifikation\mushrooms')
In [3]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

from sklearn.ensemble import GradientBoostingClassifier

Feature set: the 20 most important features

In [4]:
# 70/30 hold-out split on the reduced feature set (top-20 features).
df = pd.read_csv('df_if.csv')
x = df.drop(columns='target')
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
# Sanity check: shapes of the four resulting partitions.
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[4]:
((5686, 20), (2438, 20), (5686,), (2438,))

Gradient Boosting

In [6]:
# Fit a gradient-boosting classifier on the top-20 feature set and report
# the F1 score on the hold-out split.
# random_state pins the stochastic parts of tree building (feature ordering /
# tie-breaking at splits), making the reported F1 reproducible across re-runs
# and consistent with the seed used for train_test_split.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Binary F1 with the default positive label (1 = poisonous/edible encoding).
f1_score(y_test, y_pred)
Out[6]:
0.9966015293118097

Feature set: all features

In [8]:
# Same 70/30 hold-out split, but on the full feature set (all 89 columns).
df1 = pd.read_csv('df_clean.csv')
x1 = df1.drop(columns='target')
y1 = df1['target']
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    x1, y1, test_size=0.3, random_state=42
)
# Sanity check: shapes of the four resulting partitions.
X_train1.shape, X_test1.shape, y_train1.shape, y_test1.shape
Out[8]:
((5686, 89), (2438, 89), (5686,), (2438,))
In [9]:
# Fit a gradient-boosting classifier on the full feature set and report
# the F1 score on the hold-out split.
# random_state pins the stochastic parts of tree building so the reported
# F1 is reproducible across re-runs (same seed as the split for consistency).
model1 = GradientBoostingClassifier(random_state=42)
model1.fit(X_train1, y_train1)
y_pred1 = model1.predict(X_test1)
# A perfect 1.0 here is plausible for the mushroom dataset (highly separable),
# but worth double-checking for target leakage in df_clean.csv.
f1_score(y_test1, y_pred1)
Out[9]:
1.0