import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import os
os.chdir(r'D:\Data\Projects\Klassifikation\Klassifikation_West Nile Virus')
import plotly_express as px
import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
from IPython.core.pylabtools import figsize
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, classification_report, accuracy_score
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
df = pd.read_csv('combined.csv')
df.head()
x = df.drop('WnvPresent', axis=1)
y = df.WnvPresent
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
sm = SMOTE()
X_train_s, y_train_s = sm.fit_resample(X_train, y_train)
print(y_train.value_counts())
print(np.bincount(y_train_s))
sc = StandardScaler()
sc.fit(X_train_s)
X_train_s = sc.transform(X_train_s)
X_tests = sc.transform(X_test)
Baseline: predict the more frequent class. If 0 is always predicted, the accuracy is already almost 95%.
1-((df.WnvPresent).mean())
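As a cross-check, the same baseline can be reproduced with scikit-learn's DummyClassifier (a minimal sketch using the split defined above):
from sklearn.dummy import DummyClassifier
# Majority-class model as an explicit baseline; its accuracy should match the ~95% computed above
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
dummy.score(X_test, y_test)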
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_s, y_train_s)
y_pred = rf.predict(X_tests)
y_pred_proba = rf.predict_proba(X_tests)
accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)
rf.fit(X_train, y_train)
y_pred1 = rf.predict(X_test)
y_pred_proba1 = rf.predict_proba(X_test)
accuracy_score(y_test, y_pred1)
confusion_matrix(y_test, y_pred1)
This result was to be expected: because of the way an ensemble of decision trees learns, it generally needs neither oversampling nor feature scaling. From here on, the original dataset is therefore used.
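If the class imbalance should still be addressed inside the model rather than via SMOTE, reweighting the classes is a lightweight alternative (a sketch, not part of the original analysis):
# Sketch: let the forest reweight the minority class instead of oversampling it
rf_weighted = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_weighted.fit(X_train, y_train)
confusion_matrix(y_test, rf_weighted.predict(X_test))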
# Hyperparameters of the random forest classifier
rf.get_params()
# Create the hyperparameter grid
param_grid = {
'n_estimators': np.linspace(100, 150).astype(int),
'max_depth': [None] + list(np.linspace(5, 30).astype(int)),
'max_features': ['sqrt', None] + list(np.arange(0.5, 1, 0.1)),  # 'auto' removed in newer scikit-learn; it was an alias for 'sqrt'
'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
'min_samples_split': [2, 5, 10]
}
# Random Forest
estimator = RandomForestClassifier(random_state = 42)
# The random search model
random = RandomizedSearchCV(estimator, param_grid, n_jobs=-1,
                            scoring='roc_auc',  # classification metric suited to the imbalanced target ('neg_mean_absolute_error' is meant for regression)
                            cv=3, n_iter=100, verbose=0, random_state=42)
#print(random)
random.fit(X_train, y_train)
model = random.best_estimator_
#model
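The hyperparameter combination the search settled on, together with its cross-validated score, can be read directly off the fitted search object:
# Best hyperparameter combination found by the randomized search and its mean CV score
print(random.best_params_)
print(random.best_score_)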
model.n_jobs = -1
model.fit(X_train, y_train)
test_preds = model.predict(X_test)
pred_proba = model.predict_proba(X_test)[:,1]
pred_proba
accuracy_score(y_test, test_preds)
confusion_matrix(y_test, test_preds)
A touch better than the baseline.
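Since accuracy is misleading at a roughly 95/5 class split, the already imported ROC AUC and classification report give a more honest picture of the tuned model:
# Accuracy alone says little here; ROC AUC and per-class precision/recall are more informative
print(roc_auc_score(y_test, pred_proba))
print(classification_report(y_test, test_preds))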
from sklearn.preprocessing import binarize
# Threshold of 0.4
y_pred_ = model.predict_proba(X_test)
y_pred_class = binarize(y_pred_, threshold=0.4)[:, 1]
accuracy_score(y_test, y_pred_class)
confusion_matrix(y_test, y_pred_class)
It can make sense to lower the threshold in order to catch any positive cases at all.
# Threshold of 0.6
y_pred_ = model.predict_proba(X_test)
y_pred_class = binarize(y_pred_, threshold=0.6)[:, 1]
accuracy_score(y_test, y_pred_class)
confusion_matrix(y_test, y_pred_class)
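To choose a cutoff more systematically, the precision/recall trade-off can be swept over all thresholds (a sketch using sklearn's precision_recall_curve; the right cutoff depends on how costly a missed positive case is):
from sklearn.metrics import precision_recall_curve
# Precision and recall for every possible probability threshold of the tuned model
precision, recall, thresholds = precision_recall_curve(y_test, pred_proba)
plt.figure(figsize=(10, 6))
plt.plot(thresholds, precision[:-1], label='Precision')
plt.plot(thresholds, recall[:-1], label='Recall')
plt.xlabel('Threshold')
plt.legend();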
fi = pd.Series(data=model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
plt.rcParams['font.size'] = 15
plt.figure(figsize=(20,18))
plt.title("Feature Importances", fontsize = 30)
plt.ylabel('Variable', fontsize = 25);
plt.xlabel('Importance', fontsize = 25);
sns.barplot(y=fi.index, x=fi.values, palette="Blues_d", orient='h');
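Impurity-based importances can favour high-cardinality features, so permutation importance on the test set is a useful cross-check (a sketch assuming scikit-learn >= 0.22):
from sklearn.inspection import permutation_importance
# Mean drop in score when each feature is shuffled on the held-out test set
perm = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
pd.Series(perm.importances_mean, index=X_test.columns).sort_values(ascending=False).head(10)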