In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np

import os
os.chdir(r'D:\Data\Projects\Klassifikation\Klassifikation_West Nile Virus')

import plotly_express as px

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
from IPython.core.pylabtools import figsize

import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
In [2]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, classification_report, accuracy_score
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
C:\Users\Leo\Anaconda3\lib\site-packages\sklearn\externals\six.py:31: DeprecationWarning:

The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).

Reading In the Combined Dataset

In [3]:
df = pd.read_csv('combined.csv')
df.head()
Out[3]:
Latitude Longitude NumMosquitos WnvPresent dist_T900 dist_T115 dist_T138 NumMosq_3 Species_CULEX PIPIENS Species_CULEX PIPIENS/RESTUANS Species_CULEX RESTUANS Species_CULEX SALINARIUS Species_CULEX TARSALIS Species_CULEX TERRITANS Sunrise Sunset Depart Tmax Tmin Tavg DewPoint WetBulb Heat Cool PrecipTotal StnPressure SeaLevel ResultSpeed ResultDir AvgSpeed Rain_prev Hot_Days Rain_lag_3 Rain_lag_8 Rain_lag_15 Wind_lag_3 Wind_lag_8 Wind_lag_15 Daylen Spray Year Month Day
0 41.954690 -87.800991 1 0 7.736592 35.441519 31.031386 1 0 1 0 0 0 0 421 1917 10.0 88.0 62.5 75.0 58.5 65.5 0.0 10.5 0.0 29.415 30.1 5.8 17.0 6.95 0.0 0 0.645 0.0 0.0 7.5 8.0 15.95 1496 0 2007 5 149
1 41.954690 -87.800991 1 0 7.736592 35.441519 31.031386 1 0 0 1 0 0 0 421 1917 10.0 88.0 62.5 75.0 58.5 65.5 0.0 10.5 0.0 29.415 30.1 5.8 17.0 6.95 0.0 0 0.645 0.0 0.0 7.5 8.0 15.95 1496 0 2007 5 149
2 41.994991 -87.769279 1 0 10.279811 38.414514 33.517161 1 0 0 1 0 0 0 421 1917 10.0 88.0 62.5 75.0 58.5 65.5 0.0 10.5 0.0 29.415 30.1 5.8 17.0 6.95 0.0 0 0.645 0.0 0.0 7.5 8.0 15.95 1496 0 2007 5 149
3 41.974089 -87.824812 1 0 5.440165 38.279145 33.931386 1 0 1 0 0 0 0 421 1917 10.0 88.0 62.5 75.0 58.5 65.5 0.0 10.5 0.0 29.415 30.1 5.8 17.0 6.95 0.0 0 0.645 0.0 0.0 7.5 8.0 15.95 1496 0 2007 5 149
4 41.974089 -87.824812 4 0 5.440165 38.279145 33.931386 64 0 0 1 0 0 0 421 1917 10.0 88.0 62.5 75.0 58.5 65.5 0.0 10.5 0.0 29.415 30.1 5.8 17.0 6.95 0.0 0 0.645 0.0 0.0 7.5 8.0 15.95 1496 0 2007 5 149

Splitting the Dataset into Train and Test

In [4]:
x = df.drop('WnvPresent', axis=1)
y = df.WnvPresent
In [5]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[5]:
((8404, 42), (2102, 42), (8404,), (2102,))

Resampling and Scaling

In [6]:
sm = SMOTE()

X_train_s, y_train_s = sm.fit_sample(X_train, y_train)

print(y_train.value_counts())
print(np.bincount(y_train_s))
0    7963
1     441
Name: WnvPresent, dtype: int64
[7963 7963]
In [7]:
sc = StandardScaler()
sc.fit(X_train_s)
X_train_s = sc.transform(X_train_s)
X_tests = sc.transform(X_test)

Baseline

Predicting the majority class as a baseline: if the model always predicts 0, accuracy is already almost 95%.

In [8]:
1-((df.WnvPresent).mean())
Out[8]:
0.9475537787930707
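
The same majority-class baseline can also be written as a scikit-learn estimator; a minimal sketch using DummyClassifier (not part of the original workflow, shown only for comparison):

from sklearn.dummy import DummyClassifier

# Always predicts the most frequent training class, here 0 (no WNV)
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
dummy.score(X_test, y_test)  # accuracy of the trivial baseline on the test split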

Random Forest Classifier Baseline, Resampled and Scaled

In [9]:
rf = RandomForestClassifier(random_state=42)
In [10]:
rf.fit(X_train_s, y_train_s)
y_pred = rf.predict(X_tests)
y_pred_proba = rf.predict_proba(X_tests)
In [11]:
accuracy_score(y_test, y_pred)
Out[11]:
0.9333967649857279
In [12]:
confusion_matrix(y_test, y_pred)
Out[12]:
array([[1940,   52],
       [  88,   22]], dtype=int64)

Random Forest Classifier Baseline, Original Dataset

In [13]:
rf.fit(X_train, y_train)
y_pred1 = rf.predict(X_test)
y_pred_proba1 = rf.predict_proba(X_test)
In [14]:
accuracy_score(y_test, y_pred1)
Out[14]:
0.9462416745956232
In [15]:
confusion_matrix(y_test, y_pred1)
Out[15]:
array([[1974,   18],
       [  95,   15]], dtype=int64)

This result was to be expected: because of how it learns, an ensemble of decision trees generally needs neither oversampling nor feature scaling. From here on, the original (unresampled, unscaled) dataset is used.
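
If the class imbalance should still be addressed without resampling, the forest itself offers a lever in its class_weight parameter; a hedged sketch (only an illustration, not the model tuned below):

# Weight the rare positive class more heavily instead of oversampling it
rf_weighted = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_weighted.fit(X_train, y_train)
confusion_matrix(y_test, rf_weighted.predict(X_test))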

Hyperparameter Tuning

In [16]:
# Hyperparameters of the random forest classifier
rf.get_params()
Out[16]:
{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}
In [17]:
# Build the hyperparameter grid
param_grid = {
    'n_estimators': np.linspace(100, 150).astype(int),
    'max_depth': [None] + list(np.linspace(5, 30).astype(int)),
    'max_features': ['auto', 'sqrt', None] + list(np.arange(0.5, 1, 0.1)),
    'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
    'min_samples_split': [2, 5, 10]
            }

# Random Forest 
estimator = RandomForestClassifier(random_state = 42)

# The random search model
random = RandomizedSearchCV(estimator, param_grid, n_jobs = -1, 
                        scoring = 'neg_mean_absolute_error', cv = 3, 
                        n_iter = 100, verbose = 0, random_state=42)
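
The search above is scored with 'neg_mean_absolute_error'; for a classifier, a ranking metric such as ROC AUC would be an equally valid choice. A hedged variant of the same setup (not the configuration actually run below):

# Same search space, scored with ROC AUC instead of MAE
random_auc = RandomizedSearchCV(estimator, param_grid, n_jobs=-1,
                                scoring='roc_auc', cv=3,
                                n_iter=100, verbose=0, random_state=42)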
In [18]:
#print(random)
In [19]:
random.fit(X_train, y_train)
Out[19]:
RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None,
                                                    oob_sc...
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 150])},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring='neg_mean_absolute_error',
                   verbose=0)
In [20]:
model = random.best_estimator_
In [21]:
#model
In [22]:
model.n_jobs = -1
model.fit(X_train, y_train)
Out[22]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features=0.5, max_leaf_nodes=24,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=104,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
In [23]:
test_preds = model.predict(X_test)
pred_proba = model.predict_proba(X_test)[:,1]
In [24]:
pred_proba
Out[24]:
array([0.01123034, 0.3738724 , 0.01504507, ..., 0.02188455, 0.03160822,
       0.06466828])
In [25]:
accuracy_score(y_test, test_preds)
Out[25]:
0.9495718363463368
In [33]:
confusion_matrix(y_test, test_preds)
Out[33]:
array([[1991,    1],
       [ 105,    5]], dtype=int64)

A touch better than the baseline.
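
With only about 5% positive cases, accuracy alone says little. The metrics already imported above can be applied to the tuned model; a minimal sketch using the predictions test_preds and pred_proba computed above:

# Threshold-independent ranking quality plus per-class precision/recall
print(roc_auc_score(y_test, pred_proba))
print(classification_report(y_test, test_preds))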

Test with a Lower Threshold

In [26]:
from sklearn.preprocessing import binarize
In [27]:
# Threshold of 0.4
y_pred_ = model.predict_proba(X_test)
In [28]:
y_pred_class = binarize(y_pred_, 0.4)[:,1]
In [30]:
accuracy_score(y_test, y_pred_class)
Out[30]:
0.9476688867745005
In [32]:
confusion_matrix(y_test, y_pred_class)
Out[32]:
array([[1981,   11],
       [  99,   11]], dtype=int64)

It can make sense to lower the threshold so that positive cases are detected at all.
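
How far the threshold is worth lowering can be explored by sweeping a few candidate values; a small sketch on the positive-class probabilities pred_proba computed above (the candidate thresholds are arbitrary):

# Trade-off between missed positives and false alarms at several cut-offs
for t in [0.3, 0.35, 0.4, 0.45, 0.5]:
    preds_t = (pred_proba >= t).astype(int)
    print(t, confusion_matrix(y_test, preds_t).ravel())  # tn, fp, fn, tp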

Test with a Higher Threshold

In [34]:
from sklearn.preprocessing import binarize
In [35]:
# Threshold of 0.6
y_pred_ = model.predict_proba(X_test)
In [36]:
y_pred_class = binarize(y_pred_, 0.6)[:,1]
In [37]:
accuracy_score(y_test, y_pred_class)
Out[37]:
0.9481446241674596
In [38]:
confusion_matrix(y_test, y_pred_class)
Out[38]:
array([[1992,    0],
       [ 109,    1]], dtype=int64)

Feature Importances

In [42]:
fi = pd.Series(data=model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

plt.rcParams['font.size'] = 15
plt.figure(figsize=(20,18))
plt.title("Feature Importances", fontsize = 30)
plt.ylabel('Variable', fontsize = 25); 
plt.xlabel('Importance', fontsize = 25); 

sns.barplot(y=fi.index, x=fi.values, palette="Blues_d", orient='h');