In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np

import os
os.chdir(r'D:\Data\Projects\Klassifikation\Klassifikation_West Nile Virus')

import plotly_express as px

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
from IPython.core.pylabtools import figsize

import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
In [2]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, classification_report, accuracy_score
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
C:\Users\Leo\Anaconda3\lib\site-packages\sklearn\externals\six.py:31: DeprecationWarning:

The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).

Reading In the Combined Dataset

In [3]:
df = pd.read_csv('combined.csv')
df.head()
Out[3]:
Latitude Longitude NumMosquitos WnvPresent dist_T900 dist_T115 dist_T138 NumMosq_3 Species_CULEX PIPIENS Species_CULEX PIPIENS/RESTUANS Species_CULEX RESTUANS Species_CULEX SALINARIUS Species_CULEX TARSALIS Species_CULEX TERRITANS Sunrise Sunset Depart Tmax Tmin Tavg DewPoint WetBulb Heat Cool PrecipTotal StnPressure SeaLevel ResultSpeed ResultDir AvgSpeed Rain_prev Hot_Days Rain_lag_3 Rain_lag_8 Rain_lag_15 Wind_lag_3 Wind_lag_8 Wind_lag_15 Daylen Spray Year Month Day
0 41.954690 -87.800991 1 0 7.736592 35.441519 31.031386 1 0 1 0 0 0 0 421 1917 10.0 88.0 62.5 75.0 58.5 65.5 0.0 10.5 0.0 29.415 30.1 5.8 17.0 6.95 0.0 0 0.645 0.0 0.0 7.5 8.0 15.95 1496 0 2007 5 149
1 41.954690 -87.800991 1 0 7.736592 35.441519 31.031386 1 0 0 1 0 0 0 421 1917 10.0 88.0 62.5 75.0 58.5 65.5 0.0 10.5 0.0 29.415 30.1 5.8 17.0 6.95 0.0 0 0.645 0.0 0.0 7.5 8.0 15.95 1496 0 2007 5 149
2 41.994991 -87.769279 1 0 10.279811 38.414514 33.517161 1 0 0 1 0 0 0 421 1917 10.0 88.0 62.5 75.0 58.5 65.5 0.0 10.5 0.0 29.415 30.1 5.8 17.0 6.95 0.0 0 0.645 0.0 0.0 7.5 8.0 15.95 1496 0 2007 5 149
3 41.974089 -87.824812 1 0 5.440165 38.279145 33.931386 1 0 1 0 0 0 0 421 1917 10.0 88.0 62.5 75.0 58.5 65.5 0.0 10.5 0.0 29.415 30.1 5.8 17.0 6.95 0.0 0 0.645 0.0 0.0 7.5 8.0 15.95 1496 0 2007 5 149
4 41.974089 -87.824812 4 0 5.440165 38.279145 33.931386 64 0 0 1 0 0 0 421 1917 10.0 88.0 62.5 75.0 58.5 65.5 0.0 10.5 0.0 29.415 30.1 5.8 17.0 6.95 0.0 0 0.645 0.0 0.0 7.5 8.0 15.95 1496 0 2007 5 149

Splitting the Dataset into Train and Test

In [4]:
x = df.drop('WnvPresent', axis=1)
y = df.WnvPresent
In [5]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[5]:
((8404, 42), (2102, 42), (8404,), (2102,))

Resampling and Scaling

In [6]:
sm = SMOTE()

X_train_s, y_train_s = sm.fit_sample(X_train, y_train)

print(y_train.value_counts())
print(np.bincount(y_train_s))
0    7963
1     441
Name: WnvPresent, dtype: int64
[7963 7963]
In [7]:
sc = StandardScaler()
sc.fit(X_train_s)
X_train_s = sc.transform(X_train_s)
X_tests = sc.transform(X_test)

Baseline

Predicting the majority class as a baseline: if the model always predicts 0, accuracy is already almost 95%.

In [8]:
1-((df.WnvPresent).mean())
Out[8]:
0.9475537787930707
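
The same majority-class baseline can also be written as a scikit-learn estimator; a minimal sketch using DummyClassifier (not part of the original workflow, shown only for comparison):

from sklearn.dummy import DummyClassifier

# Always predicts the most frequent training class, here 0 (no WNV)
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
dummy.score(X_test, y_test)  # accuracy of the trivial baseline on the test split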

Random Forest Classifier Baseline, Resampled and Scaled

In [9]:
rf = RandomForestClassifier(random_state=42)
In [10]:
rf.fit(X_train_s, y_train_s)
y_pred = rf.predict(X_tests)
y_pred_proba = rf.predict_proba(X_tests)
In [11]:
accuracy_score(y_test, y_pred)
Out[11]:
0.9333967649857279
In [12]:
confusion_matrix(y_test, y_pred)
Out[12]:
array([[1940,   52],
       [  88,   22]], dtype=int64)

Random Forest Classifier Baseline, Original Dataset

In [13]:
rf.fit(X_train, y_train)
y_pred1 = rf.predict(X_test)
y_pred_proba1 = rf.predict_proba(X_test)
In [14]:
accuracy_score(y_test, y_pred1)
Out[14]:
0.9462416745956232
In [15]:
confusion_matrix(y_test, y_pred1)
Out[15]:
array([[1974,   18],
       [  95,   15]], dtype=int64)

This result was to be expected: because of how it learns, an ensemble of decision trees generally needs neither oversampling nor feature scaling. From here on, the original (unresampled, unscaled) dataset is used.
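
If the class imbalance should still be addressed without resampling, the forest itself offers a lever in its class_weight parameter; a hedged sketch (only an illustration, not the model tuned below):

# Weight the rare positive class more heavily instead of oversampling it
rf_weighted = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_weighted.fit(X_train, y_train)
confusion_matrix(y_test, rf_weighted.predict(X_test))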

Hyperparameter Tuning

In [16]:
# Hyperparameters of the random forest classifier
rf.get_params()
Out[16]:
{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}
In [17]:
# Build the hyperparameter grid
param_grid = {
    'n_estimators': np.linspace(100, 150).astype(int),
    'max_depth': [None] + list(np.linspace(5, 30).astype(int)),
    'max_features': ['auto', 'sqrt', None] + list(np.arange(0.5, 1, 0.1)),
    'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
    'min_samples_split': [2, 5, 10]
            }

# Random Forest 
estimator = RandomForestClassifier(random_state = 42)

# The random search model
random = RandomizedSearchCV(estimator, param_grid, n_jobs = -1, 
                        scoring = 'neg_mean_absolute_error', cv = 3, 
                        n_iter = 100, verbose = 0, random_state=42)
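
The search above is scored with 'neg_mean_absolute_error'; for a classifier, a ranking metric such as ROC AUC would be an equally valid choice. A hedged variant of the same setup (not the configuration actually run below):

# Same search space, scored with ROC AUC instead of MAE
random_auc = RandomizedSearchCV(estimator, param_grid, n_jobs=-1,
                                scoring='roc_auc', cv=3,
                                n_iter=100, verbose=0, random_state=42)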
In [18]:
#print(random)
In [19]:
random.fit(X_train, y_train)
Out[19]:
RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None,
                                                    oob_sc...
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 150])},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring='neg_mean_absolute_error',
                   verbose=0)
In [20]:
model = random.best_estimator_
In [21]:
#model
In [22]:
model.n_jobs = -1
model.fit(X_train, y_train)
Out[22]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features=0.5, max_leaf_nodes=24,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=104,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
In [23]:
test_preds = model.predict(X_test)
pred_proba = model.predict_proba(X_test)[:,1]
In [24]:
pred_proba
Out[24]:
array([0.01123034, 0.3738724 , 0.01504507, ..., 0.02188455, 0.03160822,
       0.06466828])
In [25]:
accuracy_score(y_test, test_preds)
Out[25]:
0.9495718363463368
In [33]:
confusion_matrix(y_test, test_preds)
Out[33]:
array([[1991,    1],
       [ 105,    5]], dtype=int64)

A touch better than the baseline.
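
With only about 5% positive cases, accuracy alone says little. The metrics already imported above can be applied to the tuned model; a minimal sketch using the predictions test_preds and pred_proba computed above:

# Threshold-independent ranking quality plus per-class precision/recall
print(roc_auc_score(y_test, pred_proba))
print(classification_report(y_test, test_preds))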

Test with a Lower Threshold

In [26]:
from sklearn.preprocessing import binarize
In [27]:
# Threshold of 0.4
y_pred_ = model.predict_proba(X_test)
In [28]:
y_pred_class = binarize(y_pred_, 0.4)[:,1]
In [30]:
accuracy_score(y_test, y_pred_class)
Out[30]:
0.9476688867745005
In [32]:
confusion_matrix(y_test, y_pred_class)
Out[32]:
array([[1981,   11],
       [  99,   11]], dtype=int64)

It can make sense to lower the threshold so that positive cases are detected at all.
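
How far the threshold is worth lowering can be explored by sweeping a few candidate values; a small sketch on the positive-class probabilities pred_proba computed above (the candidate thresholds are arbitrary):

# Trade-off between missed positives and false alarms at several cut-offs
for t in [0.3, 0.35, 0.4, 0.45, 0.5]:
    preds_t = (pred_proba >= t).astype(int)
    print(t, confusion_matrix(y_test, preds_t).ravel())  # tn, fp, fn, tp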

Test with a Higher Threshold

In [34]:
from sklearn.preprocessing import binarize
In [35]:
# Threshold of 0.6
y_pred_ = model.predict_proba(X_test)
In [36]:
y_pred_class = binarize(y_pred_, 0.6)[:,1]
In [37]:
accuracy_score(y_test, y_pred_class)
Out[37]:
0.9481446241674596
In [38]:
confusion_matrix(y_test, y_pred_class)
Out[38]:
array([[1992,    0],
       [ 109,    1]], dtype=int64)

Feature Importances

In [42]:
fi = pd.Series(data=model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

plt.rcParams['font.size'] = 15
plt.figure(figsize=(20,18))
plt.title("Feature Importances", fontsize = 30)
plt.ylabel('Variable', fontsize = 25); 
plt.xlabel('Importance', fontsize = 25); 

sns.barplot(y=fi.index, x=fi.values, palette="Blues_d", orient='h');