In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np

import os
os.chdir('D:\Data\Projects\Klassifikation\Telco Customer Churn')

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('Solarize_Light2')
In [12]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, roc_curve, classification_report, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

# ML 
from sklearn.ensemble import RandomForestClassifier

Einlesen

In [3]:
df = pd.read_csv('df_clean.csv')

Aufteilen

In [4]:
x = df.drop('Churn', axis=1)
y = df.Churn
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[4]:
((5634, 24), (1409, 24), (5634,), (1409,))

RF Baseline

In [5]:
rf = RandomForestClassifier()

RF fitten

In [6]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
C:\Users\Leo\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)

RF Score

In [7]:
f1_score(y_test, y_pred)
Out[7]:
0.5055999999999999
In [8]:
confusion_matrix(y_test, y_pred)
Out[8]:
array([[942,  94],
       [215, 158]], dtype=int64)

Hyperparameter Test

In [20]:
# Hyperparameter Grid
params = {'n_estimators': [10, 200, 500, 750],
             'max_depth': [None, 3, 5, 8 ],
             'max_features': [None, 2, 3]}

scorer = make_scorer(f1_score, greater_is_better=True, average = 'micro')
estimator = RandomForestClassifier(random_state = 42)

# Random Search Model
rs = RandomizedSearchCV(estimator, params, n_jobs = -1,
                        scoring = scorer, cv = 3,verbose = 2, random_state=42)

Vorhersage mit besten Hyperparametern

In [21]:
rs.fit(X_train, y_train)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   14.5s finished
Out[21]:
RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None,
                                                    oob_score=False,
                                                    random_state=42, verbose=0,
                                                    warm_start=False),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'max_depth': [None, 3, 5, 8],
                                        'max_features': [None, 2, 3],
                                        'n_estimators': [10, 200, 500, 750]},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False,
                   scoring=make_scorer(f1_score, average=micro), verbose=2)
In [23]:
model = rs.best_estimator_
model
Out[23]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=750,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
In [24]:
model.fit(X_train, y_train)
Out[24]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=750,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
In [25]:
test_preds = model.predict(X_test)
In [26]:
f1_score(y_test, test_preds)
Out[26]:
0.5478547854785478