import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import os
# Switch to the project directory so relative paths such as 'df_clean.csv' resolve.
# A raw string is required here: the Windows path contains backslash sequences
# (\D, \P, \K, \T) that are invalid escapes in a regular string literal and
# raise a SyntaxWarning on recent Python versions.
os.chdir(r'D:\Data\Projects\Klassifikation\Telco Customer Churn')
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('Solarize_Light2')
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, roc_curve, classification_report, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
# ML
from sklearn.ensemble import RandomForestClassifier
# Load df_clean.csv and separate the predictors from the 'Churn' target column.
df = pd.read_csv('df_clean.csv')
y = df['Churn']
x = df.drop(columns='Churn')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four partitions (only displayed in an interactive session).
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Baseline: random forest with default hyperparameters, scored on the hold-out set.
# The seed is fixed so the baseline numbers are reproducible from run to run,
# matching the seeded train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Print the metrics explicitly: a bare expression is silently discarded when
# this file is executed as a script rather than line by line in a console.
print(f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# Hyperparameter search space for the random forest.
params = {
    'n_estimators': [10, 200, 500, 750],
    'max_depth': [None, 3, 5, 8],
    'max_features': [None, 2, 3],
}

# Score candidates with the default (binary) F1, i.e. the same metric this
# script reports on the test set. The previous average='micro' setting reduces
# to plain accuracy for a single-label target, so the search was optimising a
# different quantity than the one being evaluated.
scorer = make_scorer(f1_score)

# Seeded base estimator so that every candidate fit is reproducible.
estimator = RandomForestClassifier(random_state=42)
# Randomized search over `params`: each sampled combination is scored with
# `scorer` using 3-fold cross-validation, with jobs run in parallel (n_jobs=-1).
rs = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=params,
    scoring=scorer,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
rs.fit(X_train, y_train)
# The search is created with the default refit=True, so best_estimator_ has
# already been refit on the full training set with the winning parameters;
# fitting it a second time would only repeat that work.
model = rs.best_estimator_
print(model)

# Evaluate the tuned model on the hold-out set. The score is printed because a
# bare expression is discarded when the file is run as a script.
test_preds = model.predict(X_test)
print(f1_score(y_test, test_preds))