import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import os
# Work relative to the project data directory so the CSV files below can be
# opened by bare file name. Raw string: the Windows path contains backslashes
# that must not be interpreted as escape sequences.
os.chdir(r'D:\Data\Projects\Klassifikation\Heart Disease')
import plotly_express as px
import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
plt.rcParams['font.size'] = 15
from IPython.core.pylabtools import figsize
figsize(10, 10)
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, classification_report, accuracy_score
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
from sklearn.svm import SVC
# Load heart.csv and take a first look at its size and leading rows.
df = pd.read_csv('heart.csv')
df.shape
df.head(5)

# Separate the feature columns from the 'target' label column.
x = df.drop(columns='target')
y = df['target']
x.shape, y.shape

# Hold out 20 % of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
# Standardize the features: mean 0, standard deviation 1
# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)
# Baseline model: SVC with default hyperparameters. probability=True is
# required so that predict_proba is available for the ROC curve drawn later
# in the script.
svc = SVC(probability=True)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
y_pred_proba = svc.predict_proba(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy itself is
# symmetric, so the value is unchanged, but the call now matches the API and
# the other accuracy_score calls in this script.
accuracy_score(y_test, y_pred)
# Inspect the hyperparameters that the grid search below can vary.
svc.get_params()
# Candidate hyperparameter values: every combination of C, gamma and kernel
# is evaluated by the grid search.
parameters = dict(
    C=[1, 10, 100, 1000],
    gamma=[0.1, 0.001, 0.0001, 0.00001],
    kernel=['rbf', 'poly'],
)

# 10-fold cross-validated search scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the best cross-validation score and the parameters that achieved it.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')
# Test the model with the best parameters
# Predict on the held-out test split with the estimator selected by the grid
# search and measure its accuracy. Arguments follow the documented
# (y_true, y_pred) order.
y_pred_grid_search = grid_search.predict(X_test)
accuracy_score(y_test, y_pred_grid_search)
# --- Second experiment: SVC on the polynomial-feature dataset ---------------
df_poly = pd.read_csv('df_poly.csv')
df_poly.head()
x_p = df_poly.drop('target', axis=1)
# NOTE(review): the labels `y` still come from heart.csv; this assumes
# df_poly.csv holds the same rows in the same order -- confirm.
x_p.shape, y.shape

# The split below rebinds y_train / y_test and uses a different seed
# (random_state=4, not 42), so it selects different test rows than the first
# split. Keep a handle on the first split's test labels: the baseline
# probabilities in y_pred_proba were predicted for those rows and must be
# scored against them in the ROC comparison further down.
y_test_baseline = y_test
X_train_p, X_test_p, y_train, y_test = train_test_split(x_p, y, test_size=0.2, random_state=4)
X_train_p.shape, X_test_p.shape, y_train.shape, y_test.shape

# Re-fit the existing scaler on the polynomial training features. After this
# point `sc` no longer holds the statistics of the original feature set.
sc.fit(X_train_p)
X_train_p = sc.transform(X_train_p)
X_test_p = sc.transform(X_test_p)

# Default-parameter SVC on the polynomial features.
svc_p = SVC()
svc_p.fit(X_train_p, y_train)
y_p_pred = svc_p.predict(X_test_p)
accuracy_score(y_test, y_p_pred)

# SVC with hand-set hyperparameters (presumably taken from a grid search --
# confirm); probability=True so predict_proba can feed the ROC curve.
svc_p_ = SVC(C=100, gamma=0.0001, kernel='rbf', probability=True)
svc_p_.fit(X_train_p, y_train)
y_p_pred_ = svc_p_.predict(X_test_p)
accuracy_score(y_test, y_p_pred_)
# NOTE(review): this report is for the default-parameter predictions
# (y_p_pred), not the tuned model's y_p_pred_ -- confirm that is intended.
print(classification_report(y_test, y_p_pred))
y_p_proba = svc_p_.predict_proba(X_test_p)

# Best hyperparameters, polynomial dataset
fpr, tpr, thresholds = roc_curve(y_test, y_p_proba[:, 1], drop_intermediate=False)
# Baseline: score against the labels of the split it was predicted on. Using
# the rebound y_test here would pair the probabilities with unrelated rows.
fpr0, tpr0, thresholds0 = roc_curve(y_test_baseline, y_pred_proba[:, 1], drop_intermediate=False)

figsize(10, 7)
plt.rcParams['font.size'] = 20
plt.plot(fpr, tpr, label='Best Params')
plt.plot(fpr0, tpr0, label='Baseline')
# Diagonal reference line for a random classifier.
plt.plot(np.linspace(0, 1, 10), np.linspace(0, 1, 10), '--g', label="Random")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.title('ROC Curves Heart Disease')
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.grid(True)
plt.legend(loc="lower right")

# Inspect the support vectors of the tuned model.
svc_p_.support_vectors_
# Plotting only works with kernel = linear