In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np

import os
os.chdir(r'D:\Data\Projects\Klassifikation\Heart Disease')  # raw string so the backslashes are not treated as escapes

import plotly_express as px

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
plt.rcParams['font.size'] = 15
from IPython.core.pylabtools import figsize
figsize(10, 10)

import seaborn as sns

from warnings import filterwarnings
filterwarnings('ignore')
In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, classification_report, accuracy_score
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
from sklearn.svm import SVC
In [3]:
df = pd.read_csv('heart.csv')
df.shape
Out[3]:
(303, 14)
In [4]:
df.head(5)
Out[4]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
0 63 1 3 145 233 1 0 150 0 2.3 0 0 1 1
1 37 1 2 130 250 0 1 187 0 3.5 0 0 2 1
2 41 0 1 130 204 0 0 172 0 1.4 2 0 2 1
3 56 1 1 120 236 0 1 178 0 0.8 2 0 2 1
4 57 0 0 120 354 0 1 163 1 0.6 2 0 2 1
In [5]:
x = df.drop('target', axis=1)
y = df.target
x.shape, y.shape
Out[5]:
((303, 13), (303,))
In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[6]:
((242, 13), (61, 13), (242,), (61,))
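
A side note: a plain random split can let the class ratio drift between train and test set. A stratified split (a sketch of the alternative call, same variables as above; the notebook itself uses the plain split) keeps the ratio fixed:

# Alternative: stratified split that preserves the target class ratio
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y)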

Standardizing the data with StandardScaler

Mean of 0, standard deviation of 1

In [11]:
sc = StandardScaler()
sc.fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
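
A quick sanity check (sketch): after the transform, every column of the now-NumPy training matrix should have a mean of roughly 0 and a standard deviation of roughly 1.

# Verify the standardization column-wise
print(np.round(X_train.mean(axis=0), 6))   # ~0 everywhere
print(np.round(X_train.std(axis=0), 6))    # ~1 everywhere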

Baseline Support Vector Machine classifier

In [74]:
svc = SVC(probability=True)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
y_pred_proba = svc.predict_proba(X_test)
In [75]:
accuracy_score(y_test, y_pred)  # sklearn convention: y_true first (accuracy is symmetric, so the value is unchanged)
Out[75]:
0.5409836065573771

GridSearch for the best hyperparameters

  • C: penalty parameter of the error term. A small value produces a hyperplane with a wide, soft margin; a large value enforces a narrow margin that tolerates fewer misclassifications.
  • kernel:
    linear
    polynomial
    radial basis function: can handle data that is not linearly separable
    sigmoid
  • gamma: a smaller value fits the data loosely, a higher value of gamma fits it tightly and risks overfitting (see the sketch after this list)
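
To make the effect of C and gamma concrete, a small illustrative sketch (assuming the standardized X_train/y_train from above) that cross-validates a few combinations individually:

from sklearn.model_selection import cross_val_score

# A very large gamma memorizes the training folds, a very small one
# underfits; C trades margin width against training errors.
for C, gamma in [(1, 10), (1, 0.001), (100, 0.0001)]:
    scores = cross_val_score(SVC(C=C, gamma=gamma, kernel='rbf'),
                             X_train, y_train, cv=5)
    print(C, gamma, round(scores.mean(), 3))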
In [12]:
svc.get_params()
Out[12]:
{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto_deprecated',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}
In [18]:
parameters = {'C': [1, 10, 100, 1000],
              'gamma': [0.1, 0.001, 0.0001, 0.00001],
              'kernel': ['rbf', 'poly']}
In [19]:
grid_search = GridSearchCV(svc, parameters, cv=10, n_jobs=-1, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

print(grid_search.best_score_)
print(grid_search.best_params_)
Fitting 10 folds for each of 32 candidates, totalling 320 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
0.8140495867768595
{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    2.5s finished

Testing the model with the best parameters

In [20]:
y_pred_grid_search = grid_search.predict(X_test)
In [21]:
accuracy_score(y_test, y_pred_grid_search)
Out[21]:
0.8852459016393442
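
Beyond best_score_ and best_params_, the fitted GridSearchCV object exposes all candidate results via its cv_results_ attribute; a sketch for inspecting the top combinations:

# All 32 candidates, sorted by mean cross-validation accuracy
results = pd.DataFrame(grid_search.cv_results_)
cols = ['param_C', 'param_gamma', 'param_kernel', 'mean_test_score']
print(results[cols].sort_values('mean_test_score', ascending=False).head())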

Polynomial Features und Interaction Features

In [22]:
df_poly = pd.read_csv('df_poly.csv')
df_poly.head()
Out[22]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target 1 cp^2 cp thalach cp ca cp oldpeak cp thal cp age thalach^2 thalach ca thalach oldpeak thalach thal ... thalach oldpeak thal thalach oldpeak age thalach thal^2 thalach thal age thalach age^2 ca^3 ca^2 oldpeak ca^2 thal ca^2 age ca oldpeak^2 ca oldpeak thal ca oldpeak age ca thal^2 ca thal age ca age^2 oldpeak^3 oldpeak^2 thal oldpeak^2 age oldpeak thal^2 oldpeak thal age oldpeak age^2 thal^3 thal^2 age thal age^2 age^3
0 63 1 3 145 233 1 0 150 0 2.3 0 0 1 1 1.0 9.0 450.0 0.0 6.9 3.0 189.0 22500.0 0.0 345.0 150.0 ... 345.0 21735.0 150.0 9450.0 595350.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12.167 5.29 333.27 2.3 144.9 9128.7 1.0 63.0 3969.0 250047.0
1 37 1 2 130 250 0 1 187 0 3.5 0 0 2 1 1.0 4.0 374.0 0.0 7.0 4.0 74.0 34969.0 0.0 654.5 374.0 ... 1309.0 24216.5 748.0 13838.0 256003.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 42.875 24.50 453.25 14.0 259.0 4791.5 8.0 148.0 2738.0 50653.0
2 41 0 1 130 204 0 0 172 0 1.4 2 0 2 1 1.0 1.0 172.0 0.0 1.4 2.0 41.0 29584.0 0.0 240.8 344.0 ... 481.6 9872.8 688.0 14104.0 289132.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.744 3.92 80.36 5.6 114.8 2353.4 8.0 164.0 3362.0 68921.0
3 56 1 1 120 236 0 1 178 0 0.8 2 0 2 1 1.0 1.0 178.0 0.0 0.8 2.0 56.0 31684.0 0.0 142.4 356.0 ... 284.8 7974.4 712.0 19936.0 558208.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.512 1.28 35.84 3.2 89.6 2508.8 8.0 224.0 6272.0 175616.0
4 57 0 0 120 354 0 1 163 1 0.6 2 0 2 1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 26569.0 0.0 97.8 326.0 ... 195.6 5574.6 652.0 18582.0 529587.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.216 0.72 20.52 2.4 68.4 1949.4 8.0 228.0 6498.0 185193.0

5 rows × 92 columns
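
The engineered features are loaded from a precomputed CSV. Judging by the visible column names (1, cp^2, cp thalach, ..., age^3), they look like degree-3 polynomial and interaction terms of six of the original columns; a hedged sketch of how such a frame could be built (the exact column subset is an assumption inferred from the names):

from sklearn.preprocessing import PolynomialFeatures

# Assumption: degree-3 terms of these six columns, matching df_poly.head()
poly_cols = ['cp', 'thalach', 'ca', 'oldpeak', 'thal', 'age']
poly = PolynomialFeatures(degree=3, include_bias=True)
feats = poly.fit_transform(df[poly_cols])
names = poly.get_feature_names(poly_cols)  # get_feature_names_out in newer sklearn
df_poly_sketch = pd.concat(
    [df, pd.DataFrame(feats, columns=names, index=df.index)], axis=1)

This would at least be consistent with the shape above: 84 polynomial terms of 6 variables up to degree 3 (including the bias column "1"), minus the 6 linear terms that duplicate the original columns, plus the 13 original features gives 13 + 78 = 91 features.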

In [23]:
x_p = df_poly.drop('target', axis=1)
x_p.shape, y.shape
Out[23]:
((303, 91), (303,))
In [35]:
X_train_p, X_test_p, y_train, y_test = train_test_split(x_p, y, test_size=0.2, random_state=4)
X_train_p.shape, X_test_p.shape, y_train.shape, y_test.shape
Out[35]:
((242, 91), (61, 91), (242,), (61,))
In [36]:
sc.fit(X_train_p)
X_train_p = sc.transform(X_train_p)
X_test_p = sc.transform(X_test_p)

With default parameters

In [37]:
svc_p = SVC()
In [38]:
svc_p.fit(X_train_p, y_train)
Out[38]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
In [39]:
y_p_pred = svc_p.predict(X_test_p)
In [40]:
accuracy_score(y_test, y_p_pred)
Out[40]:
0.8524590163934426

With the best parameters

In [43]:
svc_p_ = SVC(C=100, gamma=0.0001, kernel='rbf', probability=True)
svc_p_.fit(X_train_p, y_train)
y_p_pred_ = svc_p_.predict(X_test_p)
accuracy_score(y_test, y_p_pred_)
Out[43]:
0.9016393442622951
In [78]:
print(classification_report(y_test, y_p_pred))  # report for the default-parameter model on the polynomial features
              precision    recall  f1-score   support

           0       0.83      0.80      0.82        25
           1       0.86      0.89      0.88        36

   micro avg       0.85      0.85      0.85        61
   macro avg       0.85      0.84      0.85        61
weighted avg       0.85      0.85      0.85        61
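
The confusion_matrix import from the top has not been used yet; a sketch visualizing it for the tuned model's predictions, using the (also so far unused) seaborn import:

# Confusion matrix of the tuned model as an annotated heatmap
cm = confusion_matrix(y_test, y_p_pred_)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')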

The combination of the best hyperparameters and feature engineering improved the result from 54% accuracy to 90%!

In [44]:
y_p_proba = svc_p_.predict_proba(X_test_p)

ROC curves of the baseline and the tuned model

In [46]:
# Best hyperparameters, polynomial data set
fpr, tpr, thresholds = roc_curve(y_test, y_p_proba[:,1], drop_intermediate=False)
In [49]:
fpr0, tpr0, thresholds0 = roc_curve(y_test, y_pred_proba[:,1], drop_intermediate=False)
In [60]:
figsize(10, 7)
plt.rcParams['font.size'] = 20
plt.plot(fpr, tpr, label='Best Params')
plt.plot(fpr0, tpr0, label='Baseline')
plt.plot(np.linspace(0,1,10), np.linspace(0,1,10), '--g', label="Random")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.title('ROC Curves Heart Disease')
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.grid(True)
plt.legend(loc="lower right")
Out[60]:
<matplotlib.legend.Legend at 0xf625a90>
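
roc_auc_score was imported above but never called; a sketch condensing each curve into a single number (caveat: y_test was reassigned by the second split with random_state=4, while the baseline probabilities were computed on the first split, so the baseline number is only roughly comparable):

# Area under the ROC curve for both models
print('Baseline AUC:   ', roc_auc_score(y_test, y_pred_proba[:, 1]))
print('Best params AUC:', roc_auc_score(y_test, y_p_proba[:, 1]))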

Support Vectors

In [62]:
svc_p_.support_vectors_
Out[62]:
array([[-0.90393923,  0.71589105,  1.06418047, ..., -0.74480627,
        -0.89491602, -0.94642444],
       [-1.23945041,  0.71589105, -0.9168324 , ..., -0.8186411 ,
        -1.07562021, -1.17764332],
       [ 0.21443139,  0.71589105,  0.07367403, ...,  1.25488695,
         0.86877017,  0.06297952],
       ...,
       [ 0.7736167 ,  0.71589105,  1.06418047, ...,  1.53176754,
         1.45953389,  0.72212515],
       [-0.90393923,  0.71589105, -0.9168324 , ..., -0.74480627,
        -0.89491602, -0.94642444],
       [-1.01577629,  0.71589105,  0.07367403, ...,  0.64574963,
        -0.25616309, -1.0268964 ]])

Plotting the separating hyperplane directly only works with kernel='linear'.
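
A minimal sketch of such a plot, assuming (hypothetically) that a linear-kernel SVM is refit on just two raw features, age and thalach, so the separating line is drawable in 2D; it is fit on the full data here purely for illustration:

# Refit a linear-kernel SVM on two standardized features and draw
# its separating line w[0]*x + w[1]*y + b = 0.
X2 = StandardScaler().fit_transform(df[['age', 'thalach']])
svc_lin = SVC(kernel='linear').fit(X2, y)
w, b = svc_lin.coef_[0], svc_lin.intercept_[0]
xx = np.linspace(X2[:, 0].min(), X2[:, 0].max(), 100)
plt.scatter(X2[:, 0], X2[:, 1], c=y, cmap='coolwarm', alpha=0.6)
plt.plot(xx, -(w[0] * xx + b) / w[1], 'k-', label='decision boundary')
plt.xlabel('age (standardized)')
plt.ylabel('thalach (standardized)')
plt.legend()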