In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np

import os
os.chdir(r'D:\Data\Projects\Klassifikation\Heart Disease')  # raw string so the backslashes are not treated as escapes

import plotly_express as px

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
plt.rcParams['font.size'] = 15
from IPython.core.pylabtools import figsize
figsize(10, 10)

import seaborn as sns

from warnings import filterwarnings
filterwarnings('ignore')
In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, classification_report, accuracy_score
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
from sklearn.svm import SVC
In [3]:
df = pd.read_csv('heart.csv')
df.shape
Out[3]:
(303, 14)
In [4]:
df.head(5)
Out[4]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
0 63 1 3 145 233 1 0 150 0 2.3 0 0 1 1
1 37 1 2 130 250 0 1 187 0 3.5 0 0 2 1
2 41 0 1 130 204 0 0 172 0 1.4 2 0 2 1
3 56 1 1 120 236 0 1 178 0 0.8 2 0 2 1
4 57 0 0 120 354 0 1 163 1 0.6 2 0 2 1
In [5]:
x = df.drop('target', axis=1)
y = df.target
x.shape, y.shape
Out[5]:
((303, 13), (303,))
In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[6]:
((242, 13), (61, 13), (242,), (61,))
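
A side note: a plain random split can let the class ratio drift between train and test set. A stratified split (a sketch of the alternative call, same variables as above; the notebook itself uses the plain split) keeps the ratio fixed:

# Alternative: stratified split that preserves the target class ratio
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y)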

Standardizing the data with StandardScaler

Mean of 0, standard deviation of 1

In [11]:
sc = StandardScaler()
sc.fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
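
A quick sanity check (sketch): after the transform, every column of the now-NumPy training matrix should have a mean of roughly 0 and a standard deviation of roughly 1.

# Verify the standardization column-wise
print(np.round(X_train.mean(axis=0), 6))   # ~0 everywhere
print(np.round(X_train.std(axis=0), 6))    # ~1 everywhere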

Baseline Support Vector Machine classifier

In [74]:
svc = SVC(probability=True)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
y_pred_proba = svc.predict_proba(X_test)
In [75]:
accuracy_score(y_test, y_pred)  # sklearn convention: y_true first (accuracy is symmetric, so the value is unchanged)
Out[75]:
0.5409836065573771

GridSearch for the best hyperparameters

  • C: penalty parameter of the error term. A small value produces a hyperplane with a wide, soft margin; a large value enforces a narrow margin that tolerates fewer misclassifications.
  • kernel:
    linear
    polynomial
    radial basis function: can handle data that is not linearly separable
    sigmoid
  • gamma: a smaller value fits the data loosely, a higher value of gamma fits it tightly and risks overfitting (see the sketch after this list)
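
To make the effect of C and gamma concrete, a small illustrative sketch (assuming the standardized X_train/y_train from above) that cross-validates a few combinations individually:

from sklearn.model_selection import cross_val_score

# A very large gamma memorizes the training folds, a very small one
# underfits; C trades margin width against training errors.
for C, gamma in [(1, 10), (1, 0.001), (100, 0.0001)]:
    scores = cross_val_score(SVC(C=C, gamma=gamma, kernel='rbf'),
                             X_train, y_train, cv=5)
    print(C, gamma, round(scores.mean(), 3))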
In [12]:
svc.get_params()
Out[12]:
{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto_deprecated',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}
In [18]:
parameters = {'C': [1, 10, 100, 1000],
              'gamma': [0.1, 0.001, 0.0001, 0.00001],
              'kernel': ['rbf', 'poly']}
In [19]:
grid_search = GridSearchCV(svc, parameters, cv=10, n_jobs=-1, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

print(grid_search.best_score_)
print(grid_search.best_params_)
Fitting 10 folds for each of 32 candidates, totalling 320 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
0.8140495867768595
{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    2.5s finished

Testing the model with the best parameters

In [20]:
y_pred_grid_search = grid_search.predict(X_test)
In [21]:
accuracy_score(y_test, y_pred_grid_search)
Out[21]:
0.8852459016393442
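
Beyond best_score_ and best_params_, the fitted GridSearchCV object exposes all candidate results via its cv_results_ attribute; a sketch for inspecting the top combinations:

# All 32 candidates, sorted by mean cross-validation accuracy
results = pd.DataFrame(grid_search.cv_results_)
cols = ['param_C', 'param_gamma', 'param_kernel', 'mean_test_score']
print(results[cols].sort_values('mean_test_score', ascending=False).head())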

Polynomial Features und Interaction Features

In [22]:
df_poly = pd.read_csv('df_poly.csv')
df_poly.head()
Out[22]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target 1 cp^2 cp thalach cp ca cp oldpeak cp thal cp age thalach^2 thalach ca thalach oldpeak thalach thal ... thalach oldpeak thal thalach oldpeak age thalach thal^2 thalach thal age thalach age^2 ca^3 ca^2 oldpeak ca^2 thal ca^2 age ca oldpeak^2 ca oldpeak thal ca oldpeak age ca thal^2 ca thal age ca age^2 oldpeak^3 oldpeak^2 thal oldpeak^2 age oldpeak thal^2 oldpeak thal age oldpeak age^2 thal^3 thal^2 age thal age^2 age^3
0 63 1 3 145 233 1 0 150 0 2.3 0 0 1 1 1.0 9.0 450.0 0.0 6.9 3.0 189.0 22500.0 0.0 345.0 150.0 ... 345.0 21735.0 150.0 9450.0 595350.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12.167 5.29 333.27 2.3 144.9 9128.7 1.0 63.0 3969.0 250047.0
1 37 1 2 130 250 0 1 187 0 3.5 0 0 2 1 1.0 4.0 374.0 0.0 7.0 4.0 74.0 34969.0 0.0 654.5 374.0 ... 1309.0 24216.5 748.0 13838.0 256003.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 42.875 24.50 453.25 14.0 259.0 4791.5 8.0 148.0 2738.0 50653.0
2 41 0 1 130 204 0 0 172 0 1.4 2 0 2 1 1.0 1.0 172.0 0.0 1.4 2.0 41.0 29584.0 0.0 240.8 344.0 ... 481.6 9872.8 688.0 14104.0 289132.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.744 3.92 80.36 5.6 114.8 2353.4 8.0 164.0 3362.0 68921.0
3 56 1 1 120 236 0 1 178 0 0.8 2 0 2 1 1.0 1.0 178.0 0.0 0.8 2.0 56.0 31684.0 0.0 142.4 356.0 ... 284.8 7974.4 712.0 19936.0 558208.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.512 1.28 35.84 3.2 89.6 2508.8 8.0 224.0 6272.0 175616.0
4 57 0 0 120 354 0 1 163 1 0.6 2 0 2 1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 26569.0 0.0 97.8 326.0 ... 195.6 5574.6 652.0 18582.0 529587.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.216 0.72 20.52 2.4 68.4 1949.4 8.0 228.0 6498.0 185193.0

5 rows × 92 columns
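
The engineered features are loaded from a precomputed CSV. Judging by the visible column names (1, cp^2, cp thalach, ..., age^3), they look like degree-3 polynomial and interaction terms of six of the original columns; a hedged sketch of how such a frame could be built (the exact column subset is an assumption inferred from the names):

from sklearn.preprocessing import PolynomialFeatures

# Assumption: degree-3 terms of these six columns, matching df_poly.head()
poly_cols = ['cp', 'thalach', 'ca', 'oldpeak', 'thal', 'age']
poly = PolynomialFeatures(degree=3, include_bias=True)
feats = poly.fit_transform(df[poly_cols])
names = poly.get_feature_names(poly_cols)  # get_feature_names_out in newer sklearn
df_poly_sketch = pd.concat(
    [df, pd.DataFrame(feats, columns=names, index=df.index)], axis=1)

This would at least be consistent with the shape above: 84 polynomial terms of 6 variables up to degree 3 (including the bias column "1"), minus the 6 linear terms that duplicate the original columns, plus the 13 original features gives 13 + 78 = 91 features.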

In [23]:
x_p = df_poly.drop('target', axis=1)
x_p.shape, y.shape
Out[23]:
((303, 91), (303,))
In [35]:
X_train_p, X_test_p, y_train, y_test = train_test_split(x_p, y, test_size=0.2, random_state=4)
X_train_p.shape, X_test_p.shape, y_train.shape, y_test.shape
Out[35]:
((242, 91), (61, 91), (242,), (61,))
In [36]:
sc.fit(X_train_p)
X_train_p = sc.transform(X_train_p)
X_test_p = sc.transform(X_test_p)

With default parameters

In [37]:
svc_p = SVC()
In [38]:
svc_p.fit(X_train_p, y_train)
Out[38]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
In [39]:
y_p_pred = svc_p.predict(X_test_p)
In [40]:
accuracy_score(y_test, y_p_pred)
Out[40]:
0.8524590163934426

With the best parameters

In [43]:
svc_p_ = SVC(C=100, gamma=0.0001, kernel='rbf', probability=True)
svc_p_.fit(X_train_p, y_train)
y_p_pred_ = svc_p_.predict(X_test_p)
accuracy_score(y_test, y_p_pred_)
Out[43]:
0.9016393442622951
In [78]:
print(classification_report(y_test, y_p_pred))  # report for the default-parameter model on the polynomial features
              precision    recall  f1-score   support

           0       0.83      0.80      0.82        25
           1       0.86      0.89      0.88        36

   micro avg       0.85      0.85      0.85        61
   macro avg       0.85      0.84      0.85        61
weighted avg       0.85      0.85      0.85        61
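
The confusion_matrix import from the top has not been used yet; a sketch visualizing it for the tuned model's predictions, using the (also so far unused) seaborn import:

# Confusion matrix of the tuned model as an annotated heatmap
cm = confusion_matrix(y_test, y_p_pred_)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')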

The combination of the best hyperparameters and feature engineering improved the result from 54% accuracy to 90%!

In [44]:
y_p_proba = svc_p_.predict_proba(X_test_p)

ROC curves of the baseline and the tuned model

In [46]:
# Best hyperparameters, polynomial data set
fpr, tpr, thresholds = roc_curve(y_test, y_p_proba[:,1], drop_intermediate=False)
In [49]:
fpr0, tpr0, thresholds0 = roc_curve(y_test, y_pred_proba[:,1], drop_intermediate=False)
In [60]:
figsize(10, 7)
plt.rcParams['font.size'] = 20
plt.plot(fpr, tpr, label='Best Params')
plt.plot(fpr0, tpr0, label='Baseline')
plt.plot(np.linspace(0,1,10), np.linspace(0,1,10), '--g', label="Random")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.title('ROC Curves Heart Disease')
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.grid(True)
plt.legend(loc="lower right")
Out[60]:
<matplotlib.legend.Legend at 0xf625a90>
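
roc_auc_score was imported above but never called; a sketch condensing each curve into a single number (caveat: y_test was reassigned by the second split with random_state=4, while the baseline probabilities were computed on the first split, so the baseline number is only roughly comparable):

# Area under the ROC curve for both models
print('Baseline AUC:   ', roc_auc_score(y_test, y_pred_proba[:, 1]))
print('Best params AUC:', roc_auc_score(y_test, y_p_proba[:, 1]))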

Support Vectors

In [62]:
svc_p_.support_vectors_
Out[62]:
array([[-0.90393923,  0.71589105,  1.06418047, ..., -0.74480627,
        -0.89491602, -0.94642444],
       [-1.23945041,  0.71589105, -0.9168324 , ..., -0.8186411 ,
        -1.07562021, -1.17764332],
       [ 0.21443139,  0.71589105,  0.07367403, ...,  1.25488695,
         0.86877017,  0.06297952],
       ...,
       [ 0.7736167 ,  0.71589105,  1.06418047, ...,  1.53176754,
         1.45953389,  0.72212515],
       [-0.90393923,  0.71589105, -0.9168324 , ..., -0.74480627,
        -0.89491602, -0.94642444],
       [-1.01577629,  0.71589105,  0.07367403, ...,  0.64574963,
        -0.25616309, -1.0268964 ]])

Plotting the separating hyperplane directly only works with kernel='linear'.
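
A minimal sketch of such a plot, assuming (hypothetically) that a linear-kernel SVM is refit on just two raw features, age and thalach, so the separating line is drawable in 2D; it is fit on the full data here purely for illustration:

# Refit a linear-kernel SVM on two standardized features and draw
# its separating line w[0]*x + w[1]*y + b = 0.
X2 = StandardScaler().fit_transform(df[['age', 'thalach']])
svc_lin = SVC(kernel='linear').fit(X2, y)
w, b = svc_lin.coef_[0], svc_lin.intercept_[0]
xx = np.linspace(X2[:, 0].min(), X2[:, 0].max(), 100)
plt.scatter(X2[:, 0], X2[:, 1], c=y, cmap='coolwarm', alpha=0.6)
plt.plot(xx, -(w[0] * xx + b) / w[1], 'k-', label='decision boundary')
plt.xlabel('age (standardized)')
plt.ylabel('thalach (standardized)')
plt.legend()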