import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import os
# Raw string: the Windows backslashes must not be parsed as escape sequences
# ("\D", "\P", "\C", "\H" are invalid escapes and raise a warning on newer Pythons).
os.chdir(r'D:\Data\Projects\Classification\Heart Disease')

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
plt.rcParams['font.size'] = 15
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
from IPython.core.pylabtools import figsize
figsize(10, 10)

import plotly.offline as py
import plotly.graph_objs as go

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, roc_curve, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

# Load the heart-disease data set from the current working directory.
df = pd.read_csv('heart.csv')
# Show the (rows, columns) shape of the loaded frame.
df.shape

(303, 14)

# Preview the first five rows.
df.head(5)

# Separate the feature matrix from the label column 'target'.
x = df.drop(columns='target')
y = df['target']
x.shape, y.shape

((303, 13), (303,))

# Hold out 20 % of the rows as test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

Standardisieren der Daten mit dem StandardScaler

Nach dem Standardisieren hat jedes Feature den Mittelwert 0 und die Standardabweichung 1. Für einige Algorithmen, wie Logistic Regression und Support Vector Machines, ist das Standardisieren wichtig, um optimale Ergebnisse zu erzielen.
Gefittet wird ausschließlich auf X_train, damit keine Information aus dem Test-Set in das Training einfließt; transformiert werden anschließend sowohl X_train als auch X_test.

# Learn the scaling statistics from the training features only.
sc = StandardScaler()
sc.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

# Apply the scaling learned on the training data to both partitions.
X_train, X_test = (sc.transform(part) for part in (X_train, X_test))

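Baseline Model

Das Dummy Model (strategy='uniform') sagt die Klassen rein zufällig und gleichverteilt voraus – unabhängig von den Features. Es dient als Untergrenze, die jedes echte Modell übertreffen muss.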

# Create the baseline classifier (strategy='uniform', fixed seed for reproducibility)
dummy = DummyClassifier(strategy='uniform', random_state=1)

# "Train" the baseline on the scaled training data
dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=1, strategy='uniform')

# Accuracy of the baseline on the held-out test set
dummy.score(X_test, y_test)

0.4426229508196721

Baseline Logistic Regression

# Logistic regression with default hyperparameters as a first real model.
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# Predict the test labels with the default logistic regression.
y_pred = lr.predict(X_test)

# Ground truth goes first (y_true, y_pred), matching the other accuracy_score
# calls in this notebook; accuracy itself is symmetric, so the value is unchanged.
accuracy_score(y_test, y_pred)

0.8524590163934426

Grid Search für beste Hyperparameter

C: Je höher der Wert, desto schwächer wird das Modell regularisiert (Default: 1.0).

# Candidate values for C, spanning seven orders of magnitude.
param_grid = dict(C=[0.001, 0.01, 0.1, 1, 10, 100, 1000])

# Exhaustive search over the grid with an L2-penalised logistic regression.
clf = GridSearchCV(
    LogisticRegression(penalty='l2'),
    param_grid,
)

clf.fit(X_train, y_train)

C:\Users\Leo\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:813: DeprecationWarning:

The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

# Parameter combination that achieved the best cross-validated score.
clf.best_params_

{'C': 0.01}

# Logistic regression with C=0.01, the value reported by clf.best_params_
lr_reg = LogisticRegression(C=0.01)
lr_reg.fit(X_train, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# Hard class predictions of the regularised model on the test set.
y_pred_reg = lr_reg.predict(X_test)

# Per-class probabilities; column 1 is used later as the probability of class 1.
y_proba = lr_reg.predict_proba(X_test)

# Test accuracy of the regularised model.
accuracy_score(y_test, y_pred_reg)

0.8852459016393442

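Für die ROC-Kurve werden vorhergesagte Wahrscheinlichkeiten (und damit Thresholds) benötigt. Mit harten Klassenvorhersagen besteht die Kurve nur aus einem einzigen Punkt, und roc_auc_score liefert dann lediglich den Mittelwert aus Recall und Spezifität, also einen Wert nahe der Accuracy.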

# NOTE: this scores the hard 0/1 predictions, not the predicted probabilities,
# so the ROC consists of a single operating point and the result equals
# (TPR + (1 - FPR)) / 2. For a threshold-based AUC pass y_proba[:, 1] instead.
roc_auc_score(y_test, y_pred_reg)

0.8841594827586207

# Precision, recall and F1 per class for the regularised model.
print(classification_report(y_test, y_pred_reg))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88        29
           1       0.88      0.91      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.88        61
weighted avg       0.89      0.89      0.89        61

# Confusion matrix of the regularised model (true labels first, predictions second).
cm = confusion_matrix(y_test, y_pred_reg)

cm

array([[25,  4],
       [ 3, 29]], dtype=int64)

# Draw the confusion matrix as an annotated heatmap.
plt.rcParams['font.size'] = 20
ax = plt.subplot()
# The cells hold integer counts, so format them as integers ("25", not "25.0").
sns.heatmap(cm, annot=True, ax=ax, cmap='Set2', fmt="d")

# Labels, title and ticks
ax.set_xlabel('Predicted labels', fontsize=18)
ax.set_ylabel('True labels', fontsize=18)
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['0', '1'])
ax.yaxis.set_ticklabels(['0', '1'])
plt.tight_layout()

Welche Patienten wurden falsch zugeordnet?

# Collect true label, predicted label and class-1 probability per test patient.
# The frame inherits the original row index of y_test.
results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred_reg, 'y_proba': y_proba[:, 1]})

results = results.sort_index()

# All misclassified patients. The mask is built from the columns of `results`
# itself, so it shares the frame's (sorted) index and needs no realignment
# against the unsorted y_test / the positional prediction array.
results[results.y_test != results.y_pred]

# False negatives only: patients with target 1 that were predicted as 0.
results[(results.y_test == 1) & (results.y_pred == 0)]

# Full feature rows of the false negatives, looked up by their row labels
# instead of hard-coded positions.
df.loc[results.index[(results.y_test == 1) & (results.y_pred == 0)]]

# Column-wise median over the patients whose class-1 probability exceeds 0.5
results[results['y_proba'] > 0.5].median()

y_test     1.000000
y_pred     1.000000
y_proba    0.634712
dtype: float64

# Column-wise minimum over the same subset (class-1 probability above 0.5)
results[results['y_proba'] > 0.5].min()

y_test     0.000000
y_pred     1.000000
y_proba    0.506458
dtype: float64

# There are no predicted probabilities at exactly 0.5, so the smoothed KDE plot
# below is somewhat misleading around the decision boundary.

figsize(10, 8)
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill` —
# confirm against the installed version before changing.
sns.kdeplot(results.y_proba,  shade = True)
# Red vertical line marks the default decision threshold of 0.5.
plt.axvline(0.5, 0, 1, color='red')

<matplotlib.lines.Line2D at 0xdbb44a8>

# Histogram of the class-1 probabilities, unsmoothed counterpart of the KDE plot.
figsize(10, 8)
plt.hist(results.y_proba, bins = 35)
# Red vertical line marks the default decision threshold of 0.5.
plt.axvline(0.5, 0, 1, color='red')

<matplotlib.lines.Line2D at 0xdecbe48>

# Recall (true positive rate): correctly predicted positives over the whole
# second row of the confusion matrix (all patients with true label 1).
tpr = cm[1, 1] / cm[1].sum()
tpr

0.90625

# False positive rate: wrongly predicted positives over the whole first row
# of the confusion matrix (all patients with true label 0).
fpr = cm[0, 1] / cm[0].sum()
fpr

0.13793103448275862

# Probability of class 1 for every test patient.
y_pred_prob = lr_reg.predict_proba(X_test)[:,1]

# ROC points for every distinct threshold (no intermediate points dropped).
# NOTE: this rebinds `fpr` and `tpr`, which held scalar rates until here.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, drop_intermediate=False)

thresholds

array([1.82761476, 0.82761476, 0.81561405, 0.77176793, 0.76012406,
       0.75441072, 0.74483677, 0.72700317, 0.71953579, 0.70609397,
       0.6924657 , 0.69191742, 0.69102066, 0.68246915, 0.6821203 ,
       0.67215224, 0.67173783, 0.63471191, 0.62922645, 0.62329438,
       0.61512563, 0.61164029, 0.60441336, 0.60192246, 0.58041964,
       0.57908619, 0.57658788, 0.57426729, 0.5564351 , 0.55032692,
       0.54895029, 0.53400122, 0.51931432, 0.50645842, 0.4426297 ,
       0.34051001, 0.3231265 , 0.32030757, 0.31697003, 0.31344301,
       0.31325635, 0.31064888, 0.29562134, 0.29010399, 0.26999105,
       0.25256104, 0.23956276, 0.23577731, 0.23201928, 0.22262163,
       0.2118171 , 0.20131673, 0.19987921, 0.18656378, 0.17940912,
       0.17239143, 0.16734111, 0.15850851, 0.1426913 , 0.13795365,
       0.10795221, 0.10593281])

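Es scheint verwunderlich, dass der erste Wert von thresholds über 1 liegt. Dies ist kein Bug, sondern Absicht: roc_curve stellt einen künstlichen Schwellenwert voran (hier die höchste Wahrscheinlichkeit + 1, also 0.8276 + 1 = 1.8276), bei dem kein Patient als positiv eingestuft wird, sodass die Kurve im Punkt TPR = 0, FPR = 0 beginnt.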

# Plain ROC curve with a small margin around the unit square.
figsize(10, 7)
plt.plot(fpr, tpr)
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Heart Disease')
plt.grid(True)

fpr

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.03448276, 0.03448276, 0.03448276, 0.06896552,
       0.06896552, 0.06896552, 0.06896552, 0.06896552, 0.10344828,
       0.13793103, 0.13793103, 0.13793103, 0.13793103, 0.13793103,
       0.13793103, 0.13793103, 0.13793103, 0.13793103, 0.17241379,
       0.20689655, 0.24137931, 0.24137931, 0.27586207, 0.31034483,
       0.34482759, 0.37931034, 0.37931034, 0.4137931 , 0.44827586,
       0.48275862, 0.51724138, 0.55172414, 0.5862069 , 0.5862069 ,
       0.62068966, 0.65517241, 0.68965517, 0.72413793, 0.75862069,
       0.79310345, 0.82758621, 0.86206897, 0.89655172, 0.93103448,
       0.96551724, 1.        ])

# ROC curve with threshold annotations and the chance diagonal for reference.
plt.subplots(figsize=(15, 8))
plt.plot(fpr, tpr, 'o-', label="ROC curve")
plt.plot(np.linspace(0, 1, 10), np.linspace(0, 1, 10), label="diagonal")

# Label every fifth ROC point with its rounded threshold. The loop variables
# are deliberately NOT called x / y, which would overwrite the module-level
# feature frame `x` and label series `y`.
for fpr_pt, tpr_pt, thr in zip(fpr[::5], tpr[::5], thresholds[::5]):
    plt.annotate(np.round(thr, 2), (fpr_pt, tpr_pt - 0.04))

# Highlight one example point with an explanatory arrow.
rnd_idx = 27
plt.annotate('this point refers to the tpr and the fpr\n at a probability threshold of {}'.format(np.round(thresholds[rnd_idx], 2)),
             xy=(fpr[rnd_idx], tpr[rnd_idx]), xytext=(fpr[rnd_idx]+0.2, tpr[rnd_idx]-0.25),
             arrowprops=dict(facecolor='black', lw=2, arrowstyle='->'))

plt.legend(loc="upper left")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")

Text(0, 0.5, 'True Positive Rate')

# Every fifth (fpr, tpr, threshold) triple, i.e. the points annotated in the plot.
print(tuple(zip(fpr[::5], tpr[::5], thresholds[::5])))

((0.0, 0.0, 1.8276147559473768), (0.0, 0.15625, 0.7544107230701607), (0.0, 0.3125, 0.6924657001385295), (0.0, 0.46875, 0.6721522398983056), (0.06896551724137931, 0.5625, 0.615125634698659), (0.13793103448275862, 0.65625, 0.5790861931748899), (0.13793103448275862, 0.8125, 0.5489502854374858), (0.20689655172413793, 0.90625, 0.34051000985373364), (0.3448275862068966, 0.9375, 0.3132563526280901), (0.4827586206896552, 0.96875, 0.25256103676597336), (0.6206896551724138, 1.0, 0.21181709901921705), (0.7931034482758621, 1.0, 0.1723914336750182), (0.9655172413793104, 1.0, 0.10795221374734966))

ROC Curve mit Plotly

# ROC curve of the model as an orange line.
trace1 = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    line={'color': 'darkorange'},
    name='ROC curve',
)
# Dashed chance diagonal, kept out of the legend.
trace2 = go.Scatter(
    x=[-0.05, 1.05],
    y=[-0.05, 1.05],
    mode='lines',
    line={'color': 'navy', 'dash': 'dash'},
    showlegend=False,
)

layout = go.Layout(
    title='Receiver operating characteristic Heart Disease',
    xaxis={'title': 'False Positive Rate'},
    yaxis={'title': 'True Positive Rate'},
)

fig = go.Figure(data=[trace1, trace2], layout=layout)

py.iplot(fig)

Test mit class_weight = 'balanced'

# Same regularisation strength as lr_reg, but with class_weight='balanced'.
lr_bal = LogisticRegression(C=0.01, class_weight='balanced')
lr_bal.fit(X_train, y_train)

LogisticRegression(C=0.01, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# Predict the test labels with the class-weighted model.
y_pred_bal = lr_bal.predict(X_test)

# Ground truth goes first (y_true, y_pred), matching the other accuracy_score
# calls in this notebook; accuracy itself is symmetric, so the value is unchanged.
accuracy_score(y_test, y_pred_bal)

0.8688524590163934

Logistic Regression mit C = 0.01 hat die beste Accuracy von 0.885

Model aus Datensatz mit Polynomial und Interaction Features

# Load the data set that additionally contains polynomial and interaction features.
df_poly = pd.read_csv('df_poly.csv')
df_poly.head()

# Separate the polynomial feature matrix from the label column 'target'.
# NOTE: this rebinds the module-level name `y`.
x_p = df_poly.drop(columns='target')
y = df_poly['target']
x_p.shape, y.shape

((303, 91), (303,))

# Split the polynomial data set with the same test size and seed as the base split.
# NOTE(review): this rebinds y_train / y_test from the first split; with the same
# random_state and row count the partition is presumably identical — confirm that
# df_poly has the same row order as df.
X_train_p, X_test_p, y_train, y_test = train_test_split(x_p, y, test_size=0.2, random_state=42)
X_train_p.shape, X_test_p.shape, y_train.shape, y_test.shape

((242, 91), (61, 91), (242,), (61,))

Standardisieren des Polynomial Datensatzes

# Use a dedicated scaler for the polynomial feature set instead of re-fitting
# `sc`: re-fitting would silently discard the statistics learned for the
# original 13-feature data, making `sc` unusable for that model afterwards.
sc_p = StandardScaler()
sc_p.fit(X_train_p)
X_train_p = sc_p.transform(X_train_p)
X_test_p = sc_p.transform(X_test_p)

# Same regularised model (C=0.01), trained on the polynomial feature set.
lr_reg_p = LogisticRegression(C=0.01)
lr_reg_p.fit(X_train_p, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# Predict the test labels with the model trained on polynomial features.
y_p_pred = lr_reg_p.predict(X_test_p)

# Ground truth goes first (y_true, y_pred), matching the other accuracy_score
# calls in this notebook; accuracy itself is symmetric, so the value is unchanged.
accuracy_score(y_test, y_p_pred)

0.8852459016393442

Mit Polynomial Features erreicht das Model auf dem Test-Set die gleiche Accuracy (0.885) wie ohne.

Verschiedene Thresholds testen, um optimalen Recall zu finden

Bei der Auswahl einer Metrik für die Güte eines Models ist die Problemstellung von großer Bedeutung. Soll vorhergesagt werden, ob ein Patient krank oder gesund ist, ist die Metrik Recall wichtiger als der Accuracy-Score. Der Grund hierfür ist, dass lieber einige gesunde Patienten weiter getestet werden, als dass ein kranker Patient als gesund eingestuft wird.

Zudem sollen nun nicht die Klassen direkt vorhergesagt werden, sondern die Wahrscheinlichkeit für die Zugehörigkeit zu einer Klasse. So können Risikogruppen identifiziert werden.

# DataFrame of the predicted class probabilities (one column per class)
pred_proba_df = pd.DataFrame(lr_reg.predict_proba(X_test))

# Decision thresholds to evaluate
threshold_list = [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,.7,.75,.8,.85,.9,.95,.99]
tpr_list = []
fpr_list = []

# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0; the
# 1-D arrays can be passed to the metric functions directly, no reshape needed.
# NOTE(review): y_test was rebound by the split of the polynomial data set; it is
# presumably identical to the labels belonging to X_test — confirm.
y_true = y_test.values

for i in threshold_list:
    print ('\n****** For threshold = {} ******'.format(i))
    # 1 where the probability exceeds the threshold, else 0 (vectorised
    # equivalent of the former element-wise applymap).
    y_test_pred = (pred_proba_df > i).astype(int)
    # Column 1 holds the decisions for class 1.
    y_hat = y_test_pred.iloc[:, 1].values

    # Accuracy at this threshold
    test_accuracy = accuracy_score(y_true, y_hat)
    print('Testing Accuracy: {}'.format(test_accuracy))

    # Confusion matrix at this threshold
    con = confusion_matrix(y_true, y_hat)

    # NOTE: `tpr` / `fpr` are rebound here; they no longer hold the roc_curve arrays.
    tpr = con[1,1]/(con[1,1]+con[1,0])
    fpr = con[0,1]/(con[0,1]+con[0,0])
    tpr_list.append(tpr)
    fpr_list.append(fpr)

    print('Confusion Matrix: ')
    print(con)

    print('True Positve Rate: ', tpr)
    print('False Positive Rate: ', fpr)

****** For threshold = 0.05 ******
Testing Accuracy: 0.5245901639344263
Confusion Matrix: 
[[ 0 29]
 [ 0 32]]
True Positve Rate:  1.0
False Positive Rate:  1.0

****** For threshold = 0.1 ******
Testing Accuracy: 0.5245901639344263
Confusion Matrix: 
[[ 0 29]
 [ 0 32]]
True Positve Rate:  1.0
False Positive Rate:  1.0

****** For threshold = 0.15 ******
Testing Accuracy: 0.5901639344262295
Confusion Matrix: 
[[ 4 25]
 [ 0 32]]
True Positve Rate:  1.0
False Positive Rate:  0.8620689655172413

****** For threshold = 0.2 ******
Testing Accuracy: 0.6885245901639344
Confusion Matrix: 
[[10 19]
 [ 0 32]]
True Positve Rate:  1.0
False Positive Rate:  0.6551724137931034

****** For threshold = 0.25 ******
Testing Accuracy: 0.7540983606557377
Confusion Matrix: 
[[15 14]
 [ 1 31]]
True Positve Rate:  0.96875
False Positive Rate:  0.4827586206896552

****** For threshold = 0.3 ******
Testing Accuracy: 0.7868852459016393
Confusion Matrix: 
[[18 11]
 [ 2 30]]
True Positve Rate:  0.9375
False Positive Rate:  0.3793103448275862

****** For threshold = 0.35 ******
Testing Accuracy: 0.8688524590163934
Confusion Matrix: 
[[24  5]
 [ 3 29]]
True Positve Rate:  0.90625
False Positive Rate:  0.1724137931034483

****** For threshold = 0.4 ******
Testing Accuracy: 0.8688524590163934
Confusion Matrix: 
[[24  5]
 [ 3 29]]
True Positve Rate:  0.90625
False Positive Rate:  0.1724137931034483

****** For threshold = 0.45 ******
Testing Accuracy: 0.8852459016393442
Confusion Matrix: 
[[25  4]
 [ 3 29]]
True Positve Rate:  0.90625
False Positive Rate:  0.13793103448275862

****** For threshold = 0.5 ******
Testing Accuracy: 0.8852459016393442
Confusion Matrix: 
[[25  4]
 [ 3 29]]
True Positve Rate:  0.90625
False Positive Rate:  0.13793103448275862

****** For threshold = 0.55 ******
Testing Accuracy: 0.819672131147541
Confusion Matrix: 
[[25  4]
 [ 7 25]]
True Positve Rate:  0.78125
False Positive Rate:  0.13793103448275862

****** For threshold = 0.6 ******
Testing Accuracy: 0.7868852459016393
Confusion Matrix: 
[[27  2]
 [11 21]]
True Positve Rate:  0.65625
False Positive Rate:  0.06896551724137931

****** For threshold = 0.65 ******
Testing Accuracy: 0.7049180327868853
Confusion Matrix: 
[[28  1]
 [17 15]]
True Positve Rate:  0.46875
False Positive Rate:  0.034482758620689655

****** For threshold = 0.7 ******
Testing Accuracy: 0.6229508196721312
Confusion Matrix: 
[[29  0]
 [23  9]]
True Positve Rate:  0.28125
False Positive Rate:  0.0

****** For threshold = 0.75 ******
Testing Accuracy: 0.5573770491803278
Confusion Matrix: 
[[29  0]
 [27  5]]
True Positve Rate:  0.15625
False Positive Rate:  0.0

****** For threshold = 0.8 ******
Testing Accuracy: 0.5081967213114754
Confusion Matrix: 
[[29  0]
 [30  2]]
True Positve Rate:  0.0625
False Positive Rate:  0.0

****** For threshold = 0.85 ******
Testing Accuracy: 0.47540983606557374
Confusion Matrix: 
[[29  0]
 [32  0]]
True Positve Rate:  0.0
False Positive Rate:  0.0

****** For threshold = 0.9 ******
Testing Accuracy: 0.47540983606557374
Confusion Matrix: 
[[29  0]
 [32  0]]
True Positve Rate:  0.0
False Positive Rate:  0.0

****** For threshold = 0.95 ******
Testing Accuracy: 0.47540983606557374
Confusion Matrix: 
[[29  0]
 [32  0]]
True Positve Rate:  0.0
False Positive Rate:  0.0

****** For threshold = 0.99 ******
Testing Accuracy: 0.47540983606557374
Confusion Matrix: 
[[29  0]
 [32  0]]
True Positve Rate:  0.0
False Positive Rate:  0.0

# Thresholds in descending order (new list; threshold_list itself is untouched).
rev = list(reversed(threshold_list))
rev

[0.99,
 0.95,
 0.9,
 0.85,
 0.8,
 0.75,
 0.7,
 0.65,
 0.6,
 0.55,
 0.5,
 0.45,
 0.4,
 0.35,
 0.3,
 0.25,
 0.2,
 0.15,
 0.1,
 0.05]

# Same descending list, produced by slicing instead of reversed().
threshold_list[::-1]

[0.99,
 0.95,
 0.9,
 0.85,
 0.8,
 0.75,
 0.7,
 0.65,
 0.6,
 0.55,
 0.5,
 0.45,
 0.4,
 0.35,
 0.3,
 0.25,
 0.2,
 0.15,
 0.1,
 0.05]

# One row per evaluated threshold with the rates collected in the loop.
rates = pd.DataFrame(
    {
        'Threshold': threshold_list,
        'True Positive Rate': tpr_list,
        'False Positive Rate': fpr_list,
    }
)

rates

# Remove a hand-picked set of rows (presumably to declutter the plot below).
rate = rates.drop(index=[0, 1, 6, 7, 17, 18, 19])

# Renumber the remaining rows 0..n-1 without keeping the old index.
rate = rate.reset_index(drop=True)

rate

# Scatter plot of the manually computed ROC points, each labelled with its threshold.
th = rate['Threshold']
tpr = rate['True Positive Rate']
fpr = rate['False Positive Rate']
figsize(15, 15)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.scatter(fpr, tpr)

# Put the label slightly to the right of its point.
for i in range(len(th)):
    txt = th[i]
    plt.annotate(txt, (fpr[i] + 0.01, tpr[i]))

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	thal	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1	1
1	37	1	2	130	250	0	1	187	0	3.5	0	2	1
2	41	0	1	130	204	0	0	172	0	1.4	2	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	2	1

	age	sex	cp	trestbps	chol	restecg	thalach	exang	oldpeak	slope	ca	thal	target
42	45	1	0	104	208	0	148	1	3.0	1	0	2	1
101	59	1	3	178	270	0	145	0	4.2	0	0	3	1
139	64	1	0	128	263	1	105	1	0.2	1	1	3	1

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	thal	target	1	cp^2	cp thalach	cp oldpeak	cp thal	cp age	thalach^2	thalach oldpeak	thalach thal	...	thalach oldpeak thal	thalach oldpeak age	thalach thal^2	thalach thal age	thalach age^2	oldpeak^3	oldpeak^2 thal	oldpeak^2 age	oldpeak thal^2	oldpeak thal age	oldpeak age^2	thal^3	thal^2 age	thal age^2	age^3
0	63	1	3	145	233	1	0	150	0	2.3	0	1	1	1.0	9.0	450.0	6.9	3.0	189.0	22500.0	345.0	150.0	...	345.0	21735.0	150.0	9450.0	595350.0	12.167	5.29	333.27	2.3	144.9	9128.7	1.0	63.0	3969.0	250047.0
1	37	1	2	130	250	0	1	187	0	3.5	0	2	1	1.0	4.0	374.0	7.0	4.0	74.0	34969.0	654.5	374.0	...	1309.0	24216.5	748.0	13838.0	256003.0	42.875	24.50	453.25	14.0	259.0	4791.5	8.0	148.0	2738.0	50653.0
2	41	0	1	130	204	0	0	172	0	1.4	2	2	1	1.0	1.0	172.0	1.4	2.0	41.0	29584.0	240.8	344.0	...	481.6	9872.8	688.0	14104.0	289132.0	2.744	3.92	80.36	5.6	114.8	2353.4	8.0	164.0	3362.0	68921.0
3	56	1	1	120	236	0	1	178	0	0.8	2	2	1	1.0	1.0	178.0	0.8	2.0	56.0	31684.0	142.4	356.0	...	284.8	7974.4	712.0	19936.0	558208.0	0.512	1.28	35.84	3.2	89.6	2508.8	8.0	224.0	6272.0	175616.0
4	57	0	0	120	354	0	1	163	1	0.6	2	2	1	1.0	0.0	0.0	0.0	0.0	0.0	26569.0	97.8	326.0	...	195.6	5574.6	652.0	18582.0	529587.0	0.216	0.72	20.52	2.4	68.4	1949.4	8.0	228.0	6498.0	185193.0

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	thal	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1	1
1	37	1	2	130	250	0	1	187	0	3.5	0	2	1
2	41	0	1	130	204	0	0	172	0	1.4	2	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	2	1

	y_test	y_pred	y_proba
42	1	0	0.320308
101	1	0	0.295621
139	1	0	0.222622
228	0	1	0.580420
254	0	1	0.671738
281	0	1	0.579086
283	0	1	0.623294

	age	sex	cp	trestbps	chol	restecg	thalach	exang	oldpeak	slope	ca	thal	target
42	45	1	0	104	208	0	148	1	3.0	1	0	2	1
101	59	1	3	178	270	0	145	0	4.2	0	0	3	1
139	64	1	0	128	263	1	105	1	0.2	1	1	3	1

	Threshold	True Positive Rate	False Positive Rate
0	0.05	1.00000	1.000000
1	0.10	1.00000	1.000000
2	0.15	1.00000	0.862069
3	0.20	1.00000	0.655172
4	0.25	0.96875	0.482759
5	0.30	0.93750	0.379310
6	0.35	0.90625	0.172414
7	0.40	0.90625	0.172414
8	0.45	0.90625	0.137931
9	0.50	0.90625	0.137931
10	0.55	0.78125	0.137931
11	0.60	0.65625	0.068966
12	0.65	0.46875	0.034483
13	0.70	0.28125	0.000000
14	0.75	0.15625	0.000000
15	0.80	0.06250	0.000000
16	0.85	0.00000	0.000000
17	0.90	0.00000	0.000000
18	0.95	0.00000	0.000000
19	0.99	0.00000	0.000000

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	thal	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1	1
1	37	1	2	130	250	0	1	187	0	3.5	0	2	1
2	41	0	1	130	204	0	0	172	0	1.4	2	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	2	1

	age	sex	cp	trestbps	chol	restecg	thalach	exang	oldpeak	slope	ca	thal	target
42	45	1	0	104	208	0	148	1	3.0	1	0	2	1
101	59	1	3	178	270	0	145	0	4.2	0	0	3	1
139	64	1	0	128	263	1	105	1	0.2	1	1	3	1

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	thal	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1	1
1	37	1	2	130	250	0	1	187	0	3.5	0	2	1
2	41	0	1	130	204	0	0	172	0	1.4	2	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	2	1

	age	sex	cp	trestbps	chol	restecg	thalach	exang	oldpeak	slope	ca	thal	target
42	45	1	0	104	208	0	148	1	3.0	1	0	2	1
101	59	1	3	178	270	0	145	0	4.2	0	0	3	1
139	64	1	0	128	263	1	105	1	0.2	1	1	3	1