In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import os
os.chdir(r'D:\Data\Projects\Classification\Heart Disease')

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
plt.rcParams['font.size'] = 15
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
from IPython.core.pylabtools import figsize
figsize(10, 10)
In [2]:
import plotly.offline as py
import plotly.graph_objs as go
In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, roc_curve, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
In [4]:
df = pd.read_csv('heart.csv')
df.shape
Out[4]:
(303, 14)
In [5]:
df.head(5)
Out[5]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
0 63 1 3 145 233 1 0 150 0 2.3 0 0 1 1
1 37 1 2 130 250 0 1 187 0 3.5 0 0 2 1
2 41 0 1 130 204 0 0 172 0 1.4 2 0 2 1
3 56 1 1 120 236 0 1 178 0 0.8 2 0 2 1
4 57 0 0 120 354 0 1 163 1 0.6 2 0 2 1
In [6]:
x = df.drop('target', axis=1)
y = df.target
x.shape, y.shape
Out[6]:
((303, 13), (303,))
In [7]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[7]:
((242, 13), (61, 13), (242,), (61,))
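
The split is not stratified, so the class ratio can differ slightly between the two sets. A variant (not used in this notebook) that preserves the ratio would pass stratify=y:

# optional, not used here: keep the class ratio equal in both splits
# X_train, X_test, y_train, y_test = train_test_split(
#     x, y, test_size=0.2, random_state=42, stratify=y)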

Standardizing the Data with StandardScaler

Mean 0, standard deviation 1. For some of the algorithms, such as logistic regression and support vector machines, standardization is important to achieve optimal results.
The scaler is fitted exclusively on X_train; X_train and X_test are then both transformed.

In [8]:
sc = StandardScaler()
sc.fit(X_train)
Out[8]:
StandardScaler(copy=True, with_mean=True, with_std=True)
In [9]:
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
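
As a quick sanity check (a sketch; transform returns NumPy arrays), the scaled training columns should now have mean ≈ 0 and standard deviation ≈ 1:

print(np.round(X_train.mean(axis=0), 2))  # ~0 for every feature
print(np.round(X_train.std(axis=0), 2))   # ~1 for every feature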

Baseline Model

The dummy classifier with strategy='uniform' generates predictions uniformly at random; it ignores the features entirely and serves as a naive reference point.
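
Besides 'uniform', DummyClassifier offers further strategies such as 'most_frequent' and 'stratified'; a minimal sketch for comparing their test accuracies:

for strategy in ('uniform', 'most_frequent', 'stratified'):
    d = DummyClassifier(strategy=strategy, random_state=1)
    d.fit(X_train, y_train)
    print(strategy, d.score(X_test, y_test))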

In [10]:
# Create dummy classifier
dummy = DummyClassifier(strategy='uniform', random_state=1)

# "Train" model
dummy.fit(X_train, y_train)
Out[10]:
DummyClassifier(constant=None, random_state=1, strategy='uniform')
In [11]:
# Get accuracy score
dummy.score(X_test, y_test) 
Out[11]:
0.4426229508196721

Baseline Logistic Regression

In [12]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
Out[12]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [13]:
y_pred = lr.predict(X_test)
In [14]:
accuracy_score(y_test, y_pred)
Out[14]:
0.8524590163934426

Grid Search for the Best Hyperparameters

C: the higher the value, the less the model is regularized (default: 1.0)
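
A quick illustration of this effect (a sketch on the scaled training data from above): a smaller C means a stronger L2 penalty and therefore smaller coefficients.

# smaller C -> stronger regularization -> coefficients shrink toward 0
for C in (0.001, 1, 1000):
    m = LogisticRegression(C=C).fit(X_train, y_train)
    print(C, np.round(np.abs(m.coef_).mean(), 3))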

In [15]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
In [16]:
clf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid)
In [17]:
clf.fit(X_train, y_train)
C:\Users\Leo\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:813: DeprecationWarning:

The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.

Out[17]:
GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
In [18]:
clf.best_params_
Out[18]:
{'C': 0.01}
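
The mean cross-validated accuracy of this best setting can be inspected as well:

clf.best_score_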
In [19]:
# Logistic Regression with C = 0.01
lr_reg = LogisticRegression(C=0.01)
lr_reg.fit(X_train, y_train)
Out[19]:
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [20]:
y_pred_reg = lr_reg.predict(X_test)
In [21]:
y_proba = lr_reg.predict_proba(X_test)
In [22]:
accuracy_score(y_test, y_pred_reg)
Out[22]:
0.8852459016393442

For the ROC curve I need thresholds; otherwise there is only a single point and the metric effectively just reports accuracy.
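
The next cell scores the hard predictions; for contrast, the AUC from the predicted probabilities (y_proba from In [21]) would use the full ranking of the test patients instead of a single 0.5 operating point:

# AUC based on the full probability ranking rather than hard labels
roc_auc_score(y_test, y_proba[:, 1])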

In [23]:
roc_auc_score(y_test, y_pred_reg) 
Out[23]:
0.8841594827586207
In [24]:
print(classification_report(y_test, y_pred_reg))
              precision    recall  f1-score   support

           0       0.89      0.86      0.88        29
           1       0.88      0.91      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.88        61
weighted avg       0.89      0.89      0.89        61

In [25]:
cm = confusion_matrix(y_test, y_pred_reg)
In [26]:
cm
Out[26]:
array([[25,  4],
       [ 3, 29]], dtype=int64)
In [27]:
plt.rcParams['font.size'] = 20
ax= plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, cmap='Set2', fmt="d");  # annot=True writes the counts into the cells

# labels, title and ticks
ax.set_xlabel('Predicted labels', fontsize=18);
ax.set_ylabel('True labels', fontsize=18); 
ax.set_title('Confusion Matrix'); 
ax.xaxis.set_ticklabels(['0', '1']); 
ax.yaxis.set_ticklabels(['0', '1']);
plt.tight_layout()

Which patients were misclassified?

In [28]:
results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred_reg, 'y_proba': y_proba[:,1]})
In [29]:
results = results.sort_index()
In [30]:
# all misclassified patients; the boolean equality is True exactly when prediction != truth
results[(y_test == 1) == (y_pred_reg == 0)]
Out[30]:
y_test y_pred y_proba
42 1 0 0.320308
101 1 0 0.295621
139 1 0 0.222622
228 0 1 0.580420
254 0 1 0.671738
281 0 1 0.579086
283 0 1 0.623294
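
The same selection can be written more directly on the results frame itself (an equivalent sketch):

# misclassified patients, stated plainly
results[results.y_test != results.y_pred]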
In [31]:
results[(y_test == 1) & (y_pred_reg == 0)]
Out[31]:
y_test y_pred y_proba
42 1 0 0.320308
101 1 0 0.295621
139 1 0 0.222622
In [32]:
df.iloc[[42, 101, 139], :]
Out[32]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
42 45 1 0 104 208 0 0 148 1 3.0 1 0 2 1
101 59 1 3 178 270 0 0 145 0 4.2 0 0 3 1
139 64 1 0 128 263 0 1 105 1 0.2 1 1 3 1
In [33]:
# Median of the values above 0.5
results.loc[results.y_proba > 0.5].median()
Out[33]:
y_test     1.000000
y_pred     1.000000
y_proba    0.634712
dtype: float64
In [34]:
results.loc[results.y_proba > 0.5].min()
Out[34]:
y_test     0.000000
y_pred     1.000000
y_proba    0.506458
dtype: float64
In [35]:
# There are no values at exactly 0.5, so the KDE plot below is somewhat misleading.
In [36]:
figsize(10, 8)
sns.kdeplot(results.y_proba,  shade = True)
plt.axvline(0.5, 0, 1, color='red')
Out[36]:
<matplotlib.lines.Line2D at 0xdbb44a8>
In [37]:
figsize(10, 8)
plt.hist(results.y_proba, bins = 35)
plt.axvline(0.5, 0, 1, color='red')
Out[37]:
<matplotlib.lines.Line2D at 0xdecbe48>
In [38]:
# Compute recall (true positive rate)
tpr = cm[1,1]/(cm[1,1]+cm[1,0])
tpr
Out[38]:
0.90625
In [39]:
fpr= cm[0,1]/(cm[0,1]+cm[0,0])
fpr
Out[39]:
0.13793103448275862
In [40]:
y_pred_prob = lr_reg.predict_proba(X_test)[:,1]
In [41]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, drop_intermediate=False)
In [42]:
thresholds
Out[42]:
array([1.82761476, 0.82761476, 0.81561405, 0.77176793, 0.76012406,
       0.75441072, 0.74483677, 0.72700317, 0.71953579, 0.70609397,
       0.6924657 , 0.69191742, 0.69102066, 0.68246915, 0.6821203 ,
       0.67215224, 0.67173783, 0.63471191, 0.62922645, 0.62329438,
       0.61512563, 0.61164029, 0.60441336, 0.60192246, 0.58041964,
       0.57908619, 0.57658788, 0.57426729, 0.5564351 , 0.55032692,
       0.54895029, 0.53400122, 0.51931432, 0.50645842, 0.4426297 ,
       0.34051001, 0.3231265 , 0.32030757, 0.31697003, 0.31344301,
       0.31325635, 0.31064888, 0.29562134, 0.29010399, 0.26999105,
       0.25256104, 0.23956276, 0.23577731, 0.23201928, 0.22262163,
       0.2118171 , 0.20131673, 0.19987921, 0.18656378, 0.17940912,
       0.17239143, 0.16734111, 0.15850851, 0.1426913 , 0.13795365,
       0.10795221, 0.10593281])

It may seem surprising that the first threshold value is above 1. This is not a bug but intentional: it ensures that the point with TPR = 0 and FPR = 0 appears on the curve.
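
This can be verified directly; in the scikit-learn version used here the first entry is thresholds[1] + 1 (newer releases use np.inf instead):

print(thresholds[0], thresholds[1] + 1)  # both 1.8276... here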

In [43]:
figsize(10, 7)
plt.plot(fpr, tpr)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.title('ROC Curve Heart Disease')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
In [44]:
fpr
Out[44]:
array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.03448276, 0.03448276, 0.03448276, 0.06896552,
       0.06896552, 0.06896552, 0.06896552, 0.06896552, 0.10344828,
       0.13793103, 0.13793103, 0.13793103, 0.13793103, 0.13793103,
       0.13793103, 0.13793103, 0.13793103, 0.13793103, 0.17241379,
       0.20689655, 0.24137931, 0.24137931, 0.27586207, 0.31034483,
       0.34482759, 0.37931034, 0.37931034, 0.4137931 , 0.44827586,
       0.48275862, 0.51724138, 0.55172414, 0.5862069 , 0.5862069 ,
       0.62068966, 0.65517241, 0.68965517, 0.72413793, 0.75862069,
       0.79310345, 0.82758621, 0.86206897, 0.89655172, 0.93103448,
       0.96551724, 1.        ])
In [45]:
plt.subplots(figsize=(15, 8))
plt.plot(fpr, tpr, 'o-', label="ROC curve")
plt.plot(np.linspace(0,1,10), np.linspace(0,1,10), label="diagonal")

for x, y, txt in zip(fpr[::5], tpr[::5], thresholds[::5]):
    plt.annotate(np.round(txt,2), (x, y-0.04))
rnd_idx = 27
plt.annotate( 'this point refers to the tpr and the fpr\n at a probability threshold of {}'.format(np.round(thresholds[rnd_idx], 2)),
             xy=(fpr[rnd_idx], tpr[rnd_idx]), xytext=(fpr[rnd_idx]+0.2, tpr[rnd_idx]-0.25),
             arrowprops=dict(facecolor='black', lw=2, arrowstyle='->'),)

plt.legend(loc="upper left")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
Out[45]:
Text(0, 0.5, 'True Positive Rate')
In [46]:
print(tuple(zip(fpr[::5], tpr[::5], thresholds[::5])))
((0.0, 0.0, 1.8276147559473768), (0.0, 0.15625, 0.7544107230701607), (0.0, 0.3125, 0.6924657001385295), (0.0, 0.46875, 0.6721522398983056), (0.06896551724137931, 0.5625, 0.615125634698659), (0.13793103448275862, 0.65625, 0.5790861931748899), (0.13793103448275862, 0.8125, 0.5489502854374858), (0.20689655172413793, 0.90625, 0.34051000985373364), (0.3448275862068966, 0.9375, 0.3132563526280901), (0.4827586206896552, 0.96875, 0.25256103676597336), (0.6206896551724138, 1.0, 0.21181709901921705), (0.7931034482758621, 1.0, 0.1723914336750182), (0.9655172413793104, 1.0, 0.10795221374734966))

ROC Curve with Plotly

In [47]:
trace1 = go.Scatter(x=fpr, y=tpr, 
                    mode='lines', 
                    line=dict(color='darkorange'),
                    name='ROC curve'
                   )
trace2 = go.Scatter(x=[-0.05, 1.05], y=[-0.05, 1.05], 
                    mode='lines', 
                    line=dict(color='navy',  dash='dash'),
                    showlegend=False)

layout = go.Layout(title='Receiver operating characteristic Heart Disease',
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))

fig = go.Figure(data=[trace1, trace2], layout=layout)
In [48]:
py.iplot(fig)

Test with class_weight = 'balanced'

In [49]:
lr_bal = LogisticRegression(C=0.01, class_weight='balanced')
lr_bal.fit(X_train, y_train)
Out[49]:
LogisticRegression(C=0.01, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [50]:
y_pred_bal = lr_bal.predict(X_test)
In [51]:
accuracy_score(y_test, y_pred_bal)
Out[51]:
0.8688524590163934

Logistic regression with C = 0.01 achieves the best accuracy, 0.885.

Model on the Dataset with Polynomial and Interaction Features
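
df_poly.csv was prepared in a separate step. As a heavily hedged sketch, such a frame could be built with scikit-learn's PolynomialFeatures; the chosen feature subset and the handling of duplicate linear terms below are assumptions, not the actual preprocessing:

# hypothetical reconstruction, not the notebook's actual preprocessing step
from sklearn.preprocessing import PolynomialFeatures

features = ['cp', 'thalach', 'ca', 'oldpeak', 'thal', 'age']  # assumed subset
poly = PolynomialFeatures(degree=3, include_bias=True)
poly_arr = poly.fit_transform(df[features])
cols = poly.get_feature_names(features)  # get_feature_names_out in newer sklearn
poly_df = pd.DataFrame(poly_arr, columns=cols, index=df.index)
poly_df = poly_df.drop(columns=features)  # linear terms are already in df
df_poly_sketch = df.join(poly_df)         # 14 + 78 = 92 columns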

In [52]:
df_poly = pd.read_csv('df_poly.csv')
df_poly.head()
Out[52]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target 1 cp^2 cp thalach cp ca cp oldpeak cp thal cp age thalach^2 thalach ca thalach oldpeak thalach thal ... thalach oldpeak thal thalach oldpeak age thalach thal^2 thalach thal age thalach age^2 ca^3 ca^2 oldpeak ca^2 thal ca^2 age ca oldpeak^2 ca oldpeak thal ca oldpeak age ca thal^2 ca thal age ca age^2 oldpeak^3 oldpeak^2 thal oldpeak^2 age oldpeak thal^2 oldpeak thal age oldpeak age^2 thal^3 thal^2 age thal age^2 age^3
0 63 1 3 145 233 1 0 150 0 2.3 0 0 1 1 1.0 9.0 450.0 0.0 6.9 3.0 189.0 22500.0 0.0 345.0 150.0 ... 345.0 21735.0 150.0 9450.0 595350.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12.167 5.29 333.27 2.3 144.9 9128.7 1.0 63.0 3969.0 250047.0
1 37 1 2 130 250 0 1 187 0 3.5 0 0 2 1 1.0 4.0 374.0 0.0 7.0 4.0 74.0 34969.0 0.0 654.5 374.0 ... 1309.0 24216.5 748.0 13838.0 256003.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 42.875 24.50 453.25 14.0 259.0 4791.5 8.0 148.0 2738.0 50653.0
2 41 0 1 130 204 0 0 172 0 1.4 2 0 2 1 1.0 1.0 172.0 0.0 1.4 2.0 41.0 29584.0 0.0 240.8 344.0 ... 481.6 9872.8 688.0 14104.0 289132.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.744 3.92 80.36 5.6 114.8 2353.4 8.0 164.0 3362.0 68921.0
3 56 1 1 120 236 0 1 178 0 0.8 2 0 2 1 1.0 1.0 178.0 0.0 0.8 2.0 56.0 31684.0 0.0 142.4 356.0 ... 284.8 7974.4 712.0 19936.0 558208.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.512 1.28 35.84 3.2 89.6 2508.8 8.0 224.0 6272.0 175616.0
4 57 0 0 120 354 0 1 163 1 0.6 2 0 2 1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 26569.0 0.0 97.8 326.0 ... 195.6 5574.6 652.0 18582.0 529587.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.216 0.72 20.52 2.4 68.4 1949.4 8.0 228.0 6498.0 185193.0

5 rows × 92 columns

In [53]:
x_p = df_poly.drop('target', axis=1)
y = df_poly.target
x_p.shape, y.shape
Out[53]:
((303, 91), (303,))
In [54]:
X_train_p, X_test_p, y_train, y_test = train_test_split(x_p, y, test_size=0.2, random_state=42)
X_train_p.shape, X_test_p.shape, y_train.shape, y_test.shape
Out[54]:
((242, 91), (61, 91), (242,), (61,))

Standardizing the Polynomial Dataset

In [55]:
sc.fit(X_train_p)
X_train_p = sc.transform(X_train_p)
X_test_p = sc.transform(X_test_p)
In [56]:
lr_reg_p = LogisticRegression(C=0.01)
lr_reg_p.fit(X_train_p, y_train)
Out[56]:
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [57]:
y_p_pred = lr_reg_p.predict(X_test_p)
In [58]:
accuracy_score(y_test, y_p_pred)
Out[58]:
0.8852459016393442

With polynomial features, the model performs identically on the test set.

Testing Different Thresholds to Find the Optimal Recall

When choosing a metric for model quality, the problem setting is crucial. If the task is to predict whether a patient is sick or healthy, recall matters more than accuracy: it is better to send some healthy patients for further testing than to classify a sick patient as healthy.
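
Recall for the regularized model can be read off the classification report above or computed directly (a minimal sketch):

from sklearn.metrics import recall_score

recall_score(y_test, y_pred_reg)  # 0.90625: 29 of the 32 sick patients are found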

Moreover, instead of predicting the classes directly, we now predict the probability of belonging to a class. This makes it possible to identify risk groups.
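
A single custom threshold can be applied directly to the probability column; the loop below generalizes this idea across many thresholds (a compact sketch):

risk = lr_reg.predict_proba(X_test)[:, 1]
y_pred_030 = (risk > 0.30).astype(int)  # e.g. flag everyone above 30% risk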

In [59]:
# DataFrame of the predicted probabilities
pred_proba_df = pd.DataFrame(lr_reg.predict_proba(X_test))

# list of decision thresholds
threshold_list = [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,.7,.75,.8,.85,.9,.95,.99]
tpr_list = []
fpr_list = []

for i in threshold_list:
    print ('\n****** For threshold = {} ******'.format(i))
    y_test_pred = pred_proba_df.applymap(lambda x: 1 if x > i else 0)
    
    # accuracy for threshold i, based on the class-1 prediction column
    # (the deprecated .as_matrix() reshaping is not needed here)
    test_accuracy = accuracy_score(y_test, y_test_pred.iloc[:, 1])
    print('Testing Accuracy: {}'.format(test_accuracy))
    
    # confusion matrix for threshold i
    con = confusion_matrix(y_test, y_test_pred.iloc[:, 1])
   
    tpr = con[1,1]/(con[1,1]+con[1,0])
    fpr = con[0,1]/(con[0,1]+con[0,0])
    tpr_list.append(tpr)
    fpr_list.append(fpr)
    
    print('Confusion Matrix: ')
    print(con)
    
    print('True Positive Rate: ', tpr)
    print('False Positive Rate: ', fpr)

    
****** For threshold = 0.05 ******
Testing Accuracy: 0.5245901639344263
Confusion Matrix: 
[[ 0 29]
 [ 0 32]]
True Positive Rate:  1.0
False Positive Rate:  1.0

****** For threshold = 0.1 ******
Testing Accuracy: 0.5245901639344263
Confusion Matrix: 
[[ 0 29]
 [ 0 32]]
True Positive Rate:  1.0
False Positive Rate:  1.0

****** For threshold = 0.15 ******
Testing Accuracy: 0.5901639344262295
Confusion Matrix: 
[[ 4 25]
 [ 0 32]]
True Positive Rate:  1.0
False Positive Rate:  0.8620689655172413

****** For threshold = 0.2 ******
Testing Accuracy: 0.6885245901639344
Confusion Matrix: 
[[10 19]
 [ 0 32]]
True Positive Rate:  1.0
False Positive Rate:  0.6551724137931034

****** For threshold = 0.25 ******
Testing Accuracy: 0.7540983606557377
Confusion Matrix: 
[[15 14]
 [ 1 31]]
True Positive Rate:  0.96875
False Positive Rate:  0.4827586206896552

****** For threshold = 0.3 ******
Testing Accuracy: 0.7868852459016393
Confusion Matrix: 
[[18 11]
 [ 2 30]]
True Positive Rate:  0.9375
False Positive Rate:  0.3793103448275862

****** For threshold = 0.35 ******
Testing Accuracy: 0.8688524590163934
Confusion Matrix: 
[[24  5]
 [ 3 29]]
True Positive Rate:  0.90625
False Positive Rate:  0.1724137931034483

****** For threshold = 0.4 ******
Testing Accuracy: 0.8688524590163934
Confusion Matrix: 
[[24  5]
 [ 3 29]]
True Positive Rate:  0.90625
False Positive Rate:  0.1724137931034483

****** For threshold = 0.45 ******
Testing Accuracy: 0.8852459016393442
Confusion Matrix: 
[[25  4]
 [ 3 29]]
True Positive Rate:  0.90625
False Positive Rate:  0.13793103448275862

****** For threshold = 0.5 ******
Testing Accuracy: 0.8852459016393442
Confusion Matrix: 
[[25  4]
 [ 3 29]]
True Positive Rate:  0.90625
False Positive Rate:  0.13793103448275862

****** For threshold = 0.55 ******
Testing Accuracy: 0.819672131147541
Confusion Matrix: 
[[25  4]
 [ 7 25]]
True Positive Rate:  0.78125
False Positive Rate:  0.13793103448275862

****** For threshold = 0.6 ******
Testing Accuracy: 0.7868852459016393
Confusion Matrix: 
[[27  2]
 [11 21]]
True Positive Rate:  0.65625
False Positive Rate:  0.06896551724137931

****** For threshold = 0.65 ******
Testing Accuracy: 0.7049180327868853
Confusion Matrix: 
[[28  1]
 [17 15]]
True Positive Rate:  0.46875
False Positive Rate:  0.034482758620689655

****** For threshold = 0.7 ******
Testing Accuracy: 0.6229508196721312
Confusion Matrix: 
[[29  0]
 [23  9]]
True Positive Rate:  0.28125
False Positive Rate:  0.0

****** For threshold = 0.75 ******
Testing Accuracy: 0.5573770491803278
Confusion Matrix: 
[[29  0]
 [27  5]]
True Positive Rate:  0.15625
False Positive Rate:  0.0

****** For threshold = 0.8 ******
Testing Accuracy: 0.5081967213114754
Confusion Matrix: 
[[29  0]
 [30  2]]
True Positive Rate:  0.0625
False Positive Rate:  0.0

****** For threshold = 0.85 ******
Testing Accuracy: 0.47540983606557374
Confusion Matrix: 
[[29  0]
 [32  0]]
True Positive Rate:  0.0
False Positive Rate:  0.0

****** For threshold = 0.9 ******
Testing Accuracy: 0.47540983606557374
Confusion Matrix: 
[[29  0]
 [32  0]]
True Positive Rate:  0.0
False Positive Rate:  0.0

****** For threshold = 0.95 ******
Testing Accuracy: 0.47540983606557374
Confusion Matrix: 
[[29  0]
 [32  0]]
True Positive Rate:  0.0
False Positive Rate:  0.0

****** For threshold = 0.99 ******
Testing Accuracy: 0.47540983606557374
Confusion Matrix: 
[[29  0]
 [32  0]]
True Positive Rate:  0.0
False Positive Rate:  0.0
In [60]:
# reverse the threshold list (descending order)
rev = [i for i in threshold_list[::-1]]
rev
Out[60]:
[0.99,
 0.95,
 0.9,
 0.85,
 0.8,
 0.75,
 0.7,
 0.65,
 0.6,
 0.55,
 0.5,
 0.45,
 0.4,
 0.35,
 0.3,
 0.25,
 0.2,
 0.15,
 0.1,
 0.05]
In [61]:
# the same result using reversed()
[i for i in reversed(threshold_list)]
Out[61]:
[0.99,
 0.95,
 0.9,
 0.85,
 0.8,
 0.75,
 0.7,
 0.65,
 0.6,
 0.55,
 0.5,
 0.45,
 0.4,
 0.35,
 0.3,
 0.25,
 0.2,
 0.15,
 0.1,
 0.05]
In [62]:
rates = pd.DataFrame({'Threshold' : threshold_list,
                     'True Positive Rate': tpr_list,
                     'False Positive Rate': fpr_list})
In [63]:
rates
Out[63]:
Threshold True Positive Rate False Positive Rate
0 0.05 1.00000 1.000000
1 0.10 1.00000 1.000000
2 0.15 1.00000 0.862069
3 0.20 1.00000 0.655172
4 0.25 0.96875 0.482759
5 0.30 0.93750 0.379310
6 0.35 0.90625 0.172414
7 0.40 0.90625 0.172414
8 0.45 0.90625 0.137931
9 0.50 0.90625 0.137931
10 0.55 0.78125 0.137931
11 0.60 0.65625 0.068966
12 0.65 0.46875 0.034483
13 0.70 0.28125 0.000000
14 0.75 0.15625 0.000000
15 0.80 0.06250 0.000000
16 0.85 0.00000 0.000000
17 0.90 0.00000 0.000000
18 0.95 0.00000 0.000000
19 0.99 0.00000 0.000000
In [64]:
# drop some rows so the threshold annotations in the plot below don't overlap
rate = rates.drop([0, 1, 6, 7, 17, 18, 19])
In [65]:
rate = rate.reset_index(drop=True)
In [66]:
rate
Out[66]:
Threshold True Positive Rate False Positive Rate
0 0.15 1.00000 0.862069
1 0.20 1.00000 0.655172
2 0.25 0.96875 0.482759
3 0.30 0.93750 0.379310
4 0.45 0.90625 0.137931
5 0.50 0.90625 0.137931
6 0.55 0.78125 0.137931
7 0.60 0.65625 0.068966
8 0.65 0.46875 0.034483
9 0.70 0.28125 0.000000
10 0.75 0.15625 0.000000
11 0.80 0.06250 0.000000
12 0.85 0.00000 0.000000
In [67]:
tpr = rate['True Positive Rate']
fpr = rate['False Positive Rate']
th = rate['Threshold']
figsize(15, 15)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.scatter(fpr, tpr)

for i, txt in enumerate(th):
    plt.annotate(txt, (fpr[i]+0.01, tpr[i]))
In [ ]: