In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import numpy as np

import os
os.chdir('D:\Data\Projects\Klassifikation\Heart Disease')

import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
plt.rcParams['font.size'] = 15
from IPython.core.pylabtools import figsize
figsize(10, 10)

import seaborn as sns

from warnings import filterwarnings
filterwarnings('ignore')

Einlesen

In [2]:
df = pd.read_csv('heart.csv')
df.head()
Out[2]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
0 63 1 3 145 233 1 0 150 0 2.300 0 0 1 1
1 37 1 2 130 250 0 1 187 0 3.500 0 0 2 1
2 41 0 1 130 204 0 0 172 0 1.400 2 0 2 1
3 56 1 1 120 236 0 1 178 0 0.800 2 0 2 1
4 57 0 0 120 354 0 1 163 1 0.600 2 0 2 1

Datentypen der Variablen

In [3]:
df.dtypes
Out[3]:
age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

Fehlende Werte

In [4]:
missing = pd.DataFrame(df.isnull().sum()).rename(columns = {0: 'total'})
missing['percent'] = missing['total'] / len(df)*100
missing = missing[missing.total != 0]
missing = missing.sort_values('percent', ascending = False)
missing
Out[4]:
total percent

Statistischer Überblick

In [5]:
df.describe()
Out[5]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
count 303.000 303.000 303.000 303.000 303.000 303.000 303.000 303.000 303.000 303.000 303.000 303.000 303.000 303.000
mean 54.366 0.683 0.967 131.624 246.264 0.149 0.528 149.647 0.327 1.040 1.399 0.729 2.314 0.545
std 9.082 0.466 1.032 17.538 51.831 0.356 0.526 22.905 0.470 1.161 0.616 1.023 0.612 0.499
min 29.000 0.000 0.000 94.000 126.000 0.000 0.000 71.000 0.000 0.000 0.000 0.000 0.000 0.000
25% 47.500 0.000 0.000 120.000 211.000 0.000 0.000 133.500 0.000 0.000 1.000 0.000 2.000 0.000
50% 55.000 1.000 1.000 130.000 240.000 0.000 1.000 153.000 0.000 0.800 1.000 0.000 2.000 1.000
75% 61.000 1.000 2.000 140.000 274.500 0.000 1.000 166.000 1.000 1.600 2.000 1.000 3.000 1.000
max 77.000 1.000 3.000 200.000 564.000 1.000 2.000 202.000 1.000 6.200 2.000 4.000 3.000 1.000

Visualisierung der Daten

In [6]:
# Visualisierung mit plotly
target= df['target'].value_counts()
levels = ['1','0']
trace = go.Pie(labels=target.index,values=target.values, marker=dict(colors=('orange','blue')))
layout = dict(title="Anteile Herzkrank und Gesund", margin=dict(l=150), width=500, height=500)
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
#print target class counts
print(target)
1    165
0    138
Name: target, dtype: int64
In [7]:
# Altersgruppen im Datensatz
df['age_bin'] = pd.cut(df['age'], bins = list(range(29, 77, 10))).astype(str)
df.loc[df['age_bin'] == 'nan', 'age_bin'] = '[69-77]'
In [33]:
df['age_bin'].value_counts().sort_index().plot.bar(color = 'b', edgecolor = 'k');
In [32]:
sns.distplot(df.age, kde = True)
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x149259e8>
In [10]:
df.columns
Out[10]:
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target', 'age_bin'],
      dtype='object')
In [11]:
g=sns.catplot(x="sex", kind="count", palette="Set2", data=df, hue='target');
g.set_xticklabels(['Female', 'Male']);

Es ist eigentlich interessanter, wie viele von den Frauen und wie viele von den Männern krank sind in %, als die absoluten Werte.

In [12]:
al= df.groupby('sex')['target'].mean().reset_index()
al.target = al.target*100
al.sex = al.sex.replace({0:'male', 1:'female'})
al
Out[12]:
sex target
0 male 75.000
1 female 44.928
In [27]:
px.bar(al, x='sex', y='target', color='sex', title='Heart Disease in Males and Females in %', width=500, height=600)
In [14]:
figsize(15, 10)
for i in ['cp', 'fbs', 'restecg','exang', 'slope', 'ca', 'thal']:
       g=sns.catplot(x=i, kind="count", palette="Set2", data=df, hue='target');  
In [30]:
px.histogram(df, x='cp', color = 'target', barmode="group",  
             template= 'plotly_dark', title='Chest Pain Value', width=700, height=500)
In [31]:
px.histogram(df, x='exang', color = 'target', barmode="group",  template= 'plotly_dark', 
             title='Exercise Induced Angina', width=700, height=500)
In [17]:
# Zusammenhang Alter und HeartDisease
# Visualisierungen mit Seaborn: KDE (Kerndichteschätzer)
def kdeplot(feature):
    plt.figure(figsize=(10, 5))
    plt.title("Kernel Densitiy Estimate for {}".format(feature.capitalize()))
    ax0 = sns.kdeplot(df[df['target'] == 0][feature].dropna(), color= 'navy', label= 'No Heart Disease')
    ax1 = sns.kdeplot(df[df['target'] == 1][feature].dropna(), color= 'orange', label= 'Heart Disease')


for i in ['age', 'trestbps', 'chol',   'thalach', 'oldpeak' ]:
    kdeplot(i)

Cluster Analyse

In [18]:
from sklearn.cluster import KMeans
In [19]:
clu = df[['age', 'oldpeak']][df.target == 0]
kmeans = KMeans(n_clusters = 3, random_state = 0).fit(clu)
clu['label'] = kmeans.labels_
In [20]:
figsize(10, 10)
plt.scatter(clu['age'], clu['oldpeak'], c=clu['label'], cmap='Accent')
plt.xlabel('Age (Years)')
plt.ylabel('Oldpeak (was)')
plt.title('3 Clusters of Sick People')
plt.show()

Korrelation mit dem Target

In [21]:
corr_tar = df.drop(["target", 'age_bin'], axis=1).apply(lambda x: x.corr(df.target)).sort_values()
corr_tar
Out[21]:
exang      -0.437
oldpeak    -0.431
ca         -0.392
thal       -0.344
sex        -0.281
age        -0.225
trestbps   -0.145
chol       -0.085
fbs        -0.028
restecg     0.137
slope       0.346
thalach     0.422
cp          0.434
dtype: float64
In [22]:
# Absolute Werte zum einfacheren Vergleich
ct= abs(corr_tar).sort_values(ascending=False)
ct
Out[22]:
exang      0.437
cp         0.434
oldpeak    0.431
thalach    0.422
ca         0.392
slope      0.346
thal       0.344
sex        0.281
age        0.225
trestbps   0.145
restecg    0.137
chol       0.085
fbs        0.028
dtype: float64
In [23]:
ct.plot.bar();

Korrelation aller Variablen

In [24]:
corrs = df.corr()
plt.figure(figsize = (15, 15))
sns.heatmap(corrs, annot = True, vmin = -1, vmax = 1, fmt = '.3f', cmap='viridis');