import pandas as pd
def _format_three_decimals(value):
    """Render a number with exactly three decimal places for DataFrame display."""
    return '%.3f' % value


# Show up to 50 columns and print floats with three decimals.
pd.options.display.max_columns = 50
pd.options.display.float_format = _format_three_decimals

import numpy as np

import os
# Switch to the project folder that holds the data file.
# Raw string: the backslashes of the Windows path must not be parsed as escape sequences.
os.chdir(r'D:\Data\Projects\Klassifikation\Heart Disease')

import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
plt.rcParams['font.size'] = 15
from IPython.core.pylabtools import figsize
figsize(10, 10)

import seaborn as sns

from warnings import filterwarnings
filterwarnings('ignore')

Einlesen¶

# Load the data set from 'heart.csv' in the current working directory
# and preview its first rows.
df = pd.read_csv('heart.csv')
df.head()

Datentypen der Variablen¶

# Display the dtype of every column.
df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

Fehlende Werte¶

# Per-column count of missing values and their share of all rows in percent.
# Only columns that actually have missing values are kept, largest share first.
missing = pd.DataFrame({'total': df.isnull().sum()})
missing['percent'] = missing['total'] / len(df) * 100
missing = (
    missing.loc[missing['total'] != 0]
    .sort_values('percent', ascending=False)
)
missing

Statistischer Überblick¶

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Visualisierung der Daten¶

# Visualization with plotly: pie chart of the two target classes
target = df['target'].value_counts()
levels = ['1', '0']  # not used by this cell
layout = {
    'title': "Anteile Herzkrank und Gesund",
    'margin': {'l': 150},
    'width': 500,
    'height': 500,
}
trace = go.Pie(
    labels=target.index,
    values=target.values,
    marker={'colors': ('orange', 'blue')},
)
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
# Print the number of rows per target class
print(target)

1    165
0    138
Name: target, dtype: int64

# Age groups in the data set
# Ten-year bins (28, 39], (39, 49], (49, 59], (59, 69]. The lowest edge is 28, not 29,
# because pd.cut intervals are open on the left: with an edge at 29 the youngest
# patients (age 29, the minimum of the data) fell outside every bin and were then
# relabelled as the oldest group by the fallback below.
age_edges = [28] + list(range(39, 77, 10))
df['age_bin'] = pd.cut(df['age'], bins=age_edges).astype(str)
# Ages above the last edge (69) are not binned and turn into the string 'nan';
# give them their own label. NOTE: any other unbinned age (missing or below 29)
# would also end up here.
df.loc[df['age_bin'] == 'nan', 'age_bin'] = '[69-77]'

# Bar chart of the number of rows per age group, ordered by group label
age_bin_counts = df['age_bin'].value_counts().sort_index()
age_bin_counts.plot.bar(color='b', edgecolor='k');

# Histogram of age with a kernel density estimate on top
sns.distplot(df['age'], kde=True)

<matplotlib.axes._subplots.AxesSubplot at 0x149259e8>

# List the column names (now including the derived 'age_bin' column).
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target', 'age_bin'],
      dtype='object')

# Count plot of rows per sex, split by target class.
# The tick labels rename the sex codes 0 and 1 to 'Female' and 'Male'.
g = sns.catplot(
    data=df,
    x="sex",
    hue='target',
    kind="count",
    palette="Set2",
);
g.set_xticklabels(['Female', 'Male']);

Interessanter als die absoluten Werte ist eigentlich, wie viel Prozent der Frauen bzw. der Männer krank sind.

# Mean of target per sex, i.e. the share of rows with target == 1, in percent.
al = df.groupby('sex')['target'].mean().reset_index()
al['target'] = al['target'] * 100
# Sex is coded 0 = female, 1 = male, matching the tick labels of the count plot
# of sex in this notebook. The previous mapping had the two labels swapped.
al['sex'] = al['sex'].replace({0: 'female', 1: 'male'})
al

px.bar(al, x='sex', y='target', color='sex', title='Heart Disease in Males and Females in %', width=500, height=600)

figsize(15, 10)
# One count plot per categorical feature, each split by target class
categorical_features = ('cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal')
for feature in categorical_features:
    g = sns.catplot(data=df, x=feature, hue='target', kind="count", palette="Set2");

# Grouped histogram of the chest pain type (cp), one bar colour per target class.
px.histogram(df, x='cp', color = 'target', barmode="group",  
             template= 'plotly_dark', title='Chest Pain Value', width=700, height=500)

# Grouped histogram of exercise induced angina (exang), one bar colour per target class.
px.histogram(df, x='exang', color = 'target', barmode="group",  template= 'plotly_dark', 
             title='Exercise Induced Angina', width=700, height=500)

# Relationship between age (and other numeric features) and heart disease
# Visualizations with seaborn: KDE (kernel density estimate)
def kdeplot(feature):
    """Plot the kernel density estimate of ``feature`` for both target classes.

    Opens a new 10x5 matplotlib figure and draws two density curves taken from
    the module-level ``df``: rows with ``target == 0`` in navy and rows with
    ``target == 1`` in orange. Missing values of ``feature`` are dropped first.

    Args:
        feature: Name of a column in ``df``.
    """
    plt.figure(figsize=(10, 5))
    plt.title("Kernel Density Estimate for {}".format(feature.capitalize()))
    sns.kdeplot(df.loc[df['target'] == 0, feature].dropna(), color='navy', label='No Heart Disease')
    sns.kdeplot(df.loc[df['target'] == 1, feature].dropna(), color='orange', label='Heart Disease')
    # Draw the legend explicitly so the two labels are shown regardless of
    # whether the installed seaborn version adds a legend for `label=` itself.
    plt.legend()


# One KDE figure per listed feature
kde_features = ('age', 'trestbps', 'chol', 'thalach', 'oldpeak')
for column in kde_features:
    kdeplot(column)

Cluster Analyse¶

from sklearn.cluster import KMeans

# Cluster the rows with target == 0 into three groups by age and oldpeak.
# A single .loc selection plus an explicit copy avoids assigning the label
# column to a slice of df (SettingWithCopyWarning).
# NOTE(review): the plot title says "Sick People", but the filter keeps target == 0,
# which the KDE plots in this file label 'No Heart Disease' — confirm which is intended.
clu = df.loc[df.target == 0, ['age', 'oldpeak']].copy()
kmeans = KMeans(n_clusters=3, random_state=0).fit(clu)
clu['label'] = kmeans.labels_

# Scatter plot of the clustered rows, coloured by cluster label
figsize(10, 10)
plt.scatter(clu['age'], clu['oldpeak'], c=clu['label'], cmap='Accent')
plt.xlabel('Age (Years)')
plt.ylabel('Oldpeak (was)')
plt.title('3 Clusters of Sick People')
plt.show()

Korrelation mit dem Target¶

# Correlation of every feature column with the target, sorted ascending.
# The target itself and the string column 'age_bin' are left out.
feature_cols = df.drop(columns=['target', 'age_bin'])
corr_tar = feature_cols.apply(lambda col: col.corr(df['target'])).sort_values()
corr_tar

exang      -0.437
oldpeak    -0.431
ca         -0.392
thal       -0.344
sex        -0.281
age        -0.225
trestbps   -0.145
chol       -0.085
fbs        -0.028
restecg     0.137
slope       0.346
thalach     0.422
cp          0.434
dtype: float64

# Absolute values for easier comparison, strongest correlation first
ct = corr_tar.abs().sort_values(ascending=False)
ct

exang      0.437
cp         0.434
oldpeak    0.431
thalach    0.422
ca         0.392
slope      0.346
thal       0.344
sex        0.281
age        0.225
trestbps   0.145
restecg    0.137
chol       0.085
fbs        0.028
dtype: float64

# Bar chart of the absolute correlations with the target.
ct.plot.bar();

Korrelation aller Variablen¶

# Correlation matrix of all numeric columns, drawn as an annotated heatmap.
# Select the numeric columns first: df also holds the string column 'age_bin',
# and DataFrame.corr() raises on non-numeric columns in newer pandas versions.
corrs = df.select_dtypes(include='number').corr()
plt.figure(figsize=(15, 15))
sns.heatmap(corrs, annot=True, vmin=-1, vmax=1, fmt='.3f', cmap='viridis');

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	thal	target
0	63	1	3	145	233	1	0	150	0	2.300	0	1	1
1	37	1	2	130	250	0	1	187	0	3.500	0	2	1
2	41	0	1	130	204	0	0	172	0	1.400	2	2	1
3	56	1	1	120	236	0	1	178	0	0.800	2	2	1
4	57	0	0	120	354	0	1	163	1	0.600	2	2	1

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	ca	thal	target
count	303.000	303.000	303.000	303.000	303.000	303.000	303.000	303.000	303.000	303.000	303.000	303.000	303.000	303.000
mean	54.366	0.683	0.967	131.624	246.264	0.149	0.528	149.647	0.327	1.040	1.399	0.729	2.314	0.545
std	9.082	0.466	1.032	17.538	51.831	0.356	0.526	22.905	0.470	1.161	0.616	1.023	0.612	0.499
min	29.000	0.000	0.000	94.000	126.000	0.000	0.000	71.000	0.000	0.000	0.000	0.000	0.000	0.000
25%	47.500	0.000	0.000	120.000	211.000	0.000	0.000	133.500	0.000	0.000	1.000	0.000	2.000	0.000
50%	55.000	1.000	1.000	130.000	240.000	0.000	1.000	153.000	0.000	0.800	1.000	0.000	2.000	1.000
75%	61.000	1.000	2.000	140.000	274.500	0.000	1.000	166.000	1.000	1.600	2.000	1.000	3.000	1.000
max	77.000	1.000	3.000	200.000	564.000	1.000	2.000	202.000	1.000	6.200	2.000	4.000	3.000	1.000

	sex	target
0	male	75.000
1	female	44.928