import pandas as pd
# Show up to 50 columns and print floats with three decimals in DataFrame output.
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import numpy as np
import os
# Switch to the project folder so relative paths such as 'heart.csv' resolve.
# Raw string: the backslashes are path separators, not escape sequences; the
# non-raw spelling relies on invalid escapes that newer Python versions warn about.
os.chdir(r'D:\Data\Projects\Klassifikation\Heart Disease')
import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
# Enable offline plotly rendering inside the notebook.
init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
# Global matplotlib look: style sheet and base font size.
plt.style.use('Solarize_Light2')
plt.rcParams['font.size'] = 15
from IPython.core.pylabtools import figsize
# Default figure size for the matplotlib plots that follow.
figsize(10, 10)
import seaborn as sns
from warnings import filterwarnings
# NOTE: this silences every warning for the rest of the session.
filterwarnings('ignore')
# Load the data set and take a first look at the rows and column types.
df = pd.read_csv('heart.csv')
df.head()
df.dtypes
# Missing values per column as absolute count and percentage of all rows.
# Only columns that actually have gaps are kept, largest share first.
missing = (
    df.isnull().sum()
    .to_frame('total')
    .assign(percent=lambda frame: frame['total'] / len(df) * 100)
    .loc[lambda frame: frame['total'] != 0]
    .sort_values('percent', ascending=False)
)
missing
df.describe()
# Visualisation with plotly: pie chart of how often each target value occurs.
target = df['target'].value_counts()
levels = ['1', '0']
layout = {
    'title': "Anteile Herzkrank und Gesund",
    'margin': {'l': 150},
    'width': 500,
    'height': 500,
}
trace = go.Pie(
    labels=target.index,
    values=target.values,
    marker={'colors': ('orange', 'blue')},
)
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
# Print the target class counts shown in the chart.
print(target)
# Age groups in the data set.
# Explicit edges and labels cover the whole 29-77 range. include_lowest=True
# keeps age 29 in the first group. Previously pd.cut only had the right-closed
# bins (29, 39] ... (59, 69], so age 29 as well as every age above 69 became
# NaN and all of them were relabelled as the oldest group.
# Labels assume whole-year ages.
age_edges = [29, 39, 49, 59, 69, 77]
age_labels = ['29-39', '40-49', '50-59', '60-69', '70-77']
df['age_bin'] = pd.cut(
    df['age'], bins=age_edges, labels=age_labels, include_lowest=True
).astype(str)  # ages outside 29-77 stay visible as their own 'nan' group
# Number of patients per age group; the labels sort in age order.
df['age_bin'].value_counts().sort_index().plot.bar(color = 'b', edgecolor = 'k');
# Distribution of the raw ages with a KDE overlay.
sns.distplot(df.age, kde = True)
df.columns
# Number of patients per sex value, split by target class.
g = sns.catplot(data=df, x="sex", hue='target', kind="count", palette="Set2");
# Relabel the two x ticks with readable names.
g.set_xticklabels(['Female', 'Male']);
# It is actually more informative to look at what percentage of the women and of the men are sick, rather than at the absolute counts.
# Mean of target per sex, scaled to percent (share of rows with target == 1).
al = df.groupby('sex')['target'].mean().mul(100).reset_index()
# Sex coding has to agree with the count plot of 'sex' earlier in this file,
# which labels 0 as Female and 1 as Male; the mapping here used to be inverted.
al['sex'] = al['sex'].replace({0: 'female', 1: 'male'})
al
px.bar(al, x='sex', y='target', color='sex', title='Heart Disease in Males and Females in %', width=500, height=600)
figsize(15, 10)
# One seaborn count plot per listed column, split by target class.
for col in ('cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'):
    g = sns.catplot(data=df, x=col, hue='target', kind="count", palette="Set2");
# Grouped plotly histograms for two of those columns; both share the same styling.
hist_opts = {
    'color': 'target',
    'barmode': "group",
    'template': 'plotly_dark',
    'width': 700,
    'height': 500,
}
px.histogram(df, x='cp', title='Chest Pain Value', **hist_opts)
px.histogram(df, x='exang', title='Exercise Induced Angina', **hist_opts)
# Relationship between age and heart disease.
# Visualisations with seaborn: KDE (kernel density estimate).
def kdeplot(feature):
    """Plot the kernel density estimate of one column, split by target class.

    Opens a new 10x5 figure and draws two curves from the module-level ``df``:
    rows with ``target == 0`` (navy, "No Heart Disease") and rows with
    ``target == 1`` (orange, "Heart Disease"). Missing values of the column
    are dropped before plotting.

    Args:
        feature: Name of the ``df`` column to plot; also used, capitalised,
            in the figure title.
    """
    plt.figure(figsize=(10, 5))
    plt.title("Kernel Density Estimate for {}".format(feature.capitalize()))
    classes = (
        (0, 'navy', 'No Heart Disease'),
        (1, 'orange', 'Heart Disease'),
    )
    for target_value, colour, label in classes:
        values = df.loc[df['target'] == target_value, feature].dropna()
        sns.kdeplot(values, color=colour, label=label)
    # The labels above are only shown once a legend is drawn; without this
    # call the two curves cannot be told apart on seaborn versions that do
    # not add the legend automatically.
    plt.legend()
# Draw the per-class density comparison for each of these columns.
for column in ('age', 'trestbps', 'chol', 'thalach', 'oldpeak'):
    kdeplot(column)
from sklearn.cluster import KMeans
# Cluster the patients with heart disease on age and oldpeak.
# target == 1 is the class this file labels "Heart Disease" (see kdeplot),
# which is what the plot title announces; the filter previously selected
# target == 0, i.e. the opposite group.
# .copy() gives an independent frame, so adding the label column below does
# not write into a slice of df.
clu = df.loc[df.target == 1, ['age', 'oldpeak']].copy()
kmeans = KMeans(n_clusters = 3, random_state = 0).fit(clu)
# Cluster id (0-2) assigned to each selected row.
clu['label'] = kmeans.labels_
figsize(10, 10)
# Scatter of the two clustering features, coloured by cluster id.
plt.scatter(clu['age'], clu['oldpeak'], c=clu['label'], cmap='Accent')
plt.xlabel('Age (Years)')
plt.ylabel('Oldpeak (was)')
plt.title('3 Clusters of Sick People')
plt.show()
# Correlation of every remaining column with the target, sorted ascending.
# The target itself and the string column 'age_bin' are left out.
features = df.drop(columns=["target", 'age_bin'])
corr_tar = features.apply(lambda col: col.corr(df.target)).sort_values()
corr_tar
# Absolute values for easier comparison, strongest first.
ct = corr_tar.abs().sort_values(ascending=False)
ct
ct.plot.bar();
# Pairwise correlation matrix of the numeric columns only. df also holds the
# string column 'age_bin'; older pandas dropped such columns silently, but
# pandas >= 2.0 raises when DataFrame.corr meets non-numeric data.
corrs = df.select_dtypes(include='number').corr()
plt.figure(figsize = (15, 15))
# Annotated heatmap on a fixed -1..1 colour scale.
sns.heatmap(corrs, annot = True, vmin = -1, vmax = 1, fmt = '.3f', cmap='viridis');