import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
# IPython magic: this line is not plain Python and requires an IPython/Jupyter session.
%matplotlib inline
import seaborn as sns
# Apply the 'Solarize_Light2' style sheet to the figures created below.
plt.style.use('Solarize_Light2')
from IPython.core.pylabtools import figsize
# Set the default figure size to 10 x 8.
figsize(10, 8)

# System interface
import os
# Raw string: the backslashes of the Windows path are taken literally.
# The non-raw form relied on invalid escape sequences (\D, \P, \K), which
# Python reports with a warning and may reject in future versions.
os.chdir(r'D:\Data\Projects\Klassifikation\Diabetes')

# Display
# Render every float with exactly three decimal places in pandas output.
pd.options.display.float_format = lambda value: '%.3f' % value

Einlesen des Datensatzes¶

# Load the dataset from 'diabetes.csv' (relative path, resolved against the
# current working directory).
df = pd.read_csv('diabetes.csv')
# Report the (rows, columns) shape and preview the first rows.
print(df.shape)
df.head()

(768, 9)

Datentypen aller Features¶

# Show the data type of every column.
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

Fehlende Werte¶

# Per-column count of null entries, stored in a one-column frame named 'total'.
missing = df.isnull().sum().to_frame(name='total')
# Share of null entries relative to the number of rows, in percent.
missing['percent'] = missing['total'].div(len(df)).mul(100)
# Columns with the highest share of null entries first; show the top five.
missing.sort_values(by='percent', ascending=False).head(5)

Statistiken aller numerischen Spalten¶

# Summary statistics of the columns.
df.describe()

Bei Glucose, BloodPressure, SkinThickness, Insulin und BMI gibt es viele Werte, die 0 sind, was physiologisch nicht möglich ist (DiabetesPedigreeFunction enthält dagegen keine Nullwerte). Diese Nullwerte stehen für fehlende Messungen und werden im Folgenden durch den Median des jeweiligen Features ersetzt.

Anzahl Werte == 0 pro Feature im Datensatz¶

# Count the entries equal to 0 in every column.
# Compare against the number 0 rather than the string '0': all columns are
# numeric, and matching a string against numeric data depends on implicit,
# version-dependent type coercion.
(df == 0).sum(axis=0)

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

# Show the rows whose Glucose value is 0.
df.loc[df.Glucose == 0]

Nullwerte mit replace und Median ersetzen¶

# Replace the 0 placeholders in the measurement columns with the column median.
# The median is taken over the non-zero entries only: including the zeros
# would pull the replacement value towards 0, most strongly in columns where
# a large share of the entries is 0 (e.g. Insulin, SkinThickness).
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    non_zero_median = df.loc[df[column] != 0, column].median()
    df[column] = df[column].replace({0: non_zero_median})

# Summary statistics again, now after the zero values were replaced.
df.describe()

Verteilung Zielvariable¶

 ax = sns.countplot(x="Outcome", data=df, palette="Set3")

Der Datensatz ist unausgewogen: Es gibt deutlich mehr Frauen ohne Diabetes (500 von 768) als mit Diabetes.
Dies muss später beim Erstellen der Trainings- und Testdatensätze, beim Modeling und bei der Auswahl der Metrik berücksichtigt werden.

Altersverteilung¶

# Histogram of the Age column.
plt.hist(df.Age);

Verteilung Blutdruck in Abhängigkeit von Alter¶

# Mean BloodPressure for every distinct Age value, plotted against Age.
df.groupby('Age')['BloodPressure'].mean().plot()

<matplotlib.axes._subplots.AxesSubplot at 0xb8dd240>

Histogramme der Features¶

# One histogram per column of the data frame.
df.hist(figsize=(10,8));

Boxplots der Features¶

# One box plot per column, arranged in a 3 x 3 grid; the axes are not shared,
# so every panel gets its own scale.
df.plot(kind= 'box' , subplots=True, layout=(3,3), sharex=False, sharey=False, figsize=(14,14));

Density Plots der Features¶

# Kernel density estimate of every column except the last one, laid out on a
# 4 x 2 grid of panels.
plt.figure(figsize=(12, 12))
for panel, name in enumerate(df.columns[:-1], start=1):
    plt.subplot(4, 2, panel)
    sns.kdeplot(df[name], color='green')
    plt.title('Distribution of %s' % name)
plt.tight_layout()

Density Plots der Features in Abhängigkeit der Zielvariablen¶

# Density plots of the features, split by the target variable.
plt.figure(figsize=(12, 12))

# The two outcome groups do not depend on the feature being drawn, so they
# are selected once here instead of on every loop iteration.
diabetes = df[df['Outcome'] == 1]
no_diabetes = df[df['Outcome'] == 0]

# Iterate over every column except the last one; `position` is the 1-based
# panel index and `feature` the column name.
for position, feature in enumerate(df.columns[:-1], start=1):
    # The grid is 4 x 2; the panels are filled one after another.
    plt.subplot(4, 2, position)

    # Overlay the density of the feature for both outcome groups.
    sns.kdeplot(diabetes[feature], label='Diabetes', color='red')
    sns.kdeplot(no_diabetes[feature], label='No Diabetes', color='green')

    # Use the column name in the panel title.
    plt.legend()
    plt.title('Distribution of %s' % feature)

plt.tight_layout()

Univariate Verteilung der Variablen¶

import itertools

# Histogram (20 bins) of each of the first eight columns on a grid of panels.
columns = df.columns[:8]
length = len(columns)

# plt.figure instead of plt.subplots: the latter also creates a full-figure
# axes that is not needed and can stay visible behind the panels.
plt.figure(figsize=(18, 15))

for j, i in enumerate(columns):
    # The grid dimensions must be integers, hence floor division; the row
    # count (half the number of columns) and the 3 grid columns are kept
    # from the original layout.
    plt.subplot(length // 2, 3, j + 1)
    df[i].hist(bins=20, edgecolor='black')
    plt.title(i)

# The spacing is the same for all panels, so it is set once after the loop.
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

Korrelationen zwischen den Features¶

# Pairwise plots of the first eight columns, colored by 'Outcome', with KDE
# curves on the diagonal.
sns.pairplot(df, hue = 'Outcome', vars = df.columns[:8], diag_kind = 'kde');

Korrelationskoeffizienten Features und Ziel¶

# Correlation of every column with 'Outcome', strongest positive value first.
df.corr()['Outcome'].sort_values(ascending=False)

Outcome                    1.000
Glucose                    0.493
BMI                        0.312
Age                        0.238
Pregnancies                0.222
SkinThickness              0.189
DiabetesPedigreeFunction   0.174
BloodPressure              0.166
Insulin                    0.148
Name: Outcome, dtype: float64

Heatmap Korrelationen¶

# Heatmap of the pairwise correlation matrix; annot=True writes the
# coefficient into every cell.
plt.figure(figsize=(15, 12))
sns.heatmap(df.corr(), annot = True, cmap= 'viridis');

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
count	768.000	768.000	768.000	768.000	768.000	768.000	768.000	768.000	768.000
mean	3.845	120.895	69.105	20.536	79.799	31.993	0.472	33.241	0.349
std	3.370	31.973	19.356	15.952	115.244	7.884	0.331	11.760	0.477
min	0.000	0.000	0.000	0.000	0.000	0.000	0.078	21.000	0.000
25%	1.000	99.000	62.000	0.000	0.000	27.300	0.244	24.000	0.000
50%	3.000	117.000	72.000	23.000	30.500	32.000	0.372	29.000	0.000
75%	6.000	140.250	80.000	32.000	127.250	36.600	0.626	41.000	1.000
max	17.000	199.000	122.000	99.000	846.000	67.100	2.420	81.000	1.000

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
count	768.000	768.000	768.000	768.000	768.000	768.000	768.000	768.000	768.000
mean	3.845	121.656	72.387	27.335	94.652	32.451	0.472	33.241	0.349
std	3.370	30.438	12.097	9.229	105.548	6.875	0.331	11.760	0.477
min	0.000	44.000	24.000	7.000	14.000	18.200	0.078	21.000	0.000
25%	1.000	99.750	64.000	23.000	30.500	27.500	0.244	24.000	0.000
50%	3.000	117.000	72.000	23.000	31.250	32.000	0.372	29.000	0.000
75%	6.000	140.250	80.000	32.000	127.250	36.600	0.626	41.000	1.000
max	17.000	199.000	122.000	99.000	846.000	67.100	2.420	81.000	1.000

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.600	0.627	50	1
1	1	85	66	29	0	26.600	0.351	31	0
2	8	183	64	0	0	23.300	0.672	32	1
3	1	89	66	23	94	28.100	0.167	21	0
4	0	137	40	35	168	43.100	2.288	33	1

	Pregnancies	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
75	1	48	20	0	24.700	0.140	22	0
182	1	74	20	23	27.700	0.299	21	0
342	1	68	35	0	32.000	0.389	22	0
349	5	80	32	0	41.000	0.346	37	1
502	6	68	41	0	39.000	0.727	41	1