import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

pd.set_option('display.max_columns', None)
os.chdir('D:\Data\Projects\Klassifikation\mushrooms')
%matplotlib inline
plt.style.use('Solarize_Light2')

print('Pandas Version:', pd.__version__)
print('Numpy Version:', np.__version__)
print('Seaborn Version:', sns.__version__)

Pandas Version: 0.24.2
Numpy Version: 1.16.4
Seaborn Version: 0.9.0

# Load the mushroom data set (path is relative to the working directory chosen
# above) and preview the first rows.
df = pd.read_csv('mushrooms.csv')
df.head()

df.shape

(8124, 23)

Umbenennen von Spalten¶

# Normalise the column labels: hyphens become underscores, which lets the columns
# be used with attribute access (df.cap_shape, ...), and the label column 'class'
# is renamed to 'target'.
df = df.rename(columns=lambda label: label.replace('-', '_'))
df = df.rename(columns={'class': 'target'})

Umbenennen einiger Kategorien für die Visualisierung¶

# Human-readable names for the single-letter category codes of the columns that
# are visualised below, keyed by column name.
category_names = {
    'target': {'p': 'poisonous', 'e': 'edible'},
    'bruises': {'t': 'bruises', 'f': 'no bruises'},
    'cap_shape': {'c': 'Conical', 's': 'Sunken', 'b': 'Bell', 'k': 'Knobbed',
                  'f': 'Flat', 'x': 'Convex'},
    'gill_attachment': {'a': 'attached', 'f': 'free'},
    'gill_color': {'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate',
                   'g': 'gray', 'r': 'green', 'o': 'orange', 'p': 'pink',
                   'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow'},
    # 'sheathing' was misspelled as 'sheating' before.
    'ring_type': {'c': 'cobwebby', 'e': 'evanescent', 'f': 'flaring', 'l': 'large',
                  'n': 'none', 'p': 'pendant', 's': 'sheathing', 'z': 'zone'},
}

# Series.replace() keeps values that have no entry in the mapping, whereas
# Series.map() turns them into NaN. With map(), running this cell a second time
# (when the columns already hold the long names) silently replaced every value
# with NaN; with replace() the cell can be re-run safely.
for column_name, code_to_name in category_names.items():
    df[column_name] = df[column_name].replace(code_to_name)

Distribution der Zielklasse¶

df.target.value_counts()

edible       4208
poisonous    3916
Name: target, dtype: int64

# Labels and wedge sizes must come from the same value_counts() result.
# Previously the labels were taken from unique() (order of first appearance) and
# the sizes from value_counts() (ordered by frequency), so a class name could be
# attached to the other class's count.
class_counts = df.target.value_counts()

plt.figure(figsize=(8,8))
plt.rcParams['font.size'] = 15
labels = class_counts.index.tolist()
sizes = class_counts.tolist()
plt.pie(sizes, labels=labels, startangle=90, autopct='%1.1f%%');
# Title spelling corrected ('Poisenous' -> 'Poisonous').
plt.title('Poisonous and Edible Mushrooms');

Keine Class Imbalance; die beiden Klassen 'poisonous' und 'edible' sind ausgeglichen.

Fehlende Werte¶

df.isna().sum()

target                      0
cap_shape                   0
cap_surface                 0
cap_color                   0
bruises                     0
odor                        0
gill_attachment             0
gill_spacing                0
gill_size                   0
gill_color                  0
stalk_shape                 0
stalk_root                  0
stalk_surface_above_ring    0
stalk_surface_below_ring    0
stalk_color_above_ring      0
stalk_color_below_ring      0
veil_type                   0
veil_color                  0
ring_number                 0
ring_type                   0
spore_print_color           0
population                  0
habitat                     0
dtype: int64

Es gibt keine NaNs im Datensatz. Allerdings können fehlende Werte auch als Strings kodiert sein (z. B. '?' in der Spalte stalk_root).

Datentypen im Set¶

df.dtypes.unique()

array([dtype('O')], dtype=object)

Distribution der Hutformen¶

df.cap_shape.value_counts().sort_values().plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0xb96c2b0>

# Bar chart of the cap-shape frequencies with the categories on the y axis,
# least frequent category first.
vc = df['cap_shape'].value_counts().sort_values()

plt.figure(figsize=(12, 8))
plt.title("Cap Shapes", fontsize=30)
plt.rcParams['font.size'] = 20
ax = sns.barplot(x=vc.values, y=vc.index)
ax.set_xlabel('Count', fontsize=20)
ax.set_ylabel('Category', fontsize=20)

Text(0, 0.5, 'Category')

Definition für das Plotten der Variablen¶

def figure(col, data=None):
    """Draw a bar chart of the category frequencies of one column.

    The categories are placed on the y axis, most frequent first, and the
    column name is used as the plot title.

    Parameters
    ----------
    col : str
        Name of the column whose values are counted.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``,
        so existing ``figure(col)`` calls keep working unchanged.

    Notes
    -----
    Sets ``plt.rcParams['font.size']`` to 20; the setting remains in effect
    after the call.
    """
    # Resolve the default at call time so the current notebook-level frame is used.
    frame = df if data is None else data
    vc = frame[col].value_counts().sort_values(ascending=False)

    plt.figure(figsize=(12,8))
    plt.rcParams['font.size'] = 20
    plt.title(col, fontsize = 30)
    ax = sns.barplot(y= vc.index, x= vc.values)
    ax.set_xlabel('Count', fontsize=20)
    ax.set_ylabel('Category', fontsize=20)

figure('cap_shape')

Distribution von Stielen¶

Anzahl der Kategorien in den Spalten, die den Stiel eines Pilzes beschreiben

# Number of distinct categories in each of the stalk-related columns, as a bar chart.
stalk_columns = [
    'stalk_shape',
    'stalk_root',
    'stalk_surface_above_ring',
    'stalk_surface_below_ring',
    'stalk_color_above_ring',
    'stalk_color_below_ring',
]
df[stalk_columns].nunique().plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0xc1c0160>

Zusammenhang verschiedener Variablen und der Giftigkeit¶

Wurzeln¶

# Readable names for the stalk_root codes. The tick and legend labels are now
# derived from the data instead of being written by position: the previous
# hard-coded label lists were only correct as long as the codes happened to appear
# in the data in exactly that order (and the tick label was misspelled 'poisenous').
stalk_root_names = {'e': 'Equal', 'c': 'Club', 'b': 'Bulbous', 'r': 'Rooted', '?': '?'}
# Explicit bar order; the palette colours below are assigned in this order.
stalk_root_order = ['Equal', 'Club', 'Bulbous', 'Rooted', '?']

plt.figure(figsize=(12,8))
plt.title("Stalk Root", fontsize = 30)

ax = sns.countplot(data=df.assign(stalk_root=df.stalk_root.replace(stalk_root_names)),
                   x='target', hue='stalk_root',
                   order=['poisonous', 'edible'], hue_order=stalk_root_order,
                   palette=["orange", 'green', 'blue', 'red', 'pink'])
ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Count', fontsize=20)
ax.tick_params(axis='x', labelsize=15)
# Re-create the legend with the desired styling; the labels come from the hue values.
ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1);

Bei einer großen Anzahl von Pilzen ist 'Stalk Root' unbekannt.

Habitat¶

# Readable names for the habitat codes. The tick and legend labels are derived
# from the data instead of being written by position: the previous hard-coded
# label lists were only correct as long as the codes happened to appear in the
# data in exactly that order.
habitat_names = {'u': 'urban', 'g': 'grasses', 'm': 'meadows', 'd': 'woods',
                 'p': 'paths', 'w': 'waste', 'l': 'leaves'}
# Explicit bar order (same order as the former legend).
habitat_order = ['urban', 'grasses', 'meadows', 'woods', 'paths', 'waste', 'leaves']

plt.figure(figsize=(15,10))
plt.title("Habitat", fontsize = 25)

ax = sns.countplot(data=df.assign(habitat=df.habitat.replace(habitat_names)),
                   x='target', hue='habitat',
                   order=['poisonous', 'edible'], hue_order=habitat_order)

ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Count', fontsize=20)
ax.tick_params(axis='x', labelsize=15)
# Re-create the legend with the desired styling; the labels come from the hue values.
ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1, loc=2);

Odor¶

# Readable names for the odor codes. The tick and legend labels are derived from
# the data instead of being written by position: the previous hard-coded label
# lists were only correct as long as the codes happened to appear in the data in
# exactly that order.
odor_names = {'p': 'pungent', 'a': 'almond', 'l': 'anise', 'n': 'none', 'f': 'foul',
              'c': 'creosote', 'y': 'fishy', 's': 'spicy', 'm': 'musty'}
# Explicit bar order (same order as the former legend).
odor_order = ['pungent', 'almond', 'anise', 'none', 'foul',
              'creosote', 'fishy', 'spicy', 'musty']

plt.figure(figsize=(15,10))
plt.title("Odor", fontsize = 25)

ax = sns.countplot(data=df.assign(odor=df.odor.replace(odor_names)),
                   x='target', hue='odor',
                   order=['poisonous', 'edible'], hue_order=odor_order)

ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Count', fontsize=20)
ax.tick_params(axis='x', labelsize=15)
# Re-create the legend with the desired styling; the labels come from the hue values.
ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1, loc=2);

Der Geruch könnte ein guter Hinweis auf die Giftigkeit eines Pilzes sein

Bruises¶

# One count plot per feature: the target classes on the x axis, split by the
# categories of the given column. Each entry is (plot title, column name).
hue_plots = [
    ("Bruises", 'bruises'),
    ("Gill Attachment", 'gill_attachment'),
    ("Gill Color", 'gill_color'),
    ("Ring Type", 'ring_type'),
]

for plot_title, hue_column in hue_plots:
    plt.figure(figsize=(15, 10))
    plt.title(plot_title, fontsize=25)

    ax = sns.countplot(data=df, x=df.target, hue=df[hue_column])

    ax.set_xlabel('Category', fontsize=20)
    ax.set_ylabel('Count', fontsize=20)

    # Legend in the upper-left corner (loc=2), labelled from the hue categories.
    ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1, loc=2)

Umwandeln der kategorischen Variablen¶

Eine der Kategorien ist jeweils die Baseline und wird entfernt, um Kollinearität zu vermeiden. Der Einfachheit halber wird die abhängige Variable 'target_poisonous' in target umbenannt.

# One-hot encode all columns; drop_first=True omits one indicator per column.
# The remaining target indicator 'target_poisonous' is then renamed to 'target'.
df_ = pd.get_dummies(df, drop_first=True)
df_ = df_.rename(columns={'target_poisonous': 'target'})

df_.shape, df.shape

((8124, 96), (8124, 23))

df_.head()

Korrelationen mit der abhängigen Variablen¶

# Correlation of every encoded column with 'target' (DataFrame.corr() with its
# default method), sorted by signed value from most positive to most negative.
corr = df_.corr()['target'].sort_values(ascending=False)

Die wichtigsten 20 Variablen, die am stärksten mit dem Ziel korreliert sind.

# Rank by the magnitude of the correlation. `corr` is sorted by signed value, so
# taking head(20) before abs() only ever returned the most positive correlations
# and could never select strongly negative ones. 'target' itself (correlation 1.0)
# is still part of the result.
corr20 = corr.abs().sort_values(ascending=False).head(20)

list(corr20.index)

['target',
 'odor_f',
 'stalk_surface_above_ring_k',
 'stalk_surface_below_ring_k',
 'gill_size_n',
 'gill_color_buff',
 'bruises_no bruises',
 'spore_print_color_h',
 'ring_type_large',
 'population_v',
 'spore_print_color_w',
 'habitat_p',
 'odor_y',
 'odor_s',
 'stalk_color_above_ring_n',
 'stalk_color_below_ring_p',
 'stalk_color_above_ring_p',
 'stalk_color_below_ring_n',
 'odor_p',
 'ring_number_o']

DataFrame speichern¶

#df_.to_csv('df_clean.csv')

Dimensionality Reduction¶

#df_if = df_[list(corr20.index)]
#df_if.to_csv('df_if.csv')

	class	cap-shape	cap-surface	cap-color	bruises	odor	gill-attachment	gill-spacing	gill-size	gill-color	stalk-shape	stalk-root	stalk-surface-above-ring	stalk-surface-below-ring	stalk-color-above-ring	stalk-color-below-ring	veil-type	veil-color	ring-number	ring-type	spore-print-color	population	habitat
0	p	x	s	n	t	p	f	c	n	k	e	e	s	s	w	w	p	w	o	p	k	s	u
1	e	x	s	y	t	a	f	c	b	k	e	c	s	s	w	w	p	w	o	p	n	n	g
2	e	b	s	w	t	l	f	c	b	n	e	c	s	s	w	w	p	w	o	p	n	n	m
3	p	x	y	w	t	p	f	c	n	n	e	e	s	s	w	w	p	w	o	p	k	s	u
4	e	x	s	g	f	n	f	w	b	k	t	e	s	s	w	w	p	w	o	e	n	a	g

	target	cap_shape_Convex	cap_surface_s	cap_surface_y	cap_color_g	cap_color_n	cap_color_w	cap_color_y	bruises_no bruises	odor_l	odor_n	odor_p	gill_attachment_free	gill_spacing_w	gill_size_n	gill_color_brown	stalk_shape_t	stalk_root_c	stalk_root_e	stalk_surface_above_ring_s	stalk_surface_below_ring_s	stalk_color_above_ring_w	stalk_color_below_ring_w	veil_color_w	ring_number_o	ring_type_pendant	spore_print_color_k	spore_print_color_n	population_n	population_s	habitat_g	habitat_m	habitat_u
0	1	1	1	0	0	1	0	0	0	0	0	1	1	0	1	0	0	0	1	1	1	1	1	1	1	1	1	0	0	1	0	0	1
1	0	1	1	0	0	0	0	1	0	0	0	0	1	0	0	0	0	1	0	1	1	1	1	1	1	1	0	1	1	0	1	0	0
2	0	0	1	0	0	0	1	0	0	1	0	0	1	0	0	1	0	1	0	1	1	1	1	1	1	1	0	1	1	0	0	1	0
3	1	1	0	1	0	0	1	0	0	0	0	1	1	0	1	1	0	0	1	1	1	1	1	1	1	1	1	0	0	1	0	0	1
4	0	1	1	0	1	0	0	0	1	0	1	0	1	1	0	0	1	0	1	1	1	1	1	1	1	0	0	1	0	0	1	0	0

	class	cap-shape	cap-surface	cap-color	bruises	odor	gill-attachment	gill-spacing	gill-size	gill-color	stalk-shape	stalk-root	stalk-surface-above-ring	stalk-surface-below-ring	stalk-color-above-ring	stalk-color-below-ring	veil-type	veil-color	ring-number	ring-type	spore-print-color	population	habitat
0	p	x	s	n	t	p	f	c	n	k	e	e	s	s	w	w	p	w	o	p	k	s	u
1	e	x	s	y	t	a	f	c	b	k	e	c	s	s	w	w	p	w	o	p	n	n	g
2	e	b	s	w	t	l	f	c	b	n	e	c	s	s	w	w	p	w	o	p	n	n	m
3	p	x	y	w	t	p	f	c	n	n	e	e	s	s	w	w	p	w	o	p	k	s	u
4	e	x	s	g	f	n	f	w	b	k	t	e	s	s	w	w	p	w	o	e	n	a	g

	target	cap_shape_Convex	cap_surface_s	cap_surface_y	cap_color_g	cap_color_n	cap_color_w	cap_color_y	bruises_no bruises	odor_l	odor_n	odor_p	gill_attachment_free	gill_spacing_w	gill_size_n	gill_color_brown	stalk_shape_t	stalk_root_c	stalk_root_e	stalk_surface_above_ring_s	stalk_surface_below_ring_s	stalk_color_above_ring_w	stalk_color_below_ring_w	veil_color_w	ring_number_o	ring_type_pendant	spore_print_color_k	spore_print_color_n	population_n	population_s	habitat_g	habitat_m	habitat_u
0	1	1	1	0	0	1	0	0	0	0	0	1	1	0	1	0	0	0	1	1	1	1	1	1	1	1	1	0	0	1	0	0	1
1	0	1	1	0	0	0	0	1	0	0	0	0	1	0	0	0	0	1	0	1	1	1	1	1	1	1	0	1	1	0	1	0	0
2	0	0	1	0	0	0	1	0	0	1	0	0	1	0	0	1	0	1	0	1	1	1	1	1	1	1	0	1	1	0	0	1	0
3	1	1	0	1	0	0	1	0	0	0	0	1	1	0	1	1	0	0	1	1	1	1	1	1	1	1	1	0	0	1	0	0	1
4	0	1	1	0	1	0	0	0	1	0	1	0	1	1	0	0	1	0	1	1	1	1	1	1	1	0	0	1	0	0	1	0	0

	class	cap-shape	cap-surface	cap-color	bruises	odor	gill-attachment	gill-spacing	gill-size	gill-color	stalk-shape	stalk-root	stalk-surface-above-ring	stalk-surface-below-ring	stalk-color-above-ring	stalk-color-below-ring	veil-type	veil-color	ring-number	ring-type	spore-print-color	population	habitat
0	p	x	s	n	t	p	f	c	n	k	e	e	s	s	w	w	p	w	o	p	k	s	u
1	e	x	s	y	t	a	f	c	b	k	e	c	s	s	w	w	p	w	o	p	n	n	g
2	e	b	s	w	t	l	f	c	b	n	e	c	s	s	w	w	p	w	o	p	n	n	m
3	p	x	y	w	t	p	f	c	n	n	e	e	s	s	w	w	p	w	o	p	k	s	u
4	e	x	s	g	f	n	f	w	b	k	t	e	s	s	w	w	p	w	o	e	n	a	g

	target	cap_shape_Convex	cap_surface_s	cap_surface_y	cap_color_g	cap_color_n	cap_color_w	cap_color_y	bruises_no bruises	odor_l	odor_n	odor_p	gill_attachment_free	gill_spacing_w	gill_size_n	gill_color_brown	stalk_shape_t	stalk_root_c	stalk_root_e	stalk_surface_above_ring_s	stalk_surface_below_ring_s	stalk_color_above_ring_w	stalk_color_below_ring_w	veil_color_w	ring_number_o	ring_type_pendant	spore_print_color_k	spore_print_color_n	population_n	population_s	habitat_g	habitat_m	habitat_u
0	1	1	1	0	0	1	0	0	0	0	0	1	1	0	1	0	0	0	1	1	1	1	1	1	1	1	1	0	0	1	0	0	1
1	0	1	1	0	0	0	0	1	0	0	0	0	1	0	0	0	0	1	0	1	1	1	1	1	1	1	0	1	1	0	1	0	0
2	0	0	1	0	0	0	1	0	0	1	0	0	1	0	0	1	0	1	0	1	1	1	1	1	1	1	0	1	1	0	0	1	0
3	1	1	0	1	0	0	1	0	0	0	0	1	1	0	1	1	0	0	1	1	1	1	1	1	1	1	1	0	0	1	0	0	1
4	0	1	1	0	1	0	0	0	1	0	1	0	1	1	0	0	1	0	1	1	1	1	1	1	1	0	0	1	0	0	1	0	0

	class	cap-shape	cap-surface	cap-color	bruises	odor	gill-attachment	gill-spacing	gill-size	gill-color	stalk-shape	stalk-root	stalk-surface-above-ring	stalk-surface-below-ring	stalk-color-above-ring	stalk-color-below-ring	veil-type	veil-color	ring-number	ring-type	spore-print-color	population	habitat
0	p	x	s	n	t	p	f	c	n	k	e	e	s	s	w	w	p	w	o	p	k	s	u
1	e	x	s	y	t	a	f	c	b	k	e	c	s	s	w	w	p	w	o	p	n	n	g
2	e	b	s	w	t	l	f	c	b	n	e	c	s	s	w	w	p	w	o	p	n	n	m
3	p	x	y	w	t	p	f	c	n	n	e	e	s	s	w	w	p	w	o	p	k	s	u
4	e	x	s	g	f	n	f	w	b	k	t	e	s	s	w	w	p	w	o	e	n	a	g

	target	cap_shape_Convex	cap_surface_s	cap_surface_y	cap_color_g	cap_color_n	cap_color_w	cap_color_y	bruises_no bruises	odor_l	odor_n	odor_p	gill_attachment_free	gill_spacing_w	gill_size_n	gill_color_brown	stalk_shape_t	stalk_root_c	stalk_root_e	stalk_surface_above_ring_s	stalk_surface_below_ring_s	stalk_color_above_ring_w	stalk_color_below_ring_w	veil_color_w	ring_number_o	ring_type_pendant	spore_print_color_k	spore_print_color_n	population_n	population_s	habitat_g	habitat_m	habitat_u
0	1	1	1	0	0	1	0	0	0	0	0	1	1	0	1	0	0	0	1	1	1	1	1	1	1	1	1	0	0	1	0	0	1
1	0	1	1	0	0	0	0	1	0	0	0	0	1	0	0	0	0	1	0	1	1	1	1	1	1	1	0	1	1	0	1	0	0
2	0	0	1	0	0	0	1	0	0	1	0	0	1	0	0	1	0	1	0	1	1	1	1	1	1	1	0	1	1	0	0	1	0
3	1	1	0	1	0	0	1	0	0	0	0	1	1	0	1	1	0	0	1	1	1	1	1	1	1	1	1	0	0	1	0	0	1
4	0	1	1	0	1	0	0	0	1	0	1	0	1	1	0	0	1	0	1	1	1	1	1	1	1	0	0	1	0	0	1	0	0