In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

pd.set_option('display.max_columns', None)
os.chdir('D:\Data\Projects\Klassifikation\mushrooms')
%matplotlib inline
plt.style.use('Solarize_Light2')
In [2]:
print('Pandas Version:', pd.__version__)
print('Numpy Version:', np.__version__)
print('Seaborn Version:', sns.__version__)
Pandas Version: 0.24.2
Numpy Version: 1.16.4
Seaborn Version: 0.9.0
In [3]:
df = pd.read_csv('mushrooms.csv')
df.head()
Out[3]:
class cap-shape cap-surface cap-color bruises odor gill-attachment gill-spacing gill-size gill-color stalk-shape stalk-root stalk-surface-above-ring stalk-surface-below-ring stalk-color-above-ring stalk-color-below-ring veil-type veil-color ring-number ring-type spore-print-color population habitat
0 p x s n t p f c n k e e s s w w p w o p k s u
1 e x s y t a f c b k e c s s w w p w o p n n g
2 e b s w t l f c b n e c s s w w p w o p n n m
3 p x y w t p f c n n e e s s w w p w o p k s u
4 e x s g f n f w b k t e s s w w p w o e n a g
In [4]:
df.shape
Out[4]:
(8124, 23)

Umbenennen von Spalten

In [5]:
df.columns = df.columns.str.replace('-','_')
In [6]:
df = df.rename(columns={'class':'target'})

Umbenennen einiger Kategorien für die Visualisierung

In [7]:
df.target = df.target.map({'p':'poisonous', 'e':'edible'})
In [8]:
df.bruises = df.bruises.map({'t':'bruises', 'f':'no bruises'})
In [9]:
df.cap_shape = df.cap_shape.map({'c':'Conical', 's':'Sunken', 'b':'Bell', 'k':'Knobbed', 'f':'Flat', 'x':'Convex'})
In [10]:
df.gill_attachment = df.gill_attachment.map({'a': 'attached', 'f': 'free'})
In [11]:
df.gill_color = df.gill_color.map({'k':'black', 'n':'brown', 'b':'buff', 'h':'chocolate', 'g':'gray',
                                    'r':'green', 'o': 'orange', 'p':'pink', 'u':'purple', 'e':'red', 'w':'white', 'y':'yellow'})
In [12]:
df.ring_type = df.ring_type.map({'c':'cobwebby', 'e':'evanescent', 'f':'flaring', 'l':'large', 'n':'none', 'p':'pendant',
                                's':'sheating', 'z':'zone'})

Distribution der Zielklasse

In [13]:
df.target.value_counts()
Out[13]:
edible       4208
poisonous    3916
Name: target, dtype: int64
In [14]:
plt.figure(figsize=(8,8))
plt.rcParams['font.size'] = 15
labels = df.target.unique().tolist();
sizes = df.target.value_counts().tolist();
plt.pie(sizes, labels=labels, startangle=90, autopct='%1.1f%%');
plt.title('Poisenous and Edible Mushrooms');

Keine Class Imbalance; die beiden Klassen 'poisonous' und 'edible' sind ausgeglichen.

Fehlende Werte

In [15]:
df.isna().sum()
Out[15]:
target                      0
cap_shape                   0
cap_surface                 0
cap_color                   0
bruises                     0
odor                        0
gill_attachment             0
gill_spacing                0
gill_size                   0
gill_color                  0
stalk_shape                 0
stalk_root                  0
stalk_surface_above_ring    0
stalk_surface_below_ring    0
stalk_color_above_ring      0
stalk_color_below_ring      0
veil_type                   0
veil_color                  0
ring_number                 0
ring_type                   0
spore_print_color           0
population                  0
habitat                     0
dtype: int64

Es gibt keine NaNs im Datensatz. Allerdings können fehlende Werte auch als strings vorkommen.

Datentypen im Set

In [16]:
df.dtypes.unique()
Out[16]:
array([dtype('O')], dtype=object)

Distribution der Hutformen

In [17]:
df.cap_shape.value_counts().sort_values().plot(kind='bar')
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0xb96c2b0>
In [18]:
vc = df.cap_shape.value_counts().sort_values()
In [19]:
plt.figure(figsize=(12,8))
plt.title("Cap Shapes", fontsize = 30)
plt.rcParams['font.size'] = 20
ax = sns.barplot(y= vc.index, x= vc.values);
ax.set_xlabel('Count', fontsize=20)
ax.set_ylabel('Category', fontsize=20)
Out[19]:
Text(0, 0.5, 'Category')

Definition für das Plotten der Variablen

In [20]:
def figure(col):
    vc = df[col].value_counts().sort_values(ascending=False)
    
    plt.figure(figsize=(12,8))
    plt.rcParams['font.size'] = 20
    plt.title(col, fontsize = 30)
    ax = sns.barplot(y= vc.index, x= vc.values);
    ax.set_xlabel('Count', fontsize=20)
    ax.set_ylabel('Category', fontsize=20)
    
In [21]:
figure('cap_shape')

Distribution von Stielen

Anzahl der Kategorien in den Spalten, die den Stiel eines Pilzes beschreiben

In [22]:
df[['stalk_shape','stalk_root','stalk_surface_above_ring','stalk_surface_below_ring','stalk_color_above_ring', 'stalk_color_below_ring']].nunique().plot(kind='bar')
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0xc1c0160>

Zusammenhang verschiedener Variablen und der Giftigkeit

Wurzeln

In [23]:
plt.figure(figsize=(12,8))
plt.title("Stalk Root", fontsize = 30)

ax = sns.countplot(x=df.target, hue=df.stalk_root, data=df, palette=["orange", 'green', 'blue', 'red', 'pink'])
ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Count', fontsize=20)
ax.set_xticklabels(['poisenous', 'edible'], rotation='horizontal', fontsize=15);
plt.legend( );
ax.legend(['Equal', 'Club', 'Bulbous','Rooted','?'], fancybox=True, framealpha=1, shadow=True, borderpad=1);

Bei einer großen Anzahl von Pilzen ist 'Stalk Root' unbekannt.

Habitat

In [24]:
plt.figure(figsize=(15,10))
plt.title("Habitat", fontsize = 25)

ax = sns.countplot(data=df, x=df.target, hue=df.habitat)

ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Count', fontsize=20)
ax.set_xticklabels(['poisonous', 'edible'], rotation='horizontal', fontsize=15);
ax.legend(['urban', 'grasses', 'meadows', 'woods', 'paths', 'waste', 'leaves'], 
          fancybox=True, framealpha=1, shadow=True, borderpad=1, loc=2);

Odor

In [25]:
plt.figure(figsize=(15,10))
plt.title("Odor", fontsize = 25)

ax = sns.countplot(data=df, x=df.target, hue=df.odor)

ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Count', fontsize=20)
ax.set_xticklabels(['poisonous', 'edible'], rotation='horizontal', fontsize=15);
ax.legend(['pungent', 'almond', 'anise','none','foul',  'creosote','fishy','spicy', 'musty'],
          fancybox=True, framealpha=1, shadow=True, borderpad=1, loc=2);

Der Geruch könnte ein guter Hinweis auf die Giftigkeit eines Pilzes sein

Bruises

In [26]:
plt.figure(figsize=(15,10))
plt.title("Bruises", fontsize = 25)

ax = sns.countplot(data=df, x=df.target, hue=df.bruises)

ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Count', fontsize=20)

ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1, loc=2);
In [27]:
plt.figure(figsize=(15,10))
plt.title("Gill Attachment", fontsize = 25)

ax = sns.countplot(data=df, x=df.target, hue=df.gill_attachment)

ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Count', fontsize=20)

ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1, loc=2);
In [28]:
plt.figure(figsize=(15,10))
plt.title("Gill Color", fontsize = 25)

ax = sns.countplot(data=df, x=df.target, hue=df.gill_color)

ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Count', fontsize=20)

ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1, loc=2);
In [29]:
plt.figure(figsize=(15,10))
plt.title("Ring Type", fontsize = 25)

ax = sns.countplot(data=df, x=df.target, hue=df.ring_type)

ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Count', fontsize=20)

ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1, loc=2);

Umwandeln der kategorischen Variablen

Eine der Kategorien ist jeweils die Baseline und wird entfernt, um Kollinearität zu vermeiden. Der Einfachheit halber wird die abhängige Variable 'target_poisonous' in target umbenannt.

In [30]:
df_ = pd.get_dummies(df, drop_first = True).rename(columns={'target_poisonous':'target'})
In [31]:
df_.shape, df.shape
Out[31]:
((8124, 96), (8124, 23))
In [32]:
df_.head()
Out[32]:
target cap_shape_Conical cap_shape_Convex cap_shape_Flat cap_shape_Knobbed cap_shape_Sunken cap_surface_g cap_surface_s cap_surface_y cap_color_c cap_color_e cap_color_g cap_color_n cap_color_p cap_color_r cap_color_u cap_color_w cap_color_y bruises_no bruises odor_c odor_f odor_l odor_m odor_n odor_p odor_s odor_y gill_attachment_free gill_spacing_w gill_size_n gill_color_brown gill_color_buff gill_color_chocolate gill_color_gray gill_color_green gill_color_orange gill_color_pink gill_color_purple gill_color_red gill_color_white gill_color_yellow stalk_shape_t stalk_root_b stalk_root_c stalk_root_e stalk_root_r stalk_surface_above_ring_k stalk_surface_above_ring_s stalk_surface_above_ring_y stalk_surface_below_ring_k stalk_surface_below_ring_s stalk_surface_below_ring_y stalk_color_above_ring_c stalk_color_above_ring_e stalk_color_above_ring_g stalk_color_above_ring_n stalk_color_above_ring_o stalk_color_above_ring_p stalk_color_above_ring_w stalk_color_above_ring_y stalk_color_below_ring_c stalk_color_below_ring_e stalk_color_below_ring_g stalk_color_below_ring_n stalk_color_below_ring_o stalk_color_below_ring_p stalk_color_below_ring_w stalk_color_below_ring_y veil_color_o veil_color_w veil_color_y ring_number_o ring_number_t ring_type_flaring ring_type_large ring_type_none ring_type_pendant spore_print_color_h spore_print_color_k spore_print_color_n spore_print_color_o spore_print_color_r spore_print_color_u spore_print_color_w spore_print_color_y population_c population_n population_s population_v population_y habitat_g habitat_l habitat_m habitat_p habitat_u habitat_w
0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0
2 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0
3 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
4 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0

Korrelationen mit der abhängigen Variablen

In [33]:
corr = df_.corr()['target'].sort_values(ascending=False)

Die wichtigsten 20 Variablen, die am stärksten mit dem Ziel korreliert sind.

In [34]:
corr20 = abs(corr).head(20)
In [35]:
list(corr20.index)
Out[35]:
['target',
 'odor_f',
 'stalk_surface_above_ring_k',
 'stalk_surface_below_ring_k',
 'gill_size_n',
 'gill_color_buff',
 'bruises_no bruises',
 'spore_print_color_h',
 'ring_type_large',
 'population_v',
 'spore_print_color_w',
 'habitat_p',
 'odor_y',
 'odor_s',
 'stalk_color_above_ring_n',
 'stalk_color_below_ring_p',
 'stalk_color_above_ring_p',
 'stalk_color_below_ring_n',
 'odor_p',
 'ring_number_o']

DataFrame speichern

In [36]:
#df_.to_csv('df_clean.csv')

Dimensionality Reduction

In [ ]:
#df_if = df_[list(corr20.index)]
#df_if.to_csv('df_if.csv')