import pandas as pd
# Notebook display settings: show wide and long frames without truncation and
# render floats with two decimals.
pd.set_option('display.max_columns', 150)
# Spell out the real option name; the abbreviated 'display.max_row' only
# resolved through pandas' pattern matching of option keys.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.float_format', '{:.2f}'.format)

import numpy as np

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')

import seaborn as sns

import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

from warnings import filterwarnings
filterwarnings('ignore')

import os
# Raw string so the Windows backslashes are taken literally; in a normal
# string literal '\D', '\P' and '\K' are invalid escape sequences.
os.chdir(r'D:\Data\Projects\Klassifikation\Predicting Household Poverty in Costa Rica')

# Load the training data and display its (rows, columns) shape.
train  = pd.read_csv('train.csv')
train.shape

(9557, 143)

# Load the test data and display its (rows, columns) shape.
test = pd.read_csv('test.csv')
test.shape

(23856, 142)

# Show both shapes side by side.
train.shape, test.shape

((9557, 143), (23856, 142))

# The test set has no labels. Create the column with item assignment:
# `test.Target = ...` only sets a Python attribute on the frame and does not
# add a column.
test['Target'] = np.nan
# Stack train and test into one frame with a fresh 0..n-1 index.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` replaces it.
df = pd.concat([train, test], ignore_index=True)

# Summary of the combined frame: index range, column count, dtypes, memory.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33413 entries, 0 to 33412
Columns: 143 entries, Id to v2a1
dtypes: float64(9), int64(129), object(5)
memory usage: 36.5+ MB

Datentypen¶

Alle Daten sind int64 oder float64, bis auf die Variablen dependency, edjefa, edjefe, idhogar und Id, die im Format object vorliegen.

Dependency, Edjefa, Edjefe sollten in float64 umgewandelt werden. Vorher müssen strings ersetzt werden. mapping = { 'yes':1, 'no':0}. Mit replace werden alle yes und no im train und im test Datensatz ersetzt.

# Names of the columns stored with dtype object.
df.select_dtypes('object').columns

Index(['Id', 'dependency', 'edjefa', 'edjefe', 'idhogar'], dtype='object')

Finde nicht numerische Werte mit regular expression

# Tally which letter sequences occur in the three text-typed columns
for name in ('dependency', 'edjefa', 'edjefe'):
    letters = df[name].str.findall('[A-Za-z]')
    print(letters.value_counts())

[]           19797
[y, e, s]     7580
[n, o]        6036
Name: dependency, dtype: int64
[n, o]       22075
[]           11124
[y, e, s]      214
Name: edjefa, dtype: int64
[]           20179
[n, o]       12818
[y, e, s]      416
Name: edjefe, dtype: int64

import re
# Raw string: '\D' inside a normal string literal is an invalid escape
# sequence. The pattern captures each run of two consecutive non-digit
# characters.
df.edjefa.str.findall(r'(\D\D)').value_counts()

[no]    22075
[]      11124
[ye]      214
Name: edjefa, dtype: int64

# Replace the yes/no strings with numbers, then cast the columns to float
mapping = {'yes': 1, 'no': 0}

for column in ['dependency', 'edjefa', 'edjefe']:
    df[column] = df[column].replace(mapping).astype(np.float64)

# Summary statistics of the three converted columns.
df[['dependency', 'edjefa', 'edjefe']].describe()

Identifizierung von Fehlern¶

Haushalte mit unterschiedlichen Poverty Levels¶

Alle werden true, bei denen im groupby nur ein Wert im Target vorkommt.

# True for a household when all of its members share a single Target value
targets_per_household = train.groupby('idhogar')['Target'].nunique()
all_equal = targets_per_household == 1

# Households whose members carry more than one label
not_equal = all_equal[~all_equal]
len(not_equal)

85

# Same count, taken directly from the boolean series
(~all_equal).sum()

85

# Members of the first household with inconsistent labels
first_mixed = train['idhogar'] == not_equal.index[0]
train.loc[first_mixed, ['idhogar', 'parentesco1', 'Target']]

Haushalte ohne Kopf im DataFrame¶

Vorgabe: Jeder Haushalt hat einen Kopf. Alle Mitglieder eines Haushaltes sollten den gleichen Poverty Level haben, wie der Kopf des Haushaltes.

Dies ist nicht so. Bei 15 Haushalten ist niemand als Kopf des Haushaltes identifiziert, bei 85 Haushalten sind die Labels der Mitglieder unterschiedlich. Die 15 Haushalte ohne Kopf können nicht verwendet werden und werden entfernt. Die Labels der Mitglieder, die vom Label des Kopfes abweichen, werden korrigiert.

# Sum of the parentesco1 flag per household; 0 means nobody is marked as head
households_leader = train.groupby('idhogar')['parentesco1'].sum()
(households_leader == 0).sum()

15

# Ids of the households in which no member is flagged as head.
households_leader[households_leader == 0].index

Index(['03c6bdf85', '09b195e7a', '1367ab31d', '1bc617b23', '374ca5a19',
       '61c10e099', '6b1b2405f', '896fe6d3e', 'a0812ef17', 'ad687ad89',
       'b1f4d89d7', 'bfd5067c2', 'c0c8a5013', 'd363d9183', 'f2bfa75c4'],
      dtype='object', name='idhogar')

# All rows that belong to a household without a head
headless_ids = households_leader[households_leader == 0].index
households_no_head = train.loc[train.idhogar.isin(headless_ids)]
households_no_head['idhogar'].nunique()

15

# Head-less households whose members carry differing Targets
no_head = households_no_head.groupby('idhogar')['Target'].nunique() == 1
(~no_head).sum()

0

# Overwrite the labels of household members that deviate from the head's label
for hh in not_equal.index:
    members = train.idhogar == hh
    head_target = train.loc[members & (train.parentesco1 == 1), 'Target']
    # A household without a head has no reference label; leave it untouched.
    if head_target.empty:
        continue
    # Assign the scalar label. Assigning the one-row selection itself gets
    # aligned on the row index by pandas, which leaves every non-head member
    # with NaN instead of the head's label.
    train.loc[members, 'Target'] = head_target.iloc[0]

# The old NaN-producing assignment upcast Target to float as a side effect and
# later cells select Target among the float columns, so keep that dtype.
train['Target'] = train['Target'].astype(np.float64)

# Re-run the consistency check after the correction
all_equal = train.groupby('idhogar')['Target'].nunique() == 1
(~all_equal).sum()

0

Fehlende Werte im kombinierten Dataframe¶

# Per-column count and percentage of missing values, restricted to columns
# that have at least one missing entry
null_counts = df.isnull().sum()
missing = pd.DataFrame({'total': null_counts})
missing['percent'] = missing['total'] / len(df) * 100
missing = missing[missing['total'] != 0]
missing.sort_values('percent', ascending=False)

V18q1 Anzahl Tablets im Haus¶

# Frequency of each value of the v18q flag.
df.v18q.value_counts()

0    25468
1     7945
Name: v18q, dtype: int64

Für die Anzahl an Tablets im Haushalt wurde ein nan eingetragen, wenn es kein Tablet im Haus gibt. Dies zeigt sich, wenn man die Spalte v18q, ein Boolean für Tablet im Haushalt, untersucht. Die Anzahl für 0 ist die gleiche wie die der fehlenden Werte in Anzahl der Tablets. Daher können diese mit 0 ersetzt werden.

# Fill only the missing tablet counts with 0. The former chained assignment
# (`df.v18q1 = df.loc[...] = 0`) also bound the whole column to 0 and wiped
# out the real counts.
df['v18q1'] = df['v18q1'].fillna(0)

V2a1 Monthly rent payment¶

# Columns whose name starts with 'tipo'
[name for name in df.columns if name.startswith('tipo')]

['tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5']

# Keep the 'tipo...' column names for the ownership analysis below
own_variables = [name for name in df.columns if name.startswith('tipo')]

# Bar chart: per ownership column, the number of rows whose v2a1 is missing
rent_missing = df['v2a1'].isnull()
ownership_counts = df.loc[rent_missing, own_variables].sum()
ownership_counts.plot.bar(figsize=(10, 8), color='green', edgecolor='k', linewidth=2);

tick_labels = ['Owns and Paid Off', 'Owns and Paying', 'Rented', 'Precarious', 'Other']
plt.xticks(list(range(5)), tick_labels, rotation=60)
plt.title('Kategorien mit fehlenden Werten für Mietzahlung', size=18);

# The same counts as numbers: column sums over the rows where v2a1 is missing.
df.loc[df['v2a1'].isnull(), own_variables].sum()

tipovivi1    20844
tipovivi2        0
tipovivi3        0
tipovivi4      597
tipovivi5     2822
dtype: int64

Die Summe derer, die nicht gemietet sind, ist 24263 (20844 + 597 + 2822) und entspricht damit genau der Anzahl der fehlenden Werte in v2a1. Damit können diese Werte mit 0 ersetzt werden.

# Missing rent payments become 0
df['v2a1'] = df['v2a1'].fillna(0)

rez_esc Years behind in school¶

Sind die fehlenden Werte für Schüler oder vielleicht Menschen, die nicht mehr in der Schule sind?

# Age distribution of the rows that do have a rez_esc value
df.loc[df['rez_esc'].notnull(), 'age'].describe()

count   5832.00
mean      12.19
std        3.20
min        7.00
25%        9.00
50%       12.00
75%       15.00
max       17.00
Name: age, dtype: float64

# Rows aged 7-19 (inclusive) whose rez_esc is missing
school_age_missing = df['rez_esc'].isna() & df['age'].between(7, 19)
df.loc[school_age_missing].shape

(1161, 143)

# Frequency of each rez_esc value, with NaN counted as its own category.
df.rez_esc.value_counts(dropna=False)

nan      27581
0.00      4474
1.00       728
2.00       336
3.00       174
4.00        80
5.00        39
99.00        1
Name: rez_esc, dtype: int64

Bei fünf Kindern im schulpflichtigen Alter sollten die Werte interpoliert werden, alle anderen werden = 0 gesetzt. Es gibt einen Ausreißer, da der maximale Wert aber bei 5 liegt, wird dieser = 5 gesetzt.

# Cap values above 5 at 5
too_large = df['rez_esc'] > 5
df.loc[too_large, 'rez_esc'] = 5

# Outside the 7-19 age range a missing value is set to 0
outside_range = (df['age'] < 7) | (df['age'] > 19)
df.loc[outside_range & df['rez_esc'].isnull(), 'rez_esc'] = 0

# Mean of rez_esc after the replacements so far.
df.rez_esc.mean()

0.07571623465211459

# The remaining missing values (ages 7-19) are set to 0 as well
still_missing = df['rez_esc'].isna() & df['age'].between(7, 19)
df.loc[still_missing, 'rez_esc'] = 0

SQBmeaned und meaneduc¶

SQBmeaned, square of the mean years of education of adults (>=18) in the household
meaneduc,average years of education for adults (18+)

escolari, years of schooling
age, Age in years

Da escolari nirgends fehlt, kann daraus meaneduc und sqbmeaned berechnet werden

Fehler: Haushalte nur mit unter 18 jährigen???¶

Ja. Daher kann meaneduc nicht berechnet werden, da es nur für Erwachsene definiert wurde

Eigentlich müsste ich das Alter heruntersetzen, um den Durchschnittswert der Schulzeit für diese Haushalte zu berechnen. Da aber in der Beschreibung steht, dass meaneduc nur für 18+ gilt, geht das nicht. Daher sind diese Werte nans.

Reihen entfernen¶

# Remove the rows that have no meaneduc value
rows_without_meaneduc = df.index[df['meaneduc'].isna()]
df = df.drop(rows_without_meaneduc)
df.shape

(33377, 143)

Visualisierungen¶

# Plotly pie chart of the label distribution among household heads
target = df.loc[df.parentesco1 == 1, 'Target'].value_counts()
levels = ['1', '2', '3', '4']
pie_colors = ('orange', 'green', 'blue', 'red')
trace = go.Pie(labels=target.index, values=target.values, marker={'colors': pie_colors})
layout = {
    'title': "Verteilung der Zielvariablen",
    'margin': {'l': 150},
    'width': 500,
    'height': 500,
}
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
# Print the class counts ordered by label
print(target.sort_index())

1.00     222
2.00     442
3.00     355
4.00    1951
Name: Target, dtype: int64

Der Datensatz ist imbalanced, es gibt sehr viel weniger Haushalte der Klassen 1-3 als der Klasse 4.

from collections import OrderedDict

plt.figure(figsize=(20, 16))

# Line colour and legend text for each poverty level
colors = OrderedDict([(1, 'red'), (2, 'orange'), (3, 'blue'), (4, 'green')])
poverty_mapping = OrderedDict([
    (1, 'extreme'),
    (2, 'moderate'),
    (3, 'vulnerable'),
    (4, 'non vulnerable'),
])

# One subplot per float column of train (the label column is excluded)
float_columns = train.select_dtypes('float').drop('Target', axis=1)
for position, col in enumerate(float_columns, start=1):
    ax = plt.subplot(4, 2, position)
    # One density curve per poverty level
    for poverty_level, color in colors.items():
        level_values = train.loc[train['Target'] == poverty_level, col].dropna()
        sns.kdeplot(level_values, ax=ax, color=color,
                    label=poverty_mapping[poverty_level])

    plt.title(f'{col.capitalize()} Distribution')
    plt.xlabel(f'{col}')
    plt.ylabel('Density')

plt.subplots_adjust(top=2)

Einfluss der Variablen auf das Target¶

def kdeplot(feature):
    """Plot the density of ``feature`` separately for each poverty level.

    Opens a new 12x6 figure and draws one seaborn KDE curve per ``Target``
    value (1-4), using the rows of the module-level ``df`` with that label.

    Args:
        feature: Name of the ``df`` column to plot.
    """
    # (Target value, line colour, legend label) for each curve
    curves = [
        (1, 'red', '1 = extreme poverty'),
        (2, 'orange', '2 = moderate poverty '),
        (3, 'navy', '3 = vulnerable households '),
        (4, 'green', '4 = non vulnerable households'),
    ]

    plt.figure(figsize=(12, 6))
    plt.title("KDE für {}".format(feature.capitalize()))

    for level, color, label in curves:
        values = df.loc[df['Target'] == level, feature].dropna()
        sns.kdeplot(values, color=color, label=label)

    # Draw the legend explicitly instead of relying on seaborn to add it.
    plt.legend()

# One KDE figure per selected feature
for feature_name in ('edjefe', 'edjefa', 'meaneduc', 'age', 'escolari'):
    kdeplot(feature_name)

plt.figure(figsize=(12, 6))

# Years of schooling (escolari), one series per poverty level 1-4
level_names = ['Extreme', 'Moderate', 'Vulnerable', 'Non Vulnerable']
schooling_by_level = [df.loc[df.Target == level, 'escolari'] for level in (1, 2, 3, 4)]

# `normed` was removed from matplotlib's hist(); `density=True` replaces it
# and normalises each series the same way.
plt.hist(schooling_by_level, label=level_names, density=True)
plt.legend(loc='upper right')
plt.title('Schulzeit', fontsize=25)

Text(0.5, 1.0, 'Schulzeit')

Korrelation der Variablen mit dem Target¶

# Correlation of every numeric column with the label. Restricting to numeric
# dtypes keeps this working on pandas >= 2.0, where DataFrame.corr() no longer
# silently skips object columns.
corr = train.select_dtypes(include=[np.number]).corr()['Target']

# Rank by absolute correlation once and reuse the ranking below
corrs = abs(corr).sort_values(ascending=False).iloc[:20]

# The ten strongest entries (Target itself ranks first with correlation 1)
feature = corrs.iloc[:10].index.values
features = list(feature)
features

['Target',
 'meaneduc',
 'hogar_nin',
 'r4t1',
 'SQBhogar_nin',
 'escolari',
 'cielorazo',
 'epared3',
 'SQBescolari',
 'eviv3']

# Pairwise correlation matrix of the selected features.
corr_feat = train[features].corr()
corr_feat

# Annotated heatmap of the correlation matrix, colour scale fixed to [-1, 1].
plt.figure(figsize = (12, 12))
sns.heatmap(corr_feat, annot = True, vmin = -1, vmax = 1, fmt = '.2f', cmap='PuBuGn');

Datensatz speichern¶

#df.to_csv('df.csv', sep = ',', encoding='utf-8', index=False)

Finde kollineare Features¶

# Pairwise correlation matrix of four selected features.
corrs = train[['meaneduc','hogar_nin','r4t1','SQBhogar_nin']].corr()

corrs

# Walk the strict lower triangle of the correlation matrix so that every
# pair of distinct features is visited exactly once
vals, cols, rows = [], [], []
for i in range(1, len(corrs)):
    for j in range(i):
        vals.append(corrs.iloc[i, j])
        cols.append(corrs.columns[i])
        rows.append(corrs.columns[j])

for val, col, row in zip(vals, cols, rows):
    print(f'{val} is the correlation of this {col} and this {row} variable')

0.034497741347410765 is the correlation of this hogar_nin and this meaneduc variable
-0.07865570808864768 is the correlation of this r4t1 and this meaneduc variable
0.7821778682997236 is the correlation of this r4t1 and this hogar_nin variable
-0.020081165423456865 is the correlation of this SQBhogar_nin and this meaneduc variable
0.884141453289388 is the correlation of this SQBhogar_nin and this hogar_nin variable
0.7313837694740167 is the correlation of this SQBhogar_nin and this r4t1 variable

Entferne eines der Features, die über Threshold 0,6 liegen

# Report every pair whose correlation magnitude exceeds the 0.6 threshold.
# abs() makes strongly negative correlations count as collinear too.
for val, col, row in zip(vals, cols, rows):
    if abs(val) > 0.6:
        print(col, row)

r4t1 hogar_nin
SQBhogar_nin hogar_nin
SQBhogar_nin r4t1

# Collect only the feature pairs whose correlation magnitude exceeds 0.6.
# abs() is used so that strongly negative correlations are caught as well;
# the stored value keeps its sign.
vals_ = []
cols_ = []
rows_ = []
# Strict lower triangle: each pair of distinct features exactly once
for i in range(1, len(corrs)):
    for j in range(i):
        val = corrs.iloc[i, j]
        if abs(val) > 0.6:
            vals_.append(val)
            cols_.append(corrs.columns[i])
            rows_.append(corrs.columns[j])

for val, col, row in zip(vals_, cols_, rows_):
    print(f'{val} is the correlation between ***{col}*** and ***{row}***')

0.7821778682997236 is the correlation between ***r4t1*** and ***hogar_nin***
0.884141453289388 is the correlation between ***SQBhogar_nin*** and ***hogar_nin***
0.7313837694740167 is the correlation between ***SQBhogar_nin*** and ***r4t1***

	idhogar	parentesco1	Target
7651	0172ab1d9	0	3
7652	0172ab1d9	0	2
7653	0172ab1d9	0	3
7654	0172ab1d9	1	3
7655	0172ab1d9	0	2

	Target	meaneduc	hogar_nin	r4t1	SQBhogar_nin	escolari	cielorazo	epared3	SQBescolari	eviv3
Target	1.00	0.34	-0.32	-0.32	-0.31	0.30	0.30	0.30	0.30	0.29
meaneduc	0.34	1.00	0.03	-0.08	-0.02	0.52	0.32	0.29	0.56	0.26
hogar_nin	-0.32	0.03	1.00	0.78	0.88	-0.25	-0.18	-0.15	-0.23	-0.12
r4t1	-0.32	-0.08	0.78	1.00	0.73	-0.31	-0.16	-0.13	-0.24	-0.11
SQBhogar_nin	-0.31	-0.02	0.88	0.73	1.00	-0.20	-0.16	-0.13	-0.18	-0.10
escolari	0.30	0.52	-0.25	-0.31	-0.20	1.00	0.25	0.23	0.94	0.20
cielorazo	0.30	0.32	-0.18	-0.16	-0.16	0.25	1.00	0.39	0.26	0.44
epared3	0.30	0.29	-0.15	-0.13	-0.13	0.23	0.39	1.00	0.25	0.66
SQBescolari	0.30	0.56	-0.23	-0.24	-0.18	0.94	0.26	0.25	1.00	0.22
eviv3	0.29	0.26	-0.12	-0.11	-0.10	0.20	0.44	0.66	0.22	1.00

	dependency	edjefa	edjefe
count	33413.00	33413.00	33413.00
mean	1.17	2.83	5.17
std	1.65	4.61	5.21
min	0.00	0.00	0.00
25%	0.33	0.00	0.00
50%	0.67	0.00	6.00
75%	1.33	6.00	9.00
max	8.00	21.00	21.00

	total	percent
rez_esc	27581	82.55
v18q1	25468	76.22
v2a1	24263	72.62
Target	23856	71.40
SQBmeaned	36	0.11
meaneduc	36	0.11