import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np

import os
# Switch to the project directory so the relative CSV paths below resolve.
# Raw string: the Windows path contains backslashes that must not be parsed
# as escape sequences (\D, \P, \K are invalid escapes and raise a warning).
os.chdir(r'D:\Data\Projects\Klassifikation\Klassifikation_West Nile Virus')

import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
plt.rcParams['font.size'] = 15
from IPython.core.pylabtools import figsize
figsize(10, 10)

import seaborn as sns

from warnings import filterwarnings
filterwarnings('ignore')

Train einlesen¶

# Load the training data from the working directory and preview the first rows.
train = pd.read_csv(filepath_or_buffer='train.csv')
train.head(n=5)

# Column dtypes, sorted.
train.dtypes.sort_values(ascending=True)

Block                       int64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
Latitude                  float64
Longitude                 float64
Date                       object
Address                    object
Species                    object
Street                     object
Trap                       object
AddressNumberAndStreet     object
dtype: object

Dimensionality Reduction¶

# Remove the address-related columns from the training frame.
train = train.drop(
    columns=[
        'Address',
        'Block',
        'Street',
        'AddressNumberAndStreet',
        'AddressAccuracy',
    ]
)

Überblick¶

# Number of records per mosquito species.
train['Species'].value_counts()

CULEX PIPIENS/RESTUANS    4752
CULEX RESTUANS            2740
CULEX PIPIENS             2699
CULEX TERRITANS            222
CULEX SALINARIUS            86
CULEX TARSALIS               6
CULEX ERRATICUS              1
Name: Species, dtype: int64

# Mean of the WnvPresent flag per species.
train['WnvPresent'].groupby(train['Species']).mean()

Species
CULEX ERRATICUS           0.000000
CULEX PIPIENS             0.088922
CULEX PIPIENS/RESTUANS    0.055135
CULEX RESTUANS            0.017883
CULEX SALINARIUS          0.000000
CULEX TARSALIS            0.000000
CULEX TERRITANS           0.000000
Name: WnvPresent, dtype: float64

# Convert the Date column to datetimes so the .dt accessor can be used.
train['Date'] = pd.to_datetime(train['Date'])

# Number of records per year.
train['Date'].dt.year.value_counts()

2007    3811
2013    2392
2009    2249
2011    2054
Name: Date, dtype: int64

# Class distribution of the target column.
train['WnvPresent'].value_counts()

0    9955
1     551
Name: WnvPresent, dtype: int64

# Pie chart of the target class distribution, rendered with plotly
target = train['WnvPresent'].value_counts()
levels = ['0', '1']
trace = go.Pie(
    labels=target.index,
    values=target.values,
    marker={'colors': ('orange', 'green')},
)
layout = {
    'title': "Ratio positiver Fälle",
    'margin': {'l': 150},
    'width': 500,
    'height': 500,
}
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
# Also print the raw class counts
print(target)

0    9955
1     551
Name: WnvPresent, dtype: int64

# Records from 2007 only. The explicit .copy() makes `sieben` an independent
# frame, so adding a column to it below is unambiguous and does not raise
# pandas' SettingWithCopyWarning.
sieben = train.loc[train.Date.dt.year == 2007].copy()

# Calendar month of each record, used for the monthly aggregation.
sieben['Month'] = sieben.Date.dt.month

sieben.head()

# Total number of mosquitos per month in 2007.
sieben.groupby('Month')['NumMosquitos'].sum()

Month
5        40
6       428
7      7199
8     40015
9      9300
10     1706
Name: NumMosquitos, dtype: int64

# Monthly sums for 2007: mosquito counts and the WnvPresent flag.
monat_mosk = sieben.groupby('Month').NumMosquitos.sum()
monat_sick = sieben.groupby('Month').WnvPresent.sum()

# One bar series per aggregate, drawn side by side for each month.
trace1 = go.Bar(
    name='Anzahl Moskitos',
    x=monat_mosk.index,
    y=monat_mosk.values,
    marker={'color': 'red'},
)
trace2 = go.Bar(
    name='Anzahl infizierter Moskitos',
    x=monat_sick.index,
    y=monat_sick.values,
    marker={'color': 'blue'},
)

data1 = [trace1, trace2]

layout1 = go.Layout(
    barmode='group',
    title="Verlauf Moskitopupulation in 2007",
)

fig = {'data': data1, 'layout': layout1}
iplot(fig)

def _axis_layout(axis_title):
    """Return the axis settings shared by both axes, labelled *axis_title*."""
    return {
        'title': axis_title,
        'titlefont': {
            'family': 'Arial, sans-serif',
            'size': 18,
            'color': 'lightgrey',
        },
        'showticklabels': True,
        'tickangle': 0,
        'tickfont': {
            'family': 'Old Standard TT, serif',
            'size': 14,
            'color': 'white',
        },
        'exponentformat': 'e',
        'showexponent': 'all',
    }


# Sum of the WnvPresent flag per year.
verlauf_jahr = train['WnvPresent'].groupby(train['Date'].dt.year).sum()

trace2 = go.Bar(
    name='Anzahl Infizierter Moskitos',
    x=verlauf_jahr.index,
    y=verlauf_jahr.values,
    marker={'color': 'blue'},
)

data1 = [trace2]

# Dark-themed layout; both axes share the same font and tick styling.
layout = go.Layout(
    template='plotly_dark',
    title="Häufigkeit WNV Präsenz",
    xaxis=_axis_layout('Jahr'),
    yaxis=_axis_layout('Anzahl'),
)

fig = {'data': data1, 'layout': layout}
iplot(fig)

Lage der Fallen¶

import folium

# One row per trap id (first occurrence), reduced to id and coordinates.
trap_locs = train.drop_duplicates(subset='Trap')[['Trap', 'Longitude', 'Latitude']]

# Coordinates of the first record for trap T138.
train.loc[train['Trap'] == 'T138', ['Longitude', 'Latitude']].iloc[0]

Longitude   -87.585413
Latitude     41.726465
Name: 530, dtype: float64

# Base map (a darker alternative would be tiles="CartoDB dark_matter").
mp = folium.Map(zoom_start=10, location=[41.8, -87.5])

# Every trap as a small blue circle with its trap id as popup.
for index, row in trap_locs.iterrows():
    folium.Circle(
        location=(row["Latitude"], row["Longitude"]),
        popup=row['Trap'],
        radius=100,
        color='blue',
        fill=True,
        fill_color='blue',
    ).add_to(mp)

# Four selected traps highlighted with larger red circles.
for hotspot_id, hotspot_pos in (
    ('T900', (41.974689, -87.890615)),
    ('T115', (41.673408, -87.599862)),
    ('T002', (41.95469, -87.800991)),
    ('T138', (41.726465, -87.585413)),
):
    folium.Circle(
        location=hotspot_pos,
        popup=hotspot_id,
        radius=700,
        color='red',
        fill=True,
        fill_color='red',
    ).add_to(mp)


mp

Anzahl Moskitos über alle Jahre in der Stadt¶

from folium.plugins import HeatMap

# [latitude, longitude, summed mosquito count] for each distinct coordinate pair.
data = (
    train.groupby(['Latitude', 'Longitude'])['NumMosquitos']
    .sum()
    .reset_index()
    .values
    .tolist()
)

mp = folium.Map(zoom_start=10, location=[41.8, -87.5])
HeatMap(data=data, max_zoom=13, radius=8).add_to(mp)
mp

Änderung der Anzahl der Moskitos in den Fallen über die Jahre¶

from folium.plugins import HeatMapWithTime

# Years present in the data, ascending; one animation frame per year.
years = train.Date.dt.year.sort_values().unique()

# HeatMapWithTime documents point weights in the range (0, 1]. The per-year,
# per-location mosquito totals are therefore scaled by the largest total over
# all years instead of passing the raw counts.
yearly_totals = (
    train.assign(Year=train.Date.dt.year)
    .groupby(['Year', 'Latitude', 'Longitude'])['NumMosquitos']
    .sum()
    .reset_index()
)
yearly_totals['Weight'] = (
    yearly_totals['NumMosquitos'] / yearly_totals['NumMosquitos'].max()
)

# One list of [latitude, longitude, weight] points per year, in `years` order.
year_list = [
    yearly_totals.loc[
        yearly_totals['Year'] == year, ['Latitude', 'Longitude', 'Weight']
    ].values.tolist()
    for year in years
]

mp = folium.Map(location=[41.8, -87.5], zoom_start=10)
# `index` labels each frame with its year instead of a running count.
HeatMapWithTime(
    year_list,
    index=[str(year) for year in years],
    radius=5,
    gradient={0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'},
    min_opacity=0.5,
    max_opacity=0.8,
    use_local_extrema=True,
).add_to(mp)
mp

Da sich die Anzahl der Moskitos auf die verschiedenen Traps verteilt, ist hier nicht viel zu sehen.

Feature Engineering¶

In welchen Fallen tummeln sich die meisten Moskitos?¶

Diese werden 'Hot Spots' und die Entfernung dazu ein neues Feature.
Teste erstmal einen Hotspot

# Five traps with the highest summed mosquito count.
(
    train.groupby('Trap')['NumMosquitos']
    .sum()
    .sort_values(ascending=False)
    .head()
)

Trap
T115    21668
T900    15386
T138     9936
T002     3710
T128     3315
Name: NumMosquitos, dtype: int64

# Five traps with the highest sum of the WnvPresent flag.
(
    train.groupby('Trap')['WnvPresent']
    .sum()
    .sort_values(ascending=False)
    .head()
)

Trap
T900    66
T115    41
T002    18
T138    16
T003    14
Name: WnvPresent, dtype: int64

# Sum of the NumMosquitos column over all records.
train['NumMosquitos'].sum()

135039

Distance to Hotspots¶

from haversine import haversine

# Sanity check on two fixed (latitude, longitude) points; distance in km.
haversine(
    (41.954690, -87.800991),
    (41.974689, -87.890615),
)

7.736591702562066

# (latitude, longitude) taken from the first record of each hotspot trap.
T900, T115, T138 = (
    tuple(
        train.loc[train['Trap'] == trap_id, ['Latitude', 'Longitude']]
        .iloc[0]
        .values
        .tolist()
    )
    for trap_id in ('T900', 'T115', 'T138')
)

def distance(row, Trap):
    """Return the haversine distance between a record and a reference point.

    row  -- object indexable by 'Latitude' and 'Longitude' (e.g. a DataFrame row).
    Trap -- the reference point, passed to haversine() unchanged; the callers
            in this file pass a (latitude, longitude) tuple.
    """
    return haversine((row['Latitude'], row['Longitude']), Trap)

# Row-wise distance of every record to each of the three hotspot traps;
# the hotspot coordinates are handed to distance() as its second argument.
train['dist_T900'] = train.apply(distance, axis=1, args=(T900,))
train['dist_T115'] = train.apply(distance, axis=1, args=(T115,))
train['dist_T138'] = train.apply(distance, axis=1, args=(T138,))

Anzahl der Moskitos pro Falle potenzieren¶

# Additional feature: mosquito count raised to the third power.
train['NumMosq_3'] = train['NumMosquitos'] ** 3

train.head(n=5)

Kategorische Features in numerische umwandeln¶

# One-hot encode Species into a new frame; drop_first omits the first category.
train_ = pd.get_dummies(
    data=train,
    columns=['Species'],
    drop_first=True,
    prefix_sep='_',
)

train_.head()

# Peek at the records from August 2011.
train[(train['Date'].dt.year == 2011) & (train['Date'].dt.month == 8)].head()

Speichern von train¶

#train_.to_csv('train_final.csv', sep=',', encoding='utf-8', index=False)

	Date	Address	Species	Block	Street	Trap	AddressNumberAndStreet	Latitude	Longitude	AddressAccuracy	NumMosquitos
0	2007-05-29	4100 North Oak Park Avenue, Chicago, IL 60634,...	CULEX PIPIENS/RESTUANS	41	N OAK PARK AVE	T002	4100 N OAK PARK AVE, Chicago, IL	41.954690	-87.800991	9	1
1	2007-05-29	4100 North Oak Park Avenue, Chicago, IL 60634,...	CULEX RESTUANS	41	N OAK PARK AVE	T002	4100 N OAK PARK AVE, Chicago, IL	41.954690	-87.800991	9	1
2	2007-05-29	6200 North Mandell Avenue, Chicago, IL 60646, USA	CULEX RESTUANS	62	N MANDELL AVE	T007	6200 N MANDELL AVE, Chicago, IL	41.994991	-87.769279	9	1
3	2007-05-29	7900 West Foster Avenue, Chicago, IL 60656, USA	CULEX PIPIENS/RESTUANS	79	W FOSTER AVE	T015	7900 W FOSTER AVE, Chicago, IL	41.974089	-87.824812	8	1
4	2007-05-29	7900 West Foster Avenue, Chicago, IL 60656, USA	CULEX RESTUANS	79	W FOSTER AVE	T015	7900 W FOSTER AVE, Chicago, IL	41.974089	-87.824812	8	4

	Date	Species	Trap	Latitude	Longitude	NumMosquitos	dist_T900	dist_T115	dist_T138	NumMosq_3
0	2007-05-29	CULEX PIPIENS/RESTUANS	T002	41.954690	-87.800991	1	7.736592	35.441519	31.031386	1
1	2007-05-29	CULEX RESTUANS	T002	41.954690	-87.800991	1	7.736592	35.441519	31.031386	1
2	2007-05-29	CULEX RESTUANS	T007	41.994991	-87.769279	1	10.279811	38.414514	33.517161	1
3	2007-05-29	CULEX PIPIENS/RESTUANS	T015	41.974089	-87.824812	1	5.440165	38.279145	33.931386	1
4	2007-05-29	CULEX RESTUANS	T015	41.974089	-87.824812	4	5.440165	38.279145	33.931386	64

	Date	Trap	Latitude	Longitude	NumMosquitos	dist_T900	dist_T115	dist_T138	NumMosq_3	Species_CULEX PIPIENS/RESTUANS	Species_CULEX RESTUANS
0	2007-05-29	T002	41.954690	-87.800991	1	7.736592	35.441519	31.031386	1	1	0
1	2007-05-29	T002	41.954690	-87.800991	1	7.736592	35.441519	31.031386	1	0	1
2	2007-05-29	T007	41.994991	-87.769279	1	10.279811	38.414514	33.517161	1	0	1
3	2007-05-29	T015	41.974089	-87.824812	1	5.440165	38.279145	33.931386	1	1	0
4	2007-05-29	T015	41.974089	-87.824812	4	5.440165	38.279145	33.931386	64	0	1

	Date	Species	Trap	Latitude	Longitude	NumMosquitos	dist_T900	dist_T115	dist_T138	NumMosq_3
7081	2011-08-05	CULEX PIPIENS/RESTUANS	T002	41.954690	-87.800991	21	7.736592	35.441519	31.031386	9261
7082	2011-08-05	CULEX RESTUANS	T002	41.954690	-87.800991	3	7.736592	35.441519	31.031386	27
7083	2011-08-05	CULEX RESTUANS	T046	41.891118	-87.654491	2	21.630298	24.628405	19.182912	8
7084	2011-08-05	CULEX PIPIENS/RESTUANS	T048	41.867108	-87.654224	11	22.926487	22.005259	16.646646	1331
7085	2011-08-05	CULEX PIPIENS/RESTUANS	T054	41.921965	-87.632085	1	22.169921	27.767095	22.079959	1