In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np

import os
# Switch to the project directory so relative paths such as 'train.csv' resolve.
# Raw string: the backslashes of the Windows path must not be parsed as escape
# sequences (the non-raw form relies on invalid escapes like '\D', '\P', '\K').
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
os.chdir(r'D:\Data\Projects\Klassifikation\Klassifikation_West Nile Virus')

import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
plt.rcParams['font.size'] = 15
from IPython.core.pylabtools import figsize
figsize(10, 10)

import seaborn as sns

from warnings import filterwarnings
filterwarnings('ignore')

Train einlesen

In [2]:
# Load the training data from 'train.csv' (resolved relative to the current
# working directory) and preview the first rows.
train = pd.read_csv('train.csv')
train.head()
Out[2]:
Date Address Species Block Street Trap AddressNumberAndStreet Latitude Longitude AddressAccuracy NumMosquitos WnvPresent
0 2007-05-29 4100 North Oak Park Avenue, Chicago, IL 60634,... CULEX PIPIENS/RESTUANS 41 N OAK PARK AVE T002 4100 N OAK PARK AVE, Chicago, IL 41.954690 -87.800991 9 1 0
1 2007-05-29 4100 North Oak Park Avenue, Chicago, IL 60634,... CULEX RESTUANS 41 N OAK PARK AVE T002 4100 N OAK PARK AVE, Chicago, IL 41.954690 -87.800991 9 1 0
2 2007-05-29 6200 North Mandell Avenue, Chicago, IL 60646, USA CULEX RESTUANS 62 N MANDELL AVE T007 6200 N MANDELL AVE, Chicago, IL 41.994991 -87.769279 9 1 0
3 2007-05-29 7900 West Foster Avenue, Chicago, IL 60656, USA CULEX PIPIENS/RESTUANS 79 W FOSTER AVE T015 7900 W FOSTER AVE, Chicago, IL 41.974089 -87.824812 8 1 0
4 2007-05-29 7900 West Foster Avenue, Chicago, IL 60656, USA CULEX RESTUANS 79 W FOSTER AVE T015 7900 W FOSTER AVE, Chicago, IL 41.974089 -87.824812 8 4 0
In [3]:
train.dtypes.sort_values()
Out[3]:
Block                       int64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
Latitude                  float64
Longitude                 float64
Date                       object
Address                    object
Species                    object
Street                     object
Trap                       object
AddressNumberAndStreet     object
dtype: object

Dimensionality Reduction

In [4]:
# Drop the address-related columns; Trap, Latitude and Longitude stay in the frame.
address_columns = ['Address', 'Block', 'Street',
                   'AddressNumberAndStreet', 'AddressAccuracy']
train = train.drop(columns=address_columns)

Überblick

In [5]:
train.Species.value_counts()
Out[5]:
CULEX PIPIENS/RESTUANS    4752
CULEX RESTUANS            2740
CULEX PIPIENS             2699
CULEX TERRITANS            222
CULEX SALINARIUS            86
CULEX TARSALIS               6
CULEX ERRATICUS              1
Name: Species, dtype: int64
In [6]:
train.groupby('Species')['WnvPresent'].mean()
Out[6]:
Species
CULEX ERRATICUS           0.000000
CULEX PIPIENS             0.088922
CULEX PIPIENS/RESTUANS    0.055135
CULEX RESTUANS            0.017883
CULEX SALINARIUS          0.000000
CULEX TARSALIS            0.000000
CULEX TERRITANS           0.000000
Name: WnvPresent, dtype: float64
In [7]:
# Convert the Date column from strings to pandas datetimes (enables the .dt accessor).
train['Date'] = pd.to_datetime(train['Date'])
In [8]:
train.Date.dt.year.value_counts()
Out[8]:
2007    3811
2013    2392
2009    2249
2011    2054
Name: Date, dtype: int64
In [9]:
train.WnvPresent.value_counts()
Out[9]:
0    9955
1     551
Name: WnvPresent, dtype: int64
In [10]:
# Pie chart (plotly) showing how the WnvPresent values are distributed.
target = train['WnvPresent'].value_counts()
levels = ['0', '1']  # defined by the original cell; not referenced in this cell
trace = go.Pie(
    labels=target.index,
    values=target.values,
    marker={'colors': ('orange', 'green')},
)
layout = {
    'title': "Ratio positiver Fälle",
    'margin': {'l': 150},
    'width': 500,
    'height': 500,
}
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
# Print the raw class counts behind the chart as well.
print(target)
0    9955
1     551
Name: WnvPresent, dtype: int64
In [11]:
# Records from the year 2007 only. The explicit .copy() makes `sieben` an
# independent frame, so adding columns to it later neither triggers
# SettingWithCopyWarning nor risks writing into a view of `train`.
sieben = train.loc[train.Date.dt.year == 2007].copy()
In [12]:
sieben['Month'] = sieben.Date.dt.month
In [13]:
sieben.head()
Out[13]:
Date Species Trap Latitude Longitude NumMosquitos WnvPresent Month
0 2007-05-29 CULEX PIPIENS/RESTUANS T002 41.954690 -87.800991 1 0 5
1 2007-05-29 CULEX RESTUANS T002 41.954690 -87.800991 1 0 5
2 2007-05-29 CULEX RESTUANS T007 41.994991 -87.769279 1 0 5
3 2007-05-29 CULEX PIPIENS/RESTUANS T015 41.974089 -87.824812 1 0 5
4 2007-05-29 CULEX RESTUANS T015 41.974089 -87.824812 4 0 5
In [14]:
sieben.groupby('Month')['NumMosquitos'].sum()
Out[14]:
Month
5        40
6       428
7      7199
8     40015
9      9300
10     1706
Name: NumMosquitos, dtype: int64
In [15]:
# Monthly totals for 2007: mosquitoes caught and sum of the WnvPresent flag.
sieben_by_month = sieben.groupby('Month')
monat_mosk = sieben_by_month['NumMosquitos'].sum()
monat_sick = sieben_by_month['WnvPresent'].sum()
In [16]:
# Grouped bar chart for 2007: per month, the total number of mosquitoes caught
# next to the monthly sum of the WnvPresent flag.
trace1 = go.Bar(x=monat_mosk.index,
                y=monat_mosk.values,
                marker=dict(color='red'),
                name='Anzahl Moskitos')

trace2 = go.Bar(x=monat_sick.index,
                y=monat_sick.values,
                marker=dict(color='blue'),
                name='Anzahl infizierter Moskitos')

data1 = [trace1, trace2]

# Title typo fixed ("Moskitopupulation" -> "Moskitopopulation").
layout1 = go.Layout(title="Verlauf Moskitopopulation in 2007",
                    barmode='group')

# Build a validated Figure object, consistent with the pie-chart cell.
fig = go.Figure(data=data1, layout=layout1)
iplot(fig)
In [17]:
verlauf_jahr = train.groupby(train.Date.dt.year)['WnvPresent'].sum()
In [18]:
def _axis_style(axis_title):
    """Return the axis configuration shared by the x and y axis of this chart.

    Uses `title.text` / `title.font` instead of the deprecated `titlefont`
    attribute; all other settings are the same for both axes.
    """
    return dict(
        title=dict(
            text=axis_title,
            font=dict(
                family='Arial, sans-serif',
                size=18,
                color='lightgrey')),
        showticklabels=True,
        tickangle=0,
        tickfont=dict(
            family='Old Standard TT, serif',
            size=14,
            color='white'),
        exponentformat='e',
        showexponent='all')


# Bar chart of the yearly sum of the WnvPresent flag.
trace2 = go.Bar(x=verlauf_jahr.index,
                y=verlauf_jahr.values,
                marker=dict(color='blue'),
                name='Anzahl Infizierter Moskitos')

data1 = [trace2]

layout = go.Layout(
    template='plotly_dark',
    title="Häufigkeit WNV Präsenz",
    xaxis=_axis_style('Jahr'),
    yaxis=_axis_style('Anzahl'),
)

fig = dict(data=data1, layout=layout)
iplot(fig)

Lage der Fallen

In [19]:
import folium
In [20]:
# One row per trap ID with its coordinates; drop_duplicates keeps the first
# occurrence of each Trap value.
trap_locs = train[['Trap', 'Longitude', 'Latitude']].drop_duplicates(subset='Trap')
In [21]:
train.loc[train.Trap == 'T138', ['Longitude', 'Latitude']].head().iloc[0]
Out[21]:
Longitude   -87.585413
Latitude     41.726465
Name: 530, dtype: float64
In [22]:
# Map of all trap locations (alternative basemap: tiles="CartoDB dark_matter").
mp = folium.Map(location=[41.8, -87.5], zoom_start=10)

# One small blue circle per trap, with the trap ID as popup.
for trap_id, lat, lon in zip(trap_locs['Trap'], trap_locs['Latitude'], trap_locs['Longitude']):
    folium.Circle(location=(lat, lon), popup=trap_id, radius=100,
                  color='blue', fill=True, fill_color='blue').add_to(mp)

# Larger red circles for four highlighted traps (coordinates are hard-coded).
highlighted_traps = [
    ('T900', (41.974689, -87.890615)),
    ('T115', (41.673408, -87.599862)),
    ('T002', (41.95469, -87.800991)),
    ('T138', (41.726465, -87.585413)),
]
for trap_id, coords in highlighted_traps:
    folium.Circle(location=coords, popup=trap_id, radius=700,
                  color='red', fill=True, fill_color='red').add_to(mp)


mp
Out[22]:

Anzahl Moskitos über alle Jahre in der Stadt

In [23]:
from folium.plugins import HeatMap
In [24]:
# Total mosquito count per coordinate pair, as [Latitude, Longitude, total] rows.
totals_by_location = train.groupby(['Latitude', 'Longitude'])['NumMosquitos'].sum()
data = totals_by_location.reset_index().values.tolist()
In [25]:
# Static heat map built from `data` (coordinates weighted by mosquito totals).
mp = folium.Map(location=[41.8, -87.5], zoom_start=10)
HeatMap(data=data, radius=8, max_zoom=13).add_to(mp)
mp
Out[25]:

Änderung der Anzahl der Moskitos in den Fallen über die Jahre

In [26]:
from folium.plugins import HeatMapWithTime
In [27]:
years = train.Date.dt.year.sort_values().unique()
In [28]:
# Build one heat-map frame per year: [Latitude, Longitude, weight] rows, where
# the weight is the location's mosquito total for that year.
year_frames = []
for year in years:
    totals = (train.loc[train.Date.dt.year == year, ['Latitude', 'Longitude', 'NumMosquitos']]
                   .groupby(['Latitude', 'Longitude'])['NumMosquitos']
                   .sum()
                   .reset_index())
    year_frames.append(totals)

# HeatMapWithTime expects weights in the range (0, 1]; raw totals run into the
# thousands and saturate the colour scale. Scale by the largest total over all
# years so the frames stay comparable with each other.
# NOTE(review): confirm the (0, 1] range against the installed folium version's docs.
peak_total = max((frame['NumMosquitos'].max() for frame in year_frames), default=0) or 1
year_list = [
    [[lat, lon, total / peak_total] for lat, lon, total in frame.values.tolist()]
    for frame in year_frames
]
In [29]:
# Animated heat map: one time step per entry of `year_list` (one entry per year),
# with a custom colour gradient and local extrema used for scaling.
mp = folium.Map(location=[41.8, -87.5], zoom_start=10)
HeatMapWithTime(year_list, radius=5, gradient={0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'}, 
                min_opacity=0.5, max_opacity=0.8, use_local_extrema=True).add_to(mp)
mp
Out[29]:

Da sich die Anzahl der Moskitos auf die verschiedenen Traps verteilt, ist hier nicht viel zu sehen.

Feature Engineering

In welchen Fallen tummeln sich die meisten Moskitos?

Diese werden 'Hot Spots' und die Entfernung dazu ein neues Feature.
Teste erstmal einen Hotspot

In [30]:
train.groupby('Trap')['NumMosquitos'].sum().sort_values(ascending=False).head()
Out[30]:
Trap
T115    21668
T900    15386
T138     9936
T002     3710
T128     3315
Name: NumMosquitos, dtype: int64
In [31]:
train.groupby('Trap')['WnvPresent'].sum().sort_values(ascending=False).head()
Out[31]:
Trap
T900    66
T115    41
T002    18
T138    16
T003    14
Name: WnvPresent, dtype: int64
In [32]:
train.NumMosquitos.sum()
Out[32]:
135039

Distance to Hotspots

In [33]:
from haversine import haversine
# Entfernung in km
In [34]:
haversine((41.954690, -87.800991),(41.974689,-87.890615))
Out[34]:
7.736591702562066
In [35]:
def _first_location(trap_id):
    """Return (Latitude, Longitude) of the first row in `train` for `trap_id`."""
    first_row = train.loc[train.Trap == trap_id, ['Latitude', 'Longitude']].iloc[0]
    return tuple(first_row.values.tolist())


# Reference coordinates of the three hotspot traps.
T900 = _first_location('T900')
T115 = _first_location('T115')
T138 = _first_location('T138')
In [36]:
def distance(row, Trap):
    """Haversine distance between a record's position and a reference point.

    Parameters
    ----------
    row : mapping with 'Latitude' and 'Longitude' entries (e.g. a DataFrame row)
    Trap : (latitude, longitude) pair of the reference location

    Returns
    -------
    The value returned by `haversine(start, stop)` for the two points.
    """
    return haversine((row['Latitude'], row['Longitude']), Trap)
In [37]:
# Add one dist_<trap> column per hotspot: the row-wise result of `distance`
# between each record's coordinates and that hotspot's coordinates.
for trap_label, trap_coords in (('T900', T900), ('T115', T115), ('T138', T138)):
    train['dist_' + trap_label] = train.apply(distance, axis=1, Trap=trap_coords)

Anzahl der Moskitos pro Falle potenzieren

In [38]:
# Cubed mosquito count as an additional feature.
train['NumMosq_3'] = train['NumMosquitos'] ** 3
In [39]:
train.head()
Out[39]:
Date Species Trap Latitude Longitude NumMosquitos WnvPresent dist_T900 dist_T115 dist_T138 NumMosq_3
0 2007-05-29 CULEX PIPIENS/RESTUANS T002 41.954690 -87.800991 1 0 7.736592 35.441519 31.031386 1
1 2007-05-29 CULEX RESTUANS T002 41.954690 -87.800991 1 0 7.736592 35.441519 31.031386 1
2 2007-05-29 CULEX RESTUANS T007 41.994991 -87.769279 1 0 10.279811 38.414514 33.517161 1
3 2007-05-29 CULEX PIPIENS/RESTUANS T015 41.974089 -87.824812 1 0 5.440165 38.279145 33.931386 1
4 2007-05-29 CULEX RESTUANS T015 41.974089 -87.824812 4 0 5.440165 38.279145 33.931386 64

Kategorische Features in numerische umwandeln

In [40]:
# One-hot encode Species into indicator columns named 'Species_<value>'.
# drop_first=True omits the first category, so it is represented by all
# indicators being zero. The result goes to `train_`; `train` is left as is.
train_ = pd.get_dummies(train, columns = ['Species'], prefix_sep='_', drop_first=True)
In [41]:
train_.head()
Out[41]:
Date Trap Latitude Longitude NumMosquitos WnvPresent dist_T900 dist_T115 dist_T138 NumMosq_3 Species_CULEX PIPIENS Species_CULEX PIPIENS/RESTUANS Species_CULEX RESTUANS Species_CULEX SALINARIUS Species_CULEX TARSALIS Species_CULEX TERRITANS
0 2007-05-29 T002 41.954690 -87.800991 1 0 7.736592 35.441519 31.031386 1 0 1 0 0 0 0
1 2007-05-29 T002 41.954690 -87.800991 1 0 7.736592 35.441519 31.031386 1 0 0 1 0 0 0
2 2007-05-29 T007 41.994991 -87.769279 1 0 10.279811 38.414514 33.517161 1 0 0 1 0 0 0
3 2007-05-29 T015 41.974089 -87.824812 1 0 5.440165 38.279145 33.931386 1 0 1 0 0 0 0
4 2007-05-29 T015 41.974089 -87.824812 4 0 5.440165 38.279145 33.931386 64 0 0 1 0 0 0
In [42]:
train.loc[(train.Date.dt.year == 2011) & (train.Date.dt.month == 8)].head()
Out[42]:
Date Species Trap Latitude Longitude NumMosquitos WnvPresent dist_T900 dist_T115 dist_T138 NumMosq_3
7081 2011-08-05 CULEX PIPIENS/RESTUANS T002 41.954690 -87.800991 21 0 7.736592 35.441519 31.031386 9261
7082 2011-08-05 CULEX RESTUANS T002 41.954690 -87.800991 3 0 7.736592 35.441519 31.031386 27
7083 2011-08-05 CULEX RESTUANS T046 41.891118 -87.654491 2 0 21.630298 24.628405 19.182912 8
7084 2011-08-05 CULEX PIPIENS/RESTUANS T048 41.867108 -87.654224 11 0 22.926487 22.005259 16.646646 1331
7085 2011-08-05 CULEX PIPIENS/RESTUANS T054 41.921965 -87.632085 1 0 22.169921 27.767095 22.079959 1

Speichern von train

In [43]:
#train_.to_csv('train_final.csv', sep=',', encoding='utf-8', index=False)