# Notebook setup: imports, working directory, and display/plot defaults.
import pandas as pd
pd.set_option('display.max_columns', 50)  # print up to 50 columns of a frame
import numpy as np
import os
# Raw string: the Windows path contains backslash sequences ('\D', '\P', '\K')
# that are invalid escapes in a normal string literal. The resulting path is
# unchanged.
os.chdir(r'D:\Data\Projects\Klassifikation\Klassifikation_West Nile Virus')
import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)  # enable plotly output inside the notebook
import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
plt.rcParams['font.size'] = 15
from IPython.core.pylabtools import figsize
figsize(10, 10)  # default matplotlib figure size
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')  # NOTE: silences every warning for the whole session
# Load the training data and take a first look at it.
train = pd.read_csv('train.csv')
train.head()
train.dtypes.sort_values()

# Remove the address-related columns.
address_columns = ['Address', 'Block', 'Street',
                   'AddressNumberAndStreet', 'AddressAccuracy']
train = train.drop(columns=address_columns)

# Species frequencies and the mean of WnvPresent per species.
train['Species'].value_counts()
train.groupby('Species')['WnvPresent'].mean()

# Convert Date to datetime so the .dt accessor can be used on it.
train['Date'] = pd.to_datetime(train['Date'])
train['Date'].dt.year.value_counts()
train['WnvPresent'].value_counts()
# Pie chart of the WnvPresent class counts, rendered with plotly.
target = train['WnvPresent'].value_counts()
levels = ['0', '1']
trace = go.Pie(
    labels=target.index,
    values=target.values,
    marker={'colors': ('orange', 'green')},
)
layout = {
    'title': "Ratio positiver Fälle",
    'margin': {'l': 150},
    'width': 500,
    'height': 500,
}
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
# Print the counts per target class.
print(target)
# Grouped bar chart for 2007: per month, the sum of NumMosquitos and the sum
# of WnvPresent.

# .copy() makes `sieben` an independent frame, so adding the 'Month' column
# below does not assign into a slice of `train` (SettingWithCopy pattern).
sieben = train.loc[train.Date.dt.year == 2007].copy()
sieben['Month'] = sieben.Date.dt.month
sieben.head()

monat_mosk = sieben.groupby('Month')['NumMosquitos'].sum()
monat_mosk
monat_sick = sieben.groupby('Month')['WnvPresent'].sum()

trace1 = go.Bar(
    x=monat_mosk.index,
    y=monat_mosk.values,
    marker=dict(color='red'),
    name='Anzahl Moskitos',
)
trace2 = go.Bar(
    x=monat_sick.index,
    y=monat_sick.values,
    marker=dict(color='blue'),
    name='Anzahl infizierter Moskitos',
)
data1 = [trace1, trace2]
# Title typo fixed: "Moskitopupulation" -> "Moskitopopulation".
layout1 = go.Layout(title="Verlauf Moskitopopulation in 2007",
                    barmode='group')
fig = dict(data=data1, layout=layout1)
iplot(fig)
def _axis_style(axis_title):
    """Return the axis settings shared by both axes of the yearly chart.

    Only the axis title differs between the x and the y axis; fonts, tick
    settings and exponent formatting are the same for both.
    """
    return dict(
        title=axis_title,
        titlefont=dict(family='Arial, sans-serif', size=18, color='lightgrey'),
        showticklabels=True,
        tickangle=0,
        tickfont=dict(family='Old Standard TT, serif', size=14, color='white'),
        exponentformat='e',
        showexponent='all',
    )


# Bar chart of the sum of WnvPresent per calendar year.
verlauf_jahr = train.groupby(train.Date.dt.year)['WnvPresent'].sum()
trace2 = go.Bar(
    x=verlauf_jahr.index,
    y=verlauf_jahr.values,
    marker={'color': 'blue'},
    name='Anzahl Infizierter Moskitos',
)
data1 = [trace2]
layout = go.Layout(
    template='plotly_dark',
    title="Häufigkeit WNV Präsenz",
    xaxis=_axis_style('Jahr'),
    yaxis=_axis_style('Anzahl'),
)
fig = dict(data=data1, layout=layout)
iplot(fig)
import folium

# One row per trap ID (its first occurrence) with the trap's coordinates.
trap_locs = train[['Trap', 'Longitude', 'Latitude']].drop_duplicates(subset='Trap')
train.loc[train.Trap == 'T138', ['Longitude', 'Latitude']].head().iloc[0]

# Optional alternative base map: tiles="CartoDB dark_matter"
mp = folium.Map(location=[41.8, -87.5], zoom_start=10)

# Small blue circle for every trap, labelled with the trap ID.
for index, row in trap_locs.iterrows():
    trap_marker = folium.Circle(
        location=(row["Latitude"], row["Longitude"]),
        popup=row['Trap'],
        radius=100,
        color='blue',
        fill=True,
        fill_color='blue',
    )
    trap_marker.add_to(mp)

# Larger red circles at four hard-coded trap positions.
# NOTE(review): assumed to run once after the loop above, not once per trap;
# the flattened indentation of the export cannot confirm this.
highlighted_traps = (
    ('T900', (41.974689, -87.890615)),
    ('T115', (41.673408, -87.599862)),
    ('T002', (41.95469, -87.800991)),
    ('T138', (41.726465, -87.585413)),
)
for trap_id, position in highlighted_traps:
    folium.Circle(
        location=position,
        popup=trap_id,
        radius=700,
        color='red',
        fill=True,
        fill_color='red',
    ).add_to(mp)
mp
from folium.plugins import HeatMap

# Sum NumMosquitos per (Latitude, Longitude) pair and flatten the result into
# a list of [Latitude, Longitude, sum] rows for the heat map.
mosquitos_per_location = train.groupby(['Latitude', 'Longitude'])['NumMosquitos'].sum()
data = mosquitos_per_location.reset_index().values.tolist()

mp = folium.Map(location=[41.8, -87.5], zoom_start=10)
heat_layer = HeatMap(data=data, radius=8, max_zoom=13)
heat_layer.add_to(mp)
mp
from folium.plugins import HeatMapWithTime

# Build one heat-map frame per year: for every year in the data, the sum of
# NumMosquitos per (Latitude, Longitude) pair as [Latitude, Longitude, sum].
years = train.Date.dt.year.sort_values().unique()
year_list = []
for year in years:
    rows_of_year = train.loc[train.Date.dt.year == year,
                             ['Latitude', 'Longitude', 'NumMosquitos']]
    totals = rows_of_year.groupby(['Latitude', 'Longitude'])['NumMosquitos'].sum()
    year_list.append(totals.reset_index().values.tolist())

mp = folium.Map(location=[41.8, -87.5], zoom_start=10)
colour_gradient = {0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'}
animated_layer = HeatMapWithTime(
    year_list,
    radius=5,
    gradient=colour_gradient,
    min_opacity=0.5,
    max_opacity=0.8,
    use_local_extrema=True,
)
animated_layer.add_to(mp)
mp
# Because the mosquito counts are spread across the different traps, there is
# not much to see here.
# Traps are therefore treated as 'hot spots', and the distance to them becomes
# a new feature.
# Start by testing a single hot spot.
# Top traps by summed NumMosquitos and by summed WnvPresent, followed by the
# overall NumMosquitos sum.
mosquitos_by_trap = train.groupby('Trap')['NumMosquitos'].sum()
mosquitos_by_trap.sort_values(ascending=False).head()
wnv_by_trap = train.groupby('Trap')['WnvPresent'].sum()
wnv_by_trap.sort_values(ascending=False).head()
train.NumMosquitos.sum()

from haversine import haversine
# Distance in km
haversine((41.954690, -87.800991), (41.974689, -87.890615))


def _trap_position(trap_id):
    """Return (Latitude, Longitude) of the first row in `train` for trap_id."""
    first_match = train.loc[train.Trap == trap_id, ['Latitude', 'Longitude']].iloc[0]
    return tuple(first_match.values.tolist())


# Coordinates of the three traps used as distance reference points.
T900 = _trap_position('T900')
T115 = _trap_position('T115')
T138 = _trap_position('T138')
def distance(row, Trap):
    """Return the haversine distance between a row's position and a trap.

    Args:
        row: Record that can be indexed with 'Latitude' and 'Longitude'.
        Trap: Second point, handed to ``haversine`` unchanged. The callers in
            this file pass a (Latitude, Longitude) tuple.

    Returns:
        Whatever ``haversine`` returns for the two points.
    """
    return haversine((row['Latitude'], row['Longitude']), Trap)
# Add one distance column per reference trap; `distance` is applied row by
# row, with the trap coordinates passed as its second argument.
reference_traps = (('dist_T900', T900), ('dist_T115', T115), ('dist_T138', T138))
for column_name, trap_position in reference_traps:
    train[column_name] = train.apply(distance, axis=1, args=(trap_position,))

# Cube of the mosquito count as an additional feature.
train['NumMosq_3'] = train['NumMosquitos'] ** 3
train.head()

# Dummy-encode Species; the first level is dropped.
train_ = pd.get_dummies(train, columns=['Species'], prefix_sep='_', drop_first=True)
train_.head()

# Peek at the rows from August 2011.
is_august_2011 = (train.Date.dt.year == 2011) & (train.Date.dt.month == 8)
train.loc[is_august_2011].head()
#train_.to_csv('train_final.csv', sep=',', encoding='utf-8', index=False)