In [1]:
# Pandas und Numpy importieren
import pandas as pd
import numpy as np

# Visualisierungen
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
plt.style.use('Solarize_Light2')

# Karten
import folium

# System interface
import os
# Raw string literal: the Windows path contains backslash sequences (\D, \P, \R, \T)
# that are not valid escapes. In a normal string literal they only work by accident and
# emit a DeprecationWarning/SyntaxWarning on current Python versions. The resulting
# path value is unchanged.
# NOTE: machine-specific absolute path - adjust it when running the notebook elsewhere.
os.chdir(r'D:\Data\Projects\Regression\Taxi Fare Prediction_Linear Regression')

# Datum und Zeit
import datetime as dt
from datetime import datetime
In [2]:
# Load the training data; the relative file name is resolved against the working
# directory that was set with os.chdir in the setup cell.
data = pd.read_csv('train_dt_clean.csv')
In [3]:
# Quick look at the last rows of the loaded frame
data.tail()
Out[3]:
fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count
4876388 16.5 2011-01-24 21:33:44+00:00 -74.003883 40.725772 -73.969391 40.800830 1
4876389 9.0 2013-10-11 12:12:00+00:00 -73.995105 40.739897 -73.985217 40.731950 2
4876390 10.5 2014-12-06 23:04:28+00:00 -73.981063 40.764125 -73.979259 40.781857 2
4876391 10.0 2015-05-30 19:01:24+00:00 -73.965401 40.759140 -73.971886 40.750870 1
4876392 4.9 2012-07-11 08:12:00+00:00 -73.972595 40.743177 -73.965820 40.754412 6
In [4]:
# Column dtypes right after loading (pickup_datetime has not been converted yet)
data.dtypes
Out[4]:
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object
In [5]:
# Convert the pickup_datetime column to a proper datetime dtype
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
In [6]:
# Re-check the dtypes to confirm the pickup_datetime conversion
data.dtypes
Out[6]:
fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dtype: object
In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()
Out[7]:
fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count
count 4.876393e+06 4.876393e+06 4.876393e+06 4.876393e+06 4.876393e+06 4.876393e+06
mean 1.135564e+01 -7.397524e+01 4.075107e+01 -7.397439e+01 4.075144e+01 1.685362e+00
std 9.717050e+00 3.887660e-02 2.990668e-02 3.802680e-02 3.308770e-02 1.308125e+00
min 2.510000e+00 -7.498856e+01 4.003383e+01 -7.499828e+01 4.000565e+01 0.000000e+00
25% 6.000000e+00 -7.399228e+01 4.073657e+01 -7.399159e+01 4.073562e+01 1.000000e+00
50% 8.500000e+00 -7.398211e+01 4.075337e+01 -7.398062e+01 4.075388e+01 1.000000e+00
75% 1.250000e+01 -7.396837e+01 4.076756e+01 -7.396541e+01 4.076842e+01 2.000000e+00
max 9.520000e+02 -7.206320e+01 4.192279e+01 -7.206700e+01 4.199811e+01 9.000000e+00
In [8]:
# Distribution of the target (fare_amount) as a Matplotlib histogram
fig, ax = plt.subplots()
ax.hist(data['fare_amount'], bins=100, color='blue', edgecolor='black');

Bessere Darstellung des Fahrpreises auf einer Log-Skala

In [9]:
# Fares log
#data['log10_fare'] = np.log10(data['fare_amount'])
In [10]:
# Graph der Fares auf Log Skala
#plt.hist(data.log10_fare, color = 'blue', edgecolor = 'black', bins = 50);
In [11]:
# Histogram of log10(fare_amount), computed on the fly without adding a column to `data`
logs = np.log10(data['fare_amount'])
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(logs, bins=50, color='red', edgecolor='white');
In [12]:
# Show the first 1000 pickup and drop-off locations on a Folium map.
# Each trip is drawn as a circle whose radius is the fare amount.

df = data[:1000]

m = folium.Map(location=[40.75, -74], tiles='CartoDB dark_matter', zoom_start=11)

# folium.Circle takes a single number as radius; the original passed a one-element list.
# Iterating the needed columns with zip avoids building a Series per row (iterrows).

# Pickups first (default colour) ...
for lat, lon, fare in zip(df['pickup_latitude'], df['pickup_longitude'], df['fare_amount']):
    folium.Circle(location=[lat, lon], radius=float(fare)).add_to(m)

# ... then drop-offs in red, so they are added after (and drawn over) the pickups.
for lat, lon, fare in zip(df['dropoff_latitude'], df['dropoff_longitude'], df['fare_amount']):
    folium.Circle(location=[lat, lon], radius=float(fare), color='red').add_to(m)

# Last expression of the cell: renders the map
m
Out[12]:
In [13]:
# Does the fare depend on the number of passengers?
# Uses the first 1000 rows, the same sample as the map cell. The sample is taken
# directly from `data` instead of via `df`, because `df` is rebound to the 2015 subset
# further down; re-running this cell afterwards would otherwise plot different data.
sns.lmplot(x='passenger_count', y='fare_amount', data=data[:1000]);

Feature Engineering

Um Trends in einer Zeitreihe sichtbar zu machen, kann es sinnvoll sein, aus einer Datums- und Zeitspalte Informationen zu extrahieren.

In [14]:
# Derive calendar/time features from pickup_datetime: one new column per component,
# each named after the .dt accessor attribute it is read from.
for part in ('year', 'month', 'day', 'hour', 'minute'):
    data[part] = getattr(data['pickup_datetime'].dt, part)
In [15]:
# Number of trips per year
data['year'].value_counts()
Out[15]:
2012    782408
2011    775414
2013    762125
2009    756806
2010    734302
2014    724549
2015    340789
Name: year, dtype: int64

Eine neue Variable mit Haversine hinzufügen

In [16]:
# New feature: trip length as the haversine (great-circle) distance in kilometres between
# pickup and drop-off. This is only a relative measure, not the distance actually driven.
#
# Computed with vectorised NumPy operations on whole columns instead of a row-wise
# DataFrame.apply(axis=1), which calls a Python function once per row.
def haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in km between two points given in degrees.

    Accepts scalars, NumPy arrays or pandas Series (element-wise, broadcasting).
    """
    earth_radius_km = 6371.0088  # mean Earth radius; same constant as the haversine package
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    # Clip guards against rounding pushing `a` marginally outside [0, 1] (arcsin -> NaN)
    return 2 * earth_radius_km * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))


data["dist"] = haversine_km(data["pickup_latitude"], data["pickup_longitude"],
                            data["dropoff_latitude"], data["dropoff_longitude"])

Korrelation der Variablen mit dem Preis

In [17]:
# Pearson correlation of every numeric column with the fare.
# Numeric columns are selected explicitly: `data` also holds the tz-aware
# pickup_datetime column, which corr() no longer drops silently on pandas >= 2.0.
data.select_dtypes(include='number').corr()['fare_amount']
Out[17]:
fare_amount          1.000000
pickup_longitude     0.379446
pickup_latitude     -0.189901
dropoff_longitude    0.283192
dropoff_latitude    -0.153166
passenger_count      0.013504
year                 0.115962
month                0.024774
day                  0.001003
hour                -0.017668
minute              -0.007596
dist                 0.816346
Name: fare_amount, dtype: float64
In [19]:
# Absolute correlation of every numeric feature with the fare, strongest first.
# - select_dtypes keeps only numeric columns, so the tz-aware pickup_datetime column
#   cannot break corr() on pandas versions that no longer drop non-numeric columns.
# - The target itself is removed by label rather than by slicing off the first entry
#   of the sorted result.
corrs = (
    data.select_dtypes(include='number')
    .corr()['fare_amount']
    .drop('fare_amount')
    .abs()
    .sort_values(ascending=False)
)
plt.figure(figsize = (10, 8))
sns.barplot(y=corrs.index, x=corrs.values, palette="Greens_d", orient='h');
plt.title('Korrelation der Variablen mit dem Fahrpreis');

Heatmap der Korrelationen

In [20]:
# Pairwise correlation matrix of the numeric columns, shown as an annotated heatmap.
# Numeric columns are selected explicitly because `data` also contains the tz-aware
# pickup_datetime column, which corr() no longer drops silently on pandas >= 2.0.
corrs = data.select_dtypes(include='number').corr()
plt.rcParams['font.size'] = 12  # note: changes the font size for all later figures too
plt.figure(figsize = (12, 12))
sns.heatmap(corrs, annot = True, vmin = -1, vmax = 1, fmt = '.3f');
In [21]:
# Idea for a new feature: the hours with the most pickups could be rush hours -
# are longer rides more expensive? Look at the trips from 2015 only.
is_2015 = data['year'] == 2015
df = data[is_2015]
In [22]:
# Number of pickups per hour of the day, as a bar chart
pickups_per_hour = df.groupby('hour').size()
pickups_per_hour.plot(kind='bar', color='b');
In [23]:
# die meisten Pickups gibt es um 19h
In [24]:
# Mean fare and mean distance per pickup hour, from a single groupby
hourly_means = df.groupby('hour')[['fare_amount', 'dist']].mean()
hour_fare = hourly_means['fare_amount']
hour_dist = hourly_means['dist']

# Bar chart of the mean fare per hour
hour_fare.plot(kind='bar', color='b');
In [25]:
# Bar chart of the mean trip distance per pickup hour
hour_dist.plot(kind='bar', color='b');
In [26]:
# Nachts werden Taxis für längere Strecken genommen, daher ist es teurer. Die Uhrzeit macht weniger aus.
In [27]:
#data.to_csv('train_clean_features.csv', sep=',', encoding='utf-8', index=False)