In [1]:
# Pandas und Numpy importieren
import pandas as pd
import numpy as np

# Random seed constant. It is not referenced in the cells shown here;
# presumably later sampling/modelling steps use it for reproducibility (TODO confirm).
RSEED = 100

# Visualisierungen
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Ten-colour 'Paired' palette kept in a module-level name for later plots
palette = sns.color_palette(palette='Paired', n_colors=10)

# Apply the style sheet first, then override the font size so the override wins
plt.style.use('Solarize_Light2')
plt.rcParams.update({'font.size': 18})

import folium

# Operating-system interface
import os

# Raw string: the Windows path contains backslashes followed by letters
# (\D, \P, \T). In a normal string literal these are invalid escape
# sequences, which trigger a DeprecationWarning/SyntaxWarning and are
# slated to become a syntax error. The runtime value is unchanged.
PROJECT_DIR = r'D:\Data\Projects\Taxi Fare Prediction'

# Relative file names used in the following cells resolve against this directory
os.chdir(PROJECT_DIR)

# Datum und Zeit
import datetime as dt
from datetime import datetime

# Pandas display options
def _three_decimals(value):
    """Format a float with exactly three decimal places for display."""
    return '%.3f' % value


# Render every float shown by pandas through the formatter above
pd.set_option('display.float_format', _three_decimals)
In [2]:
# Read train.csv; because the file is very large, only the first
# 5 million observations are loaded
data = pd.read_csv(
    filepath_or_buffer='train.csv',
    nrows=5_000_000,
)
In [3]:
# Report the (rows, columns) dimensions, then preview the first five rows
print(data.shape)
data.head(n=5)
(5000000, 8)
Out[3]:
key fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count
0 2009-06-15 17:26:21.0000001 4.500 2009-06-15 17:26:21 UTC -73.844 40.721 -73.842 40.712 1
1 2010-01-05 16:52:16.0000002 16.900 2010-01-05 16:52:16 UTC -74.016 40.711 -73.979 40.782 1
2 2011-08-18 00:35:00.00000049 5.700 2011-08-18 00:35:00 UTC -73.983 40.761 -73.991 40.751 2
3 2012-04-21 04:30:42.0000001 7.700 2012-04-21 04:30:42 UTC -73.987 40.733 -73.992 40.758 1
4 2010-03-09 07:51:00.000000135 5.300 2010-03-09 07:51:00 UTC -73.968 40.768 -73.957 40.784 1
In [4]:
# The key column carries no essential information, so it is removed
data = data.drop(columns=['key'])
In [5]:
# Convert the pickup_datetime column to datetime values
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
In [6]:
# Inspect the column dtypes of the frame (pickup_datetime should now be a datetime type)
data.dtypes
Out[6]:
fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dtype: object
In [7]:
# Save the dataset (disabled: the export below is commented out, so nothing is written)
#data.to_csv('train_dt.csv', index= False)