# Pandas und Numpy importieren
import pandas as pd
import numpy as np

# Random seed constant (not referenced in the visible part of this file;
# presumably passed to random number generators / models later — verify)
RSEED = 100

# Visualisierungen
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# 10-colour 'Paired' seaborn palette
palette = sns.color_palette('Paired', n_colors=10)
# Apply the style sheet first, then set the base font size
plt.style.use('Solarize_Light2')
plt.rcParams.update({'font.size': 18})

import folium

# Interface to the operating system
import os

# Switch the working directory to the project folder so that relative file
# names resolve there. The path is a raw string: it contains backslashes
# (\D, \P, \T) that a normal string literal would treat as invalid escape
# sequences.
os.chdir(r'D:\Data\Projects\Taxi Fare Prediction')

# Datum und Zeit
import datetime as dt
from datetime import datetime

# Pandas display options
# Render floats with three decimal places
pd.set_option('display.float_format', '{:.3f}'.format)

# Read train.csv; the file is very large, so only the first 5 million
# observations are loaded
data = pd.read_csv(
    filepath_or_buffer='train.csv',
    nrows=5_000_000,
)

# Print the (rows, columns) tuple, then show the first five rows
print(data.shape)
data.head(n=5)

(5000000, 8)

# Drop the `key` column (noted by the author as carrying no essential
# information) and convert pickup_datetime to a datetime dtype
data = (
    data
    .drop(columns=['key'])
    .assign(pickup_datetime=lambda frame: pd.to_datetime(frame['pickup_datetime']))
)

data.dtypes

fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dtype: object

# Datensatz speichern
#data.to_csv('train_dt.csv', index= False)

	key	fare_amount	pickup_datetime	pickup_longitude	pickup_latitude	dropoff_longitude	dropoff_latitude	passenger_count
0	2009-06-15 17:26:21.0000001	4.500	2009-06-15 17:26:21 UTC	-73.844	40.721	-73.842	40.712	1
1	2010-01-05 16:52:16.0000002	16.900	2010-01-05 16:52:16 UTC	-74.016	40.711	-73.979	40.782	1
2	2011-08-18 00:35:00.00000049	5.700	2011-08-18 00:35:00 UTC	-73.983	40.761	-73.991	40.751	2
3	2012-04-21 04:30:42.0000001	7.700	2012-04-21 04:30:42 UTC	-73.987	40.733	-73.992	40.758	1
4	2010-03-09 07:51:00.000000135	5.300	2010-03-09 07:51:00 UTC	-73.968	40.768	-73.957	40.784	1