import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('Solarize_Light2')
import folium

import os
# Switch to the project directory so the relative file names used later in the
# script resolve. The path is a raw string because it contains Windows
# backslashes (\D, \P, \R, \T) that a normal string literal would treat as
# invalid escape sequences.
os.chdir(r'D:\Data\Projects\Regression\Taxi Fare Prediction_Linear Regression')

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

import pickle

# Read the training data CSV from the current working directory.
df = pd.read_csv(filepath_or_buffer='train_clean_features.csv')
# Preview the first five rows (shown as cell output when run in a notebook).
df.head(n=5)

# Train und Test

# Features: every column except the target and the pickup timestamp column.
x = df.drop(columns=['fare_amount', 'pickup_datetime'])
# Target: the fare column.
y = df['fare_amount']

# Hold out 25% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)
# Display the shapes of the four resulting pieces as a tuple.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((3657294, 11), (1219099, 11), (3657294,), (1219099,))

# Trainiertes Modell laden

# Load the saved model object from disk. The context manager closes the file
# handle as soon as unpickling finishes instead of leaving it open.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
with open('finalized_rf.sav', 'rb') as model_file:
    rfm = pickle.load(model_file)

# Plotten der Feature Importances

# Pair each feature column with the importance value reported by the loaded
# model, ordered from most to least important, with a fresh 0..n-1 index.
fi = (
    pd.DataFrame({'Feature': x.columns, 'Importance': list(rfm.feature_importances_)})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)

# Horizontal bar chart of the ranked importances. The trailing semicolon
# suppresses the returned Axes repr in notebook output.
plt.figure(figsize=(10, 6))
sns.barplot(x=fi['Importance'], y=fi['Feature'], palette="Reds_d", orient='h');

	fare_amount	pickup_datetime	pickup_longitude	pickup_latitude	dropoff_longitude	dropoff_latitude	passenger_count	year	month	day	hour	minute	dist
0	4.5	2009-06-15 17:26:21+00:00	-73.844311	40.721319	-73.841610	40.712278	1	2009	6	15	17	26	1.030765
1	16.9	2010-01-05 16:52:16+00:00	-74.016048	40.711303	-73.979268	40.782004	1	2010	1	5	16	52	8.450145
2	5.7	2011-08-18 00:35:00+00:00	-73.982738	40.761270	-73.991242	40.750562	2	2011	8	18	0	35	1.389527
3	7.7	2012-04-21 04:30:42+00:00	-73.987130	40.733143	-73.991567	40.758092	1	2012	4	21	4	30	2.799274
4	5.3	2010-03-09 07:51:00+00:00	-73.968095	40.768008	-73.956655	40.783762	1	2010	3	9	7	51	1.999160