In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Select matplotlib's 'Solarize_Light2' style sheet for the figures drawn in this notebook.
plt.style.use('Solarize_Light2')
import folium

import os
# Switch the working directory so the relative file names used below resolve.
# The path is a raw string: it contains backslashes (\D, \P, \R, \T) which, in a
# normal string literal, are invalid escape sequences and trigger a SyntaxWarning
# on current Python versions (and are slated to become an error).
# NOTE(review): this absolute path only exists on the original author's machine;
# consider replacing it with a configurable, relative data directory.
os.chdir(r'D:\Data\Projects\Regression\Taxi Fare Prediction_Linear Regression')

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

import pickle
In [2]:
# Read the training CSV (resolved relative to the current working directory).
df = pd.read_csv(filepath_or_buffer='train_clean_features.csv')
# Preview the first five rows as the cell output.
df.head(n=5)
Out[2]:
fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count year month day hour minute dist
0 4.5 2009-06-15 17:26:21+00:00 -73.844311 40.721319 -73.841610 40.712278 1 2009 6 15 17 26 1.030765
1 16.9 2010-01-05 16:52:16+00:00 -74.016048 40.711303 -73.979268 40.782004 1 2010 1 5 16 52 8.450145
2 5.7 2011-08-18 00:35:00+00:00 -73.982738 40.761270 -73.991242 40.750562 2 2011 8 18 0 35 1.389527
3 7.7 2012-04-21 04:30:42+00:00 -73.987130 40.733143 -73.991567 40.758092 1 2012 4 21 4 30 2.799274
4 5.3 2010-03-09 07:51:00+00:00 -73.968095 40.768008 -73.956655 40.783762 1 2010 3 9 7 51 1.999160

Aufteilung in Trainings- und Testdaten

In [3]:
# Feature matrix: every column except the target and the pickup timestamp column.
x = df.drop(columns=['fare_amount', 'pickup_datetime'])
# Target vector: the fare amount column.
y = df['fare_amount']
In [4]:
# Hold out 25 % of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)
# Display the shapes of the four partitions as a quick sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))
Out[4]:
((3657294, 11), (1219099, 11), (3657294,), (1219099,))

Trainiertes Modell laden

In [5]:
# Load the previously saved model object (presumably the fitted random forest;
# verify against the notebook that wrote 'finalized_rf.sav').
# The context manager closes the file handle as soon as unpickling finishes,
# instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('finalized_rf.sav', 'rb') as model_file:
    rfm = pickle.load(model_file)

Plotten der Feature Importances

In [6]:
# Pair each feature column name with the importance score reported by the loaded model.
fi = pd.DataFrame(
    dict(
        Feature=x.columns,
        Importance=list(rfm.feature_importances_),
    )
)
In [7]:
# Order the table from most to least important and renumber the rows from 0.
fi = (
    fi
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)
In [8]:
# Horizontal bar chart of the feature importances, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=fi, x='Importance', y='Feature', palette="Reds_d", orient='h', ax=ax);