Diamonds Part II: Modeling

The dataset contains prices and attributes of about 54,000 diamonds.
The goal is to predict a diamond's price from its attributes.
Source: https://www.kaggle.com/shivam2503/diamonds
Approach: exploration and cleaning, then modeling
Algorithms:

  • Linear regression with regularization
  • Random forest regressor
In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use('Solarize_Light2')
from IPython.core.pylabtools import figsize
figsize(10, 8)

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

from sklearn.preprocessing import LabelEncoder

import os

# Show all columns
pd.set_option('display.max_columns', None)
# Change the working directory (raw string to avoid backslash escapes)
os.chdir(r'D:\Data\Projects\Regression\Diamanten Preise_Regularization')

Linear regression with the best-correlating features

In [2]:
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

Read in the dataset

In [3]:
df = pd.read_csv('df_clean.csv')

Split the dataset

In [4]:
x = df.drop('price', axis=1)
y = df.price
In [5]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[5]:
((40440, 24), (13480, 24), (40440,), (13480,))

Linear Regression

In [6]:
lr = LinearRegression()
In [7]:
features = ['carat', 'x', 'y', 'z', 'clarity_SI2', 'table', 'color_E', 'cut_Ideal', 'color_I']

lr.fit(X_train[features], y_train)
Out[7]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [8]:
y_preds = lr.predict(X_test[features])
In [9]:
mean_absolute_error(y_test, y_preds)
Out[9]:
815.0618852014234
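
The shortlist above was presumably identified during the exploration phase; for reference, a correlation-based ranking could be reproduced directly from the cleaned frame (a minimal sketch, assuming all columns of df are numeric, as the dummy encoding suggests):

# Sketch: rank features by absolute correlation with the target price
corr_with_price = df.corr()['price'].abs().sort_values(ascending=False)
print(corr_with_price.head(10))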

Linear regression with all features

In [10]:
lr.fit(X_train, y_train)
Out[10]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [11]:
y_pred = lr.predict(X_test)
In [12]:
mean_absolute_error(y_test, y_pred)
Out[12]:
734.3630291127556
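
The other imported metrics (mean_squared_error, median_absolute_error) are not used above; they could complement the MAE, for example (a small sketch):

# Sketch: report RMSE and median absolute error alongside the MAE
print(np.sqrt(mean_squared_error(y_test, y_pred)),
      median_absolute_error(y_test, y_pred))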

Linear regression with all features except the highly collinear x, y, z

In [13]:
df.columns
Out[13]:
Index(['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'carat_log',
       'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E',
       'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF',
       'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2',
       'clarity_VVS1', 'clarity_VVS2'],
      dtype='object')
In [14]:
features_nonkol = ['carat', 'depth', 'table', 'cut_Good',
       'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E', 'color_F',
       'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF', 'clarity_SI1',
       'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1',
       'clarity_VVS2']
lr.fit(X_train[features_nonkol], y_train)
Out[14]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [15]:
y_pred_nk = lr.predict(X_test[features_nonkol])
In [16]:
mean_absolute_error(y_test, y_pred_nk)
Out[16]:
805.0816963599364
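
The collinearity of carat with the dimensions x, y, z, which motivated dropping them, can be verified quickly (a minimal sketch):

# Sketch: pairwise correlations of the size-related features
print(df[['carat', 'x', 'y', 'z']].corr())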

Of the plain linear models, the one using all features gives the best results so far.

Linear regression with regularization

In [17]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)
In [18]:
lasso = Lasso(normalize=True)
lasso.fit(X_train_sc, y_train)
y_pred_lasso = lasso.predict(X_test_sc)
In [19]:
mean_absolute_error(y_test, y_pred_lasso)
Out[19]:
897.0268170692644
In [20]:
rid = Ridge(normalize=True)
rid.fit(X_train_sc, y_train)
y_pred_rid = rid.predict(X_test_sc)
In [21]:
mean_absolute_error(y_test, y_pred_rid)
Out[21]:
1068.0264367394489

Regularization degrades performance here; both models use the default alpha of 1.0, and normalize=True is applied on top of inputs that are already standardized.
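
A natural follow-up would be to tune alpha with scikit-learn's cross-validated variants instead of the default value (a sketch only, not run here, so no results are reported):

# Sketch: let cross-validation choose the regularization strength
from sklearn.linear_model import LassoCV, RidgeCV

lasso_cv = LassoCV(cv=5).fit(X_train_sc, y_train)
rid_cv = RidgeCV(alphas=np.logspace(-3, 3, 13), cv=5).fit(X_train_sc, y_train)
print(lasso_cv.alpha_, rid_cv.alpha_)
print(mean_absolute_error(y_test, lasso_cv.predict(X_test_sc)),
      mean_absolute_error(y_test, rid_cv.predict(X_test_sc)))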

Linear regression with polynomial features

PolynomialFeatures generates polynomial and interaction features: a new feature matrix consisting of all polynomial combinations of the input features with degree less than or equal to the specified degree.
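
With the 24 input columns used here, degree 2 produces 325 output columns: the bias term, 24 linear terms, and 300 squared and interaction terms. A tiny illustration of the expansion:

# Sketch: degree-2 expansion of two columns a, b -> 1, a, b, a^2, a*b, b^2
from sklearn.preprocessing import PolynomialFeatures
demo = PolynomialFeatures(degree=2)
print(demo.fit_transform([[2, 3]]))  # [[1. 2. 3. 4. 6. 9.]]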

In [22]:
from sklearn.preprocessing import PolynomialFeatures
In [23]:
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
In [24]:
lr_poly = LinearRegression()
lr_poly.fit(X_train_poly, y_train)
Out[24]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [25]:
y_pred_poly = lr_poly.predict(X_test_poly)
In [26]:
mean_absolute_error(y_test, y_pred_poly)
Out[26]:
433.5131837363767

Since not all relationships are linear, a non-linear model might perform better.

Random forest with all features

In [27]:
from sklearn.ensemble import RandomForestRegressor
In [33]:
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
mean_absolute_error(y_test, y_pred_rf)
Out[33]:
297.885075904338
In [29]:
feature_importance_values = rf.feature_importances_
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': feature_importance_values})
feature_importances.head(10)
Out[29]:
feature importance
0 carat 0.302161
1 depth 0.006478
2 table 0.003137
3 x 0.005853
4 y 0.294088
5 z 0.005933
6 carat_log 0.291535
7 cut_Good 0.000411
8 cut_Ideal 0.001054
9 cut_Premium 0.000444
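
Sorting the table makes the picture clearer: carat, y, and carat_log dominate, while the remaining features contribute comparatively little (a small sketch):

# Sketch: features sorted by importance
print(feature_importances.sort_values('importance', ascending=False).head(10))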

Hyperparameter tuning of the random forest model with randomized search and cross-validation

In [30]:
rf.get_params()
Out[30]:
{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
In [31]:
from sklearn.model_selection import RandomizedSearchCV
In [32]:
# Number of trees
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# Number of features considered at each split
max_features = ['auto', 'sqrt']
# Maximum number of levels per tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at a leaf node
min_samples_leaf = [1, 2, 4]
In [34]:
# Build the parameter grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
In [35]:
random_grid
Out[35]:
{'n_estimators': [200, 650, 1100, 1550, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 35, 60, 85, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4]}
In [36]:
# Find the best combination, using all available CPU cores (50 candidates × 3 CV folds = 150 fits)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, 
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
In [37]:
# Fit the randomized search
rf_random.fit(X_train, y_train)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 49.1min finished
Out[37]:
RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                                                   n_jobs=None, oob_score=False,
                                                   random_state=None, verbose=0,
                                                   warm_start=False),
                   iid='warn', n_iter=50, n_jobs=-1,
                   param_distributions={'max_depth': [10, 35, 60, 85, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 650, 1100, 1550,
                                                         2000]},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring=None, verbose=2)
In [38]:
best_params = rf_random.best_params_
best_params
Out[38]:
{'n_estimators': 1100,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 110}
In [39]:
best_random = rf_random.best_estimator_

Because RandomizedSearchCV was run with refit=True (the default), best_estimator_ is already fitted on the training data with the best parameters, so it does not need to be retrained and can be used directly for prediction.

In [40]:
#best_random.fit(X_train,y_train)
y_pred_rf_best = best_random.predict(X_test)
mean_absolute_error(y_test, y_pred_rf_best)
Out[40]:
296.3651882251135

Result

Among the linear models, the one with polynomial features performs best, but the random forest does even better. Even so, the MAE of about 296 is still fairly high.
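
To put that into perspective, the error can be related to the price scale of the test set (a quick sketch; no output recorded here):

# Sketch: relative error of the tuned random forest
mae = mean_absolute_error(y_test, y_pred_rf_best)
print(mae, y_test.mean(), mae / y_test.mean())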