The dataset contains prices and attributes of roughly 54,000 diamonds.
The goal is to predict a diamond's price from its attributes.
Source: https://www.kaggle.com/shivam2503/diamonds
Approach: exploration and cleaning, then modeling.
Algorithm:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use('Solarize_Light2')
from IPython.core.pylabtools import figsize
figsize(10, 8)
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
from sklearn.preprocessing import LabelEncoder
import os
# Show all columns
pd.set_option('display.max_columns', None)
# Change the working directory (raw string so the backslashes are not treated as escape sequences)
os.chdir(r'D:\Data\Projects\Regression\Diamanten Preise_Regularization')
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error
df = pd.read_csv('df_clean.csv')
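df_clean.csv is the output of the exploration and cleaning step, which is not shown here. A minimal sketch (not the original code) of how a file with the dummy columns used below (cut_*, color_*, clarity_*) could be produced from the raw Kaggle file; the file name diamonds.csv and the drop_first choice are assumptions:
# Hypothetical reconstruction of the cleaning step (assumption, not the original code):
# read the raw Kaggle file, drop its index column and one-hot encode the categorical columns
raw = pd.read_csv('diamonds.csv')
raw = raw.drop(columns=['Unnamed: 0'], errors='ignore')
raw = pd.get_dummies(raw, columns=['cut', 'color', 'clarity'], drop_first=True)
raw.to_csv('df_clean.csv', index=False)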
x = df.drop('price', axis=1)
y = df.price
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
lr = LinearRegression()
features = ['carat', 'x', 'y', 'z', 'clarity_SI2', 'table', 'color_E', 'cut_Ideal', 'color_I']
lr.fit(X_train[features], y_train)
y_preds = lr.predict(X_test[features])
mean_absolute_error(y_test, y_preds)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
mean_absolute_error(y_test, y_pred)
df.columns
features_nonkol = ['carat', 'depth', 'table', 'cut_Good',
                   'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E', 'color_F',
                   'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF', 'clarity_SI1',
                   'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1',
                   'clarity_VVS2']
lr.fit(X_train[features_nonkol], y_train)
y_pred_nk = lr.predict(X_test[features_nonkol])
mean_absolute_error(y_test, y_pred_nk)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)
# The inputs are already standardized above; Lasso's normalize argument was removed in scikit-learn 1.2
lasso = Lasso()
lasso.fit(X_train_sc, y_train)
y_pred_lasso = lasso.predict(X_test_sc)
mean_absolute_error(y_test, y_pred_lasso)
# Likewise for Ridge on the standardized inputs
rid = Ridge()
rid.fit(X_train_sc, y_train)
y_pred_rid = rid.predict(X_test_sc)
mean_absolute_error(y_test, y_pred_rid)
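Both Lasso and Ridge above use the default regularization strength alpha=1.0. A minimal sketch of how alpha could instead be tuned with cross-validation on the standardized features; the alpha grid and cv=5 are illustrative assumptions, not values from the original analysis:
from sklearn.linear_model import LassoCV, RidgeCV
# Log-spaced grid of candidate regularization strengths (illustrative)
alphas = np.logspace(-4, 2, 25)
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=42)
lasso_cv.fit(X_train_sc, y_train)
mean_absolute_error(y_test, lasso_cv.predict(X_test_sc))
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train_sc, y_train)
mean_absolute_error(y_test, ridge_cv.predict(X_test_sc))
# Regularization strengths selected by cross-validation
lasso_cv.alpha_, ridge_cv.alpha_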
PolynomialFeatures generates polynomial and interaction features: it builds a new feature matrix containing all polynomial combinations of the features with degree less than or equal to the specified degree. A small toy example follows below.
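As a quick illustration of what a degree-2 expansion produces (the feature names a and b are just placeholders; get_feature_names_out requires scikit-learn >= 1.0):
from sklearn.preprocessing import PolynomialFeatures
# Two toy features a and b: degree 2 yields [1, a, b, a^2, a*b, b^2]
toy = np.array([[2.0, 3.0]])
poly_demo = PolynomialFeatures(degree=2)
poly_demo.fit_transform(toy)                  # array([[1., 2., 3., 4., 6., 9.]])
poly_demo.get_feature_names_out(['a', 'b'])   # '1', 'a', 'b', 'a^2', 'a b', 'b^2'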
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
# Only transform the test set; the transformer was already fitted on the training data
X_test_poly = poly.transform(X_test)
lr_poly = LinearRegression()
lr_poly.fit(X_train_poly, y_train)
y_pred_poly = lr_poly.predict(X_test_poly)
mean_absolute_error(y_test, y_pred_poly)
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
mean_absolute_error(y_test, y_pred_rf)
feature_importance_values = rf.feature_importances_
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': feature_importance_values})
feature_importances.sort_values('importance', ascending=False).head(10)
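A minimal sketch of visualizing the most important features with the matplotlib/seaborn imports from above; the plot layout is only an illustration:
# Plot the ten most important features as a horizontal bar chart
top10 = feature_importances.sort_values('importance', ascending=False).head(10)
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=top10)
plt.title('Top 10 feature importances (Random Forest)')
plt.tight_layout()
plt.show()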
rf.get_params()
from sklearn.model_selection import RandomizedSearchCV
# Number of trees
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# Number of features considered at each split (1.0 = all features, the former 'auto' setting)
max_features = [1.0, 'sqrt']
# Maximum number of levels in each tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at a leaf node
min_samples_leaf = [1, 2, 4]
# Build the search grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
random_grid
# Find the best combination, using all available CPU cores (n_jobs=-1)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=50,
                               cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the model
rf_random.fit(X_train, y_train)
best_params = rf_random.best_params_
best_params
best_random = rf_random.best_estimator_
The model has already been refitted with the best parameters (RandomizedSearchCV refits the best estimator on the full training set by default), so it does not need to be trained again; the prediction can be made directly with the best estimator.
#best_random.fit(X_train,y_train)
y_pred_rf_best = best_random.predict(X_test)
mean_absolute_error(y_test, y_pred_rf_best)
The linear model works best with polynomial features, but a Random Forest performs better still. The MAE remains fairly high, though.
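To make the comparison concrete, a minimal sketch that collects the MAE values computed above into one overview table, reusing the prediction variables defined earlier; the model labels are my own:
# Collect the test-set MAE of every model fitted above into one table
results = pd.DataFrame({
    'model': ['Linear (selected features)', 'Linear (all features)', 'Linear (non-collinear)',
              'Lasso', 'Ridge', 'Linear (polynomial)', 'Random Forest', 'Random Forest (tuned)'],
    'MAE': [mean_absolute_error(y_test, p) for p in
            [y_preds, y_pred, y_pred_nk, y_pred_lasso, y_pred_rid,
             y_pred_poly, y_pred_rf, y_pred_rf_best]]
})
results.sort_values('MAE')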