import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import os
os.chdir('D:\Data\Projects\Klassifikation\Telco Customer Churn')
import plotly_express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
from IPython.core.pylabtools import figsize
figsize(10, 10)
plt.rcParams['font.size'] = 15
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(df.shape)
df.head()
df.dtypes.sort_values()
missing = pd.DataFrame(df.isnull().sum()).rename(columns = {0: 'total'})
missing['percent'] = missing['total'] / len(df)*100
missing = missing[missing.total != 0]
missing = missing.sort_values('percent', ascending = False)
missing
Es werden keine NaN Werte gefunden, also scheinbar keine fehlenden Werte vorhanden. Jedoch: Umwandeln der Object Spalte "Total Charges" in eine numerische Spalte gab eine Fehlermeldung. Mit astype('float') konnte der Fehler nicht eingegrenzt werden, mit to_numeric wurde der Ort angegeben, wo das Konvertieren scheiterte.
df.loc[488, :]
Es gab leere Felder in TotalCharges, die nicht von df.isnull() erkannt worden waren.
# Füllen der leeren Felder mit 0
df['TotalCharges'] = df['TotalCharges'].replace(' ', 0)
df.TotalCharges = pd.to_numeric(df['TotalCharges'])
df.describe()
# min tenure von 0 scheint ein neu abgeschlossener Vertrag zu sein
df[df['tenure'] == 0]
for i in df.columns:
if (i == 'customerID') or (i == 'tenure') or (i == 'MonthlyCharges') or (i =='TotalCharges'):
pass
else:
print(i.upper());
print(df[i].value_counts());
# Da die Information über Internetservice y/n gegeben ist, kann 'no internetservice' in anderen Spalten mit no ersetzt werden
for i in ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection','TechSupport','StreamingTV', 'StreamingMovies']:
df[i] = df[i].replace('No internet service', 'No')
# dito für Phone Service
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')
df.TechSupport.value_counts()
df.head()
df.Churn.value_counts().sort_values(ascending=False).plot(kind='bar', color='green');
df.PaymentMethod.value_counts().sort_values(ascending=False).plot(kind='bar');
sns.catplot(x="PaymentMethod", kind="count", palette="ch:.30", data=df);
#for i in df.columns:
#if (i == 'customerID') or (i == 'tenure') or(i== 'MonthlyCharges') or (i=='TotalCharges'):
# pass
# else:
# sns.catplot(x=i, kind="count", palette="ch:.25", data=df);
target= df['Churn'].value_counts()
levels = ['No','Yes']
trace = go.Pie(labels=target.index,values=target.values, marker=dict(colors=('orange','green')))
layout = dict(title="Telco Customer Churn Ratio", margin=dict(l=150), width=500, height=500)
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
#print target class counts
print(target)
#Bezahlarten
target= df['PaymentMethod'].value_counts()
levels = ['Electronic check','Mailed check','Bank transfer','Credit card']
trace = go.Pie(labels=target.index,values=target.values)
layout = dict(title="Telco Customer Payment Method", margin=dict(l=50), width=800, height=500)
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
# Visualisierungen mit Seaborn: KDE für numerische Spalten
def kdeplot(feature):
plt.figure(figsize=(9, 4))
plt.title("KDE for {}".format(feature))
ax0 = sns.kdeplot(df[df['Churn'] == 'No'][feature].dropna(), color= 'navy', label= 'Churn: No')
ax1 = sns.kdeplot(df[df['Churn'] == 'Yes'][feature].dropna(), color= 'orange', label= 'Churn: Yes')
kdeplot('tenure')
kdeplot('MonthlyCharges')
kdeplot('TotalCharges')
px.histogram(df, x='TotalCharges')
px.histogram(df, x='tenure', color='Contract')
px.histogram(df, x='tenure', color='Churn')
px.histogram(df, x='MonthlyCharges', color = 'InternetService' )
px.histogram(df, x='SeniorCitizen', color = 'PaperlessBilling', template= 'plotly_dark')
px.histogram(df, x='MonthlyCharges', color='Churn')
# Log der MonthlyCharges
df['MonthlyCharges_log'] = np.log(df.MonthlyCharges)
px.histogram(df, x='MonthlyCharges_log', color='Churn')
px.histogram(df, x='gender', color='Churn')
px.histogram(df, x='TechSupport', color='Churn')
px.histogram(df, x='Contract', color='Churn')
iplot([go.Scatter(x=[1, 2, 3], y=[3, 1, 6])])
df['tenure_bin'] = pd.cut(df['tenure'], bins = list(range(0, 72, 12))).astype(str)
df.loc[df['tenure_bin'] == 'nan', 'tenure_bin'] = '[60-72]'
df['tenure_bin'].value_counts().sort_index().plot.bar(color = 'b', edgecolor = 'k');
px.histogram(df, x='tenure_bin', color='Churn')
aver = df.groupby(['tenure_bin', 'Churn'])['MonthlyCharges'].mean().reset_index()
aver.tenure_bin = aver.tenure_bin.replace('(0.0, 12.0]', '0-12 months')
aver.tenure_bin = aver.tenure_bin.replace('(12.0, 24.0]', '12-24 months')
aver.tenure_bin = aver.tenure_bin.replace('(24.0, 36.0]', '24-36 months')
aver.tenure_bin = aver.tenure_bin.replace('(36.0, 48.0]', '36-48 months')
aver.tenure_bin = aver.tenure_bin.replace('(48.0, 60.0]', '48-60 months')
aver.tenure_bin = aver.tenure_bin.replace('[60-72]', '60-72 months')
aver
px.bar(aver, x='tenure_bin', y = 'MonthlyCharges', color='Churn', barmode="group")
from sklearn.cluster import KMeans
clu = df[['MonthlyCharges', 'tenure']][df.Churn == 'Yes']
kmeans = KMeans(n_clusters = 3, random_state = 0).fit(clu)
clu['label'] = kmeans.labels_
clu.head()
figsize(10, 10)
plt.scatter(clu['MonthlyCharges'], clu['tenure'], c=clu['label'], cmap='Accent')
plt.xlabel('Monthly Charges')
plt.ylabel('Tenure')
plt.title('3 Clusters of Churners')
plt.show()
df_ = df.drop(['customerID', 'tenure_bin', 'Churn'], axis=1)
df_= pd.get_dummies(df_, prefix_sep='_', drop_first=True)
df_.shape
df_.head()
df_['Churn'] = df.Churn
df_['Churn'] = df_['Churn'].replace({'Yes': 1, 'No': 0})
df_.head()
df_.corr()['Churn'].sort_values()
# Heatmap der Korrelationen im Datensatz
corrs = df_.corr()
plt.figure(figsize = (20, 20))
sns.heatmap(corrs, annot = True, vmin = -1, vmax = 1, fmt = '.3f', cmap='viridis');
#df_.to_csv('df_clean.csv', sep=',', encoding= 'utf-8', index=False)