In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np
import os

os.chdir('D:\Data\Projects\Klassifikation\Telco Customer Churn')

import plotly_express as px

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')

import seaborn as sns

from warnings import filterwarnings
filterwarnings('ignore')

from IPython.core.pylabtools import figsize
figsize(10, 10)
plt.rcParams['font.size'] = 15
In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(df.shape)
df.head()
(7043, 21)
Out[2]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes
In [3]:
df.dtypes.sort_values()
Out[3]:
SeniorCitizen         int64
tenure                int64
MonthlyCharges      float64
customerID           object
PaymentMethod        object
PaperlessBilling     object
Contract             object
StreamingMovies      object
StreamingTV          object
TechSupport          object
DeviceProtection     object
OnlineBackup         object
OnlineSecurity       object
InternetService      object
MultipleLines        object
PhoneService         object
Dependents           object
Partner              object
gender               object
TotalCharges         object
Churn                object
dtype: object
In [4]:
missing = pd.DataFrame(df.isnull().sum()).rename(columns = {0: 'total'})
missing['percent'] = missing['total'] / len(df)*100
missing = missing[missing.total != 0]
missing = missing.sort_values('percent', ascending = False)
missing
Out[4]:
total percent

Es werden keine NaN Werte gefunden, also scheinbar keine fehlenden Werte vorhanden. Jedoch: Umwandeln der Object Spalte "Total Charges" in eine numerische Spalte gab eine Fehlermeldung. Mit astype('float') konnte der Fehler nicht eingegrenzt werden, mit to_numeric wurde der Ort angegeben, wo das Konvertieren scheiterte.

In [5]:
df.loc[488, :]
Out[5]:
customerID                         4472-LVYGI
gender                                 Female
SeniorCitizen                               0
Partner                                   Yes
Dependents                                Yes
tenure                                      0
PhoneService                               No
MultipleLines                No phone service
InternetService                           DSL
OnlineSecurity                            Yes
OnlineBackup                               No
DeviceProtection                          Yes
TechSupport                               Yes
StreamingTV                               Yes
StreamingMovies                            No
Contract                             Two year
PaperlessBilling                          Yes
PaymentMethod       Bank transfer (automatic)
MonthlyCharges                          52.55
TotalCharges                                 
Churn                                      No
Name: 488, dtype: object

Es gab leere Felder in TotalCharges, die nicht von df.isnull() erkannt worden waren.

In [6]:
# Füllen der leeren Felder mit 0
df['TotalCharges'] = df['TotalCharges'].replace(' ', 0)
In [7]:
df.TotalCharges = pd.to_numeric(df['TotalCharges'])
In [8]:
df.describe()
Out[8]:
SeniorCitizen tenure MonthlyCharges TotalCharges
count 7043.000000 7043.000000 7043.000000 7043.000000
mean 0.162147 32.371149 64.761692 2279.734304
std 0.368612 24.559481 30.090047 2266.794470
min 0.000000 0.000000 18.250000 0.000000
25% 0.000000 9.000000 35.500000 398.550000
50% 0.000000 29.000000 70.350000 1394.550000
75% 0.000000 55.000000 89.850000 3786.600000
max 1.000000 72.000000 118.750000 8684.800000
In [9]:
# min tenure von 0 scheint ein neu abgeschlossener Vertrag zu sein
df[df['tenure'] == 0]
Out[9]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
488 4472-LVYGI Female 0 Yes Yes 0 No No phone service DSL Yes No Yes Yes Yes No Two year Yes Bank transfer (automatic) 52.55 0.0 No
753 3115-CZMZD Male 0 No Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 20.25 0.0 No
936 5709-LVOEQ Female 0 Yes Yes 0 Yes No DSL Yes Yes Yes No Yes Yes Two year No Mailed check 80.85 0.0 No
1082 4367-NUYAO Male 0 Yes Yes 0 Yes Yes No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 25.75 0.0 No
1340 1371-DWPAZ Female 0 Yes Yes 0 No No phone service DSL Yes Yes Yes Yes Yes No Two year No Credit card (automatic) 56.05 0.0 No
3331 7644-OMVMY Male 0 Yes Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 19.85 0.0 No
3826 3213-VVOLG Male 0 Yes Yes 0 Yes Yes No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 25.35 0.0 No
4380 2520-SGTTA Female 0 Yes Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 20.00 0.0 No
5218 2923-ARZLG Male 0 Yes Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service One year Yes Mailed check 19.70 0.0 No
6670 4075-WKNIU Female 0 Yes Yes 0 Yes Yes DSL No Yes Yes Yes Yes No Two year No Mailed check 73.35 0.0 No
6754 2775-SEFEE Male 0 No Yes 0 Yes Yes DSL Yes Yes No Yes No No Two year Yes Bank transfer (automatic) 61.90 0.0 No
In [10]:
for i in df.columns:
    if (i == 'customerID') or (i == 'tenure') or (i == 'MonthlyCharges') or (i =='TotalCharges'):
        pass
    else:
        print(i.upper());
        print(df[i].value_counts());
GENDER
Male      3555
Female    3488
Name: gender, dtype: int64
SENIORCITIZEN
0    5901
1    1142
Name: SeniorCitizen, dtype: int64
PARTNER
No     3641
Yes    3402
Name: Partner, dtype: int64
DEPENDENTS
No     4933
Yes    2110
Name: Dependents, dtype: int64
PHONESERVICE
Yes    6361
No      682
Name: PhoneService, dtype: int64
MULTIPLELINES
No                  3390
Yes                 2971
No phone service     682
Name: MultipleLines, dtype: int64
INTERNETSERVICE
Fiber optic    3096
DSL            2421
No             1526
Name: InternetService, dtype: int64
ONLINESECURITY
No                     3498
Yes                    2019
No internet service    1526
Name: OnlineSecurity, dtype: int64
ONLINEBACKUP
No                     3088
Yes                    2429
No internet service    1526
Name: OnlineBackup, dtype: int64
DEVICEPROTECTION
No                     3095
Yes                    2422
No internet service    1526
Name: DeviceProtection, dtype: int64
TECHSUPPORT
No                     3473
Yes                    2044
No internet service    1526
Name: TechSupport, dtype: int64
STREAMINGTV
No                     2810
Yes                    2707
No internet service    1526
Name: StreamingTV, dtype: int64
STREAMINGMOVIES
No                     2785
Yes                    2732
No internet service    1526
Name: StreamingMovies, dtype: int64
CONTRACT
Month-to-month    3875
Two year          1695
One year          1473
Name: Contract, dtype: int64
PAPERLESSBILLING
Yes    4171
No     2872
Name: PaperlessBilling, dtype: int64
PAYMENTMETHOD
Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: PaymentMethod, dtype: int64
CHURN
No     5174
Yes    1869
Name: Churn, dtype: int64
In [11]:
# Da die Information über Internetservice y/n gegeben ist, kann 'no internetservice' in anderen Spalten mit no ersetzt werden
for i in ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection','TechSupport','StreamingTV', 'StreamingMovies']:
    df[i] = df[i].replace('No internet service', 'No')
In [12]:
# dito für Phone Service
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')
In [13]:
df.TechSupport.value_counts()
Out[13]:
No     4999
Yes    2044
Name: TechSupport, dtype: int64
In [14]:
df.head()
Out[14]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.50 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

Visualisierungen

In [15]:
df.Churn.value_counts().sort_values(ascending=False).plot(kind='bar', color='green');

Das Dataset ist imbalanced, es gibt 2,7 Mal so viele no churn wie yes churn. Dies wird beim Modling entsprechend in Betracht gezogen.

In [16]:
df.PaymentMethod.value_counts().sort_values(ascending=False).plot(kind='bar');
In [17]:
sns.catplot(x="PaymentMethod", kind="count", palette="ch:.30", data=df);
In [18]:
#for i in df.columns:
    #if (i == 'customerID') or (i == 'tenure') or(i== 'MonthlyCharges') or (i=='TotalCharges'):
      #  pass
   # else:
      #  sns.catplot(x=i, kind="count", palette="ch:.25", data=df);

Visualisierung mit plotly

In [19]:
target= df['Churn'].value_counts()
levels = ['No','Yes']
trace = go.Pie(labels=target.index,values=target.values, marker=dict(colors=('orange','green')))
layout = dict(title="Telco Customer Churn Ratio", margin=dict(l=150), width=500, height=500)
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
#print target class counts
print(target)
No     5174
Yes    1869
Name: Churn, dtype: int64
In [20]:
#Bezahlarten
target= df['PaymentMethod'].value_counts()
levels = ['Electronic check','Mailed check','Bank transfer','Credit card']
trace = go.Pie(labels=target.index,values=target.values)
layout = dict(title="Telco Customer Payment Method", margin=dict(l=50), width=800, height=500)
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
In [21]:
# Visualisierungen mit Seaborn: KDE für numerische Spalten
def kdeplot(feature):
    plt.figure(figsize=(9, 4))
    plt.title("KDE for {}".format(feature))
    ax0 = sns.kdeplot(df[df['Churn'] == 'No'][feature].dropna(), color= 'navy', label= 'Churn: No')
    ax1 = sns.kdeplot(df[df['Churn'] == 'Yes'][feature].dropna(), color= 'orange', label= 'Churn: Yes')
    
kdeplot('tenure')
kdeplot('MonthlyCharges')
kdeplot('TotalCharges')

Visualsisierung mit ploty express

In [22]:
px.histogram(df, x='TotalCharges')
In [23]:
px.histogram(df, x='tenure', color='Contract')
In [24]:
px.histogram(df, x='tenure', color='Churn')
In [25]:
px.histogram(df, x='MonthlyCharges', color = 'InternetService' )
In [26]:
px.histogram(df, x='SeniorCitizen', color = 'PaperlessBilling', template= 'plotly_dark')
In [27]:
px.histogram(df, x='MonthlyCharges', color='Churn')
In [28]:
# Log der MonthlyCharges
df['MonthlyCharges_log'] = np.log(df.MonthlyCharges)
In [29]:
px.histogram(df, x='MonthlyCharges_log', color='Churn')
In [30]:
px.histogram(df, x='gender', color='Churn')
In [31]:
px.histogram(df, x='TechSupport', color='Churn')
In [32]:
px.histogram(df, x='Contract', color='Churn')
In [33]:
iplot([go.Scatter(x=[1, 2, 3], y=[3, 1, 6])])
In [34]:
df['tenure_bin'] = pd.cut(df['tenure'], bins = list(range(0, 72, 12))).astype(str)
df.loc[df['tenure_bin'] == 'nan', 'tenure_bin'] = '[60-72]'
In [35]:
df['tenure_bin'].value_counts().sort_index().plot.bar(color = 'b', edgecolor = 'k');
In [36]:
px.histogram(df, x='tenure_bin', color='Churn')

Bins erstellen für die Visualisierung mit plotly express

In [37]:
aver = df.groupby(['tenure_bin', 'Churn'])['MonthlyCharges'].mean().reset_index()
aver.tenure_bin = aver.tenure_bin.replace('(0.0, 12.0]', '0-12 months')
aver.tenure_bin = aver.tenure_bin.replace('(12.0, 24.0]', '12-24 months')
aver.tenure_bin = aver.tenure_bin.replace('(24.0, 36.0]', '24-36 months')
aver.tenure_bin = aver.tenure_bin.replace('(36.0, 48.0]', '36-48 months')
aver.tenure_bin = aver.tenure_bin.replace('(48.0, 60.0]', '48-60 months')
aver.tenure_bin = aver.tenure_bin.replace('[60-72]', '60-72 months')
aver
Out[37]:
tenure_bin Churn MonthlyCharges
0 0-12 months No 46.766169
1 0-12 months Yes 66.493973
2 12-24 months No 54.449589
3 12-24 months Yes 78.509014
4 24-36 months No 60.415414
5 24-36 months Yes 84.266389
6 36-48 months No 61.977229
7 36-48 months Yes 84.790000
8 48-60 months No 67.579143
9 48-60 months Yes 88.182500
10 60-72 months No 74.166415
11 60-72 months Yes 97.317742
In [38]:
px.bar(aver, x='tenure_bin', y = 'MonthlyCharges', color='Churn', barmode="group")

Cluster Analyse wesentlicher Faktoren

In [39]:
from sklearn.cluster import KMeans
In [40]:
clu = df[['MonthlyCharges', 'tenure']][df.Churn == 'Yes']
kmeans = KMeans(n_clusters = 3, random_state = 0).fit(clu)
In [41]:
clu['label'] = kmeans.labels_
clu.head()
Out[41]:
MonthlyCharges tenure label
2 53.85 2 0
4 70.70 2 2
5 99.65 8 2
8 104.80 28 1
13 103.70 49 1
In [42]:
figsize(10, 10)
plt.scatter(clu['MonthlyCharges'], clu['tenure'], c=clu['label'], cmap='Accent')
plt.xlabel('Monthly Charges')
plt.ylabel('Tenure')
plt.title('3 Clusters of Churners')
plt.show()

Kategorische Variablen in numerische umwandeln

In [43]:
df_ = df.drop(['customerID', 'tenure_bin', 'Churn'], axis=1)
In [44]:
df_= pd.get_dummies(df_, prefix_sep='_', drop_first=True)
In [45]:
df_.shape
Out[45]:
(7043, 24)
In [46]:
df_.head()
Out[46]:
SeniorCitizen tenure MonthlyCharges TotalCharges MonthlyCharges_log gender_Male Partner_Yes Dependents_Yes PhoneService_Yes MultipleLines_Yes InternetService_Fiber optic InternetService_No OnlineSecurity_Yes OnlineBackup_Yes DeviceProtection_Yes TechSupport_Yes StreamingTV_Yes StreamingMovies_Yes Contract_One year Contract_Two year PaperlessBilling_Yes PaymentMethod_Credit card (automatic) PaymentMethod_Electronic check PaymentMethod_Mailed check
0 0 1 29.85 29.85 3.396185 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0
1 0 34 56.95 1889.50 4.042174 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1
2 0 2 53.85 108.15 3.986202 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1
3 0 45 42.30 1840.75 3.744787 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0
4 0 2 70.70 151.65 4.258446 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0
In [47]:
df_['Churn'] = df.Churn
In [48]:
df_['Churn'] = df_['Churn'].replace({'Yes': 1, 'No': 0})
In [49]:
df_.head()
Out[49]:
SeniorCitizen tenure MonthlyCharges TotalCharges MonthlyCharges_log gender_Male Partner_Yes Dependents_Yes PhoneService_Yes MultipleLines_Yes InternetService_Fiber optic InternetService_No OnlineSecurity_Yes OnlineBackup_Yes DeviceProtection_Yes TechSupport_Yes StreamingTV_Yes StreamingMovies_Yes Contract_One year Contract_Two year PaperlessBilling_Yes PaymentMethod_Credit card (automatic) PaymentMethod_Electronic check PaymentMethod_Mailed check Churn
0 0 1 29.85 29.85 3.396185 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0
1 0 34 56.95 1889.50 4.042174 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0
2 0 2 53.85 108.15 3.986202 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1 1
3 0 45 42.30 1840.75 3.744787 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0
4 0 2 70.70 151.65 4.258446 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1

Korrelation berechnen

In [50]:
df_.corr()['Churn'].sort_values()
Out[50]:
tenure                                  -0.352229
Contract_Two year                       -0.302253
InternetService_No                      -0.227890
TotalCharges                            -0.198324
Contract_One year                       -0.177820
OnlineSecurity_Yes                      -0.171226
TechSupport_Yes                         -0.164674
Dependents_Yes                          -0.164221
Partner_Yes                             -0.150448
PaymentMethod_Credit card (automatic)   -0.134302
PaymentMethod_Mailed check              -0.091683
OnlineBackup_Yes                        -0.082255
DeviceProtection_Yes                    -0.066160
gender_Male                             -0.008612
PhoneService_Yes                         0.011942
MultipleLines_Yes                        0.040102
StreamingMovies_Yes                      0.061382
StreamingTV_Yes                          0.063228
SeniorCitizen                            0.150889
PaperlessBilling_Yes                     0.191825
MonthlyCharges                           0.193356
MonthlyCharges_log                       0.210326
PaymentMethod_Electronic check           0.301919
InternetService_Fiber optic              0.308020
Churn                                    1.000000
Name: Churn, dtype: float64
In [51]:
# Heatmap der Korrelationen im Datensatz
corrs = df_.corr()
plt.figure(figsize = (20, 20))
sns.heatmap(corrs, annot = True, vmin = -1, vmax = 1, fmt = '.3f', cmap='viridis');

Datensatz speichern

In [52]:
#df_.to_csv('df_clean.csv', sep=',', encoding= 'utf-8', index=False)