import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np
import os

os.chdir('D:\Data\Projects\Klassifikation\Telco Customer Churn')

import plotly_express as px

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')

import seaborn as sns

from warnings import filterwarnings
filterwarnings('ignore')

from IPython.core.pylabtools import figsize
figsize(10, 10)
plt.rcParams['font.size'] = 15

df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(df.shape)
df.head()

(7043, 21)

df.dtypes.sort_values()

SeniorCitizen         int64
tenure                int64
MonthlyCharges      float64
customerID           object
PaymentMethod        object
PaperlessBilling     object
Contract             object
StreamingMovies      object
StreamingTV          object
TechSupport          object
DeviceProtection     object
OnlineBackup         object
OnlineSecurity       object
InternetService      object
MultipleLines        object
PhoneService         object
Dependents           object
Partner              object
gender               object
TotalCharges         object
Churn                object
dtype: object

missing = pd.DataFrame(df.isnull().sum()).rename(columns = {0: 'total'})
missing['percent'] = missing['total'] / len(df)*100
missing = missing[missing.total != 0]
missing = missing.sort_values('percent', ascending = False)
missing

Es werden keine NaN Werte gefunden, also scheinbar keine fehlenden Werte vorhanden. Jedoch: Umwandeln der Object Spalte "Total Charges" in eine numerische Spalte gab eine Fehlermeldung. Mit astype('float') konnte der Fehler nicht eingegrenzt werden, mit to_numeric wurde der Ort angegeben, wo das Konvertieren scheiterte.

df.loc[488, :]

customerID                         4472-LVYGI
gender                                 Female
SeniorCitizen                               0
Partner                                   Yes
Dependents                                Yes
tenure                                      0
PhoneService                               No
MultipleLines                No phone service
InternetService                           DSL
OnlineSecurity                            Yes
OnlineBackup                               No
DeviceProtection                          Yes
TechSupport                               Yes
StreamingTV                               Yes
StreamingMovies                            No
Contract                             Two year
PaperlessBilling                          Yes
PaymentMethod       Bank transfer (automatic)
MonthlyCharges                          52.55
TotalCharges                                 
Churn                                      No
Name: 488, dtype: object

Es gab leere Felder in TotalCharges, die nicht von df.isnull() erkannt worden waren.

# Füllen der leeren Felder mit 0
df['TotalCharges'] = df['TotalCharges'].replace(' ', 0)

df.TotalCharges = pd.to_numeric(df['TotalCharges'])

df.describe()

# min tenure von 0 scheint ein neu abgeschlossener Vertrag zu sein
df[df['tenure'] == 0]

for i in df.columns:
    if (i == 'customerID') or (i == 'tenure') or (i == 'MonthlyCharges') or (i =='TotalCharges'):
        pass
    else:
        print(i.upper());
        print(df[i].value_counts());

GENDER
Male      3555
Female    3488
Name: gender, dtype: int64
SENIORCITIZEN
0    5901
1    1142
Name: SeniorCitizen, dtype: int64
PARTNER
No     3641
Yes    3402
Name: Partner, dtype: int64
DEPENDENTS
No     4933
Yes    2110
Name: Dependents, dtype: int64
PHONESERVICE
Yes    6361
No      682
Name: PhoneService, dtype: int64
MULTIPLELINES
No                  3390
Yes                 2971
No phone service     682
Name: MultipleLines, dtype: int64
INTERNETSERVICE
Fiber optic    3096
DSL            2421
No             1526
Name: InternetService, dtype: int64
ONLINESECURITY
No                     3498
Yes                    2019
No internet service    1526
Name: OnlineSecurity, dtype: int64
ONLINEBACKUP
No                     3088
Yes                    2429
No internet service    1526
Name: OnlineBackup, dtype: int64
DEVICEPROTECTION
No                     3095
Yes                    2422
No internet service    1526
Name: DeviceProtection, dtype: int64
TECHSUPPORT
No                     3473
Yes                    2044
No internet service    1526
Name: TechSupport, dtype: int64
STREAMINGTV
No                     2810
Yes                    2707
No internet service    1526
Name: StreamingTV, dtype: int64
STREAMINGMOVIES
No                     2785
Yes                    2732
No internet service    1526
Name: StreamingMovies, dtype: int64
CONTRACT
Month-to-month    3875
Two year          1695
One year          1473
Name: Contract, dtype: int64
PAPERLESSBILLING
Yes    4171
No     2872
Name: PaperlessBilling, dtype: int64
PAYMENTMETHOD
Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: PaymentMethod, dtype: int64
CHURN
No     5174
Yes    1869
Name: Churn, dtype: int64

# Da die Information über Internetservice y/n gegeben ist, kann 'no internetservice' in anderen Spalten mit no ersetzt werden
for i in ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection','TechSupport','StreamingTV', 'StreamingMovies']:
    df[i] = df[i].replace('No internet service', 'No')

# dito für Phone Service
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')

df.TechSupport.value_counts()

No     4999
Yes    2044
Name: TechSupport, dtype: int64

df.head()

Visualisierungen¶

df.Churn.value_counts().sort_values(ascending=False).plot(kind='bar', color='green');

Das Dataset ist imbalanced, es gibt 2,7 Mal so viele no churn wie yes churn. Dies wird beim Modling entsprechend in Betracht gezogen.¶

df.PaymentMethod.value_counts().sort_values(ascending=False).plot(kind='bar');

sns.catplot(x="PaymentMethod", kind="count", palette="ch:.30", data=df);

#for i in df.columns:
    #if (i == 'customerID') or (i == 'tenure') or(i== 'MonthlyCharges') or (i=='TotalCharges'):
      #  pass
   # else:
      #  sns.catplot(x=i, kind="count", palette="ch:.25", data=df);

Visualisierung mit plotly¶

target= df['Churn'].value_counts()
levels = ['No','Yes']
trace = go.Pie(labels=target.index,values=target.values, marker=dict(colors=('orange','green')))
layout = dict(title="Telco Customer Churn Ratio", margin=dict(l=150), width=500, height=500)
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)
#print target class counts
print(target)

No     5174
Yes    1869
Name: Churn, dtype: int64

#Bezahlarten
target= df['PaymentMethod'].value_counts()
levels = ['Electronic check','Mailed check','Bank transfer','Credit card']
trace = go.Pie(labels=target.index,values=target.values)
layout = dict(title="Telco Customer Payment Method", margin=dict(l=50), width=800, height=500)
figdata = [trace]
fig = go.Figure(data=figdata, layout=layout)
iplot(fig)

# Visualisierungen mit Seaborn: KDE für numerische Spalten
def kdeplot(feature):
    plt.figure(figsize=(9, 4))
    plt.title("KDE for {}".format(feature))
    ax0 = sns.kdeplot(df[df['Churn'] == 'No'][feature].dropna(), color= 'navy', label= 'Churn: No')
    ax1 = sns.kdeplot(df[df['Churn'] == 'Yes'][feature].dropna(), color= 'orange', label= 'Churn: Yes')
    
kdeplot('tenure')
kdeplot('MonthlyCharges')
kdeplot('TotalCharges')

Visualsisierung mit ploty express¶

px.histogram(df, x='TotalCharges')

px.histogram(df, x='tenure', color='Contract')

px.histogram(df, x='tenure', color='Churn')

px.histogram(df, x='MonthlyCharges', color = 'InternetService' )

px.histogram(df, x='SeniorCitizen', color = 'PaperlessBilling', template= 'plotly_dark')

px.histogram(df, x='MonthlyCharges', color='Churn')

# Log der MonthlyCharges
df['MonthlyCharges_log'] = np.log(df.MonthlyCharges)

px.histogram(df, x='MonthlyCharges_log', color='Churn')

px.histogram(df, x='gender', color='Churn')

px.histogram(df, x='TechSupport', color='Churn')

px.histogram(df, x='Contract', color='Churn')

iplot([go.Scatter(x=[1, 2, 3], y=[3, 1, 6])])

df['tenure_bin'] = pd.cut(df['tenure'], bins = list(range(0, 72, 12))).astype(str)
df.loc[df['tenure_bin'] == 'nan', 'tenure_bin'] = '[60-72]'

df['tenure_bin'].value_counts().sort_index().plot.bar(color = 'b', edgecolor = 'k');

px.histogram(df, x='tenure_bin', color='Churn')

Bins erstellen für die Visualisierung mit plotly express¶

aver = df.groupby(['tenure_bin', 'Churn'])['MonthlyCharges'].mean().reset_index()
aver.tenure_bin = aver.tenure_bin.replace('(0.0, 12.0]', '0-12 months')
aver.tenure_bin = aver.tenure_bin.replace('(12.0, 24.0]', '12-24 months')
aver.tenure_bin = aver.tenure_bin.replace('(24.0, 36.0]', '24-36 months')
aver.tenure_bin = aver.tenure_bin.replace('(36.0, 48.0]', '36-48 months')
aver.tenure_bin = aver.tenure_bin.replace('(48.0, 60.0]', '48-60 months')
aver.tenure_bin = aver.tenure_bin.replace('[60-72]', '60-72 months')
aver

px.bar(aver, x='tenure_bin', y = 'MonthlyCharges', color='Churn', barmode="group")

Cluster Analyse wesentlicher Faktoren¶

from sklearn.cluster import KMeans

clu = df[['MonthlyCharges', 'tenure']][df.Churn == 'Yes']
kmeans = KMeans(n_clusters = 3, random_state = 0).fit(clu)

clu['label'] = kmeans.labels_
clu.head()

figsize(10, 10)
plt.scatter(clu['MonthlyCharges'], clu['tenure'], c=clu['label'], cmap='Accent')
plt.xlabel('Monthly Charges')
plt.ylabel('Tenure')
plt.title('3 Clusters of Churners')
plt.show()

Kategorische Variablen in numerische umwandeln¶

df_ = df.drop(['customerID', 'tenure_bin', 'Churn'], axis=1)

df_= pd.get_dummies(df_, prefix_sep='_', drop_first=True)

df_.shape

(7043, 24)

df_.head()

df_['Churn'] = df.Churn

df_['Churn'] = df_['Churn'].replace({'Yes': 1, 'No': 0})

df_.head()

Korrelation berechnen¶

df_.corr()['Churn'].sort_values()

tenure                                  -0.352229
Contract_Two year                       -0.302253
InternetService_No                      -0.227890
TotalCharges                            -0.198324
Contract_One year                       -0.177820
OnlineSecurity_Yes                      -0.171226
TechSupport_Yes                         -0.164674
Dependents_Yes                          -0.164221
Partner_Yes                             -0.150448
PaymentMethod_Credit card (automatic)   -0.134302
PaymentMethod_Mailed check              -0.091683
OnlineBackup_Yes                        -0.082255
DeviceProtection_Yes                    -0.066160
gender_Male                             -0.008612
PhoneService_Yes                         0.011942
MultipleLines_Yes                        0.040102
StreamingMovies_Yes                      0.061382
StreamingTV_Yes                          0.063228
SeniorCitizen                            0.150889
PaperlessBilling_Yes                     0.191825
MonthlyCharges                           0.193356
MonthlyCharges_log                       0.210326
PaymentMethod_Electronic check           0.301919
InternetService_Fiber optic              0.308020
Churn                                    1.000000
Name: Churn, dtype: float64

# Heatmap der Korrelationen im Datensatz
corrs = df_.corr()
plt.figure(figsize = (20, 20))
sns.heatmap(corrs, annot = True, vmin = -1, vmax = 1, fmt = '.3f', cmap='viridis');

Datensatz speichern¶

#df_.to_csv('df_clean.csv', sep=',', encoding= 'utf-8', index=False)

	SeniorCitizen	tenure	MonthlyCharges	TotalCharges
count	7043.000000	7043.000000	7043.000000	7043.000000
mean	0.162147	32.371149	64.761692	2279.734304
std	0.368612	24.559481	30.090047	2266.794470
min	0.000000	0.000000	18.250000	0.000000
25%	0.000000	9.000000	35.500000	398.550000
50%	0.000000	29.000000	70.350000	1394.550000
75%	0.000000	55.000000	89.850000	3786.600000
max	1.000000	72.000000	118.750000	8684.800000

	customerID	gender	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	7590-VHVEG	Female	Yes	No	1	No	No phone service	DSL	No	Yes	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	5575-GNVDE	Male	No	No	34	Yes	No	DSL	Yes	No	Yes	No	No	No	One year	No	Mailed check	56.95	1889.5	No
2	3668-QPYBK	Male	No	No	2	Yes	No	DSL	Yes	Yes	No	No	No	No	Month-to-month	Yes	Mailed check	53.85	108.15	Yes
3	7795-CFOCW	Male	No	No	45	No	No phone service	DSL	Yes	No	Yes	Yes	No	No	One year	No	Bank transfer (automatic)	42.30	1840.75	No
4	9237-HQITU	Female	No	No	2	Yes	No	Fiber optic	No	No	No	No	No	No	Month-to-month	Yes	Electronic check	70.70	151.65	Yes

	customerID	gender	Partner	Dependents	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	Churn
488	4472-LVYGI	Female	Yes	Yes	No	No phone service	DSL	Yes	No	Yes	Yes	Yes	No	Two year	Yes	Bank transfer (automatic)	52.55	No
753	3115-CZMZD	Male	No	Yes	Yes	No	No	No internet service	No internet service	No internet service	No internet service	No internet service	No internet service	Two year	No	Mailed check	20.25	No
936	5709-LVOEQ	Female	Yes	Yes	Yes	No	DSL	Yes	Yes	Yes	No	Yes	Yes	Two year	No	Mailed check	80.85	No
1082	4367-NUYAO	Male	Yes	Yes	Yes	Yes	No	No internet service	No internet service	No internet service	No internet service	No internet service	No internet service	Two year	No	Mailed check	25.75	No
1340	1371-DWPAZ	Female	Yes	Yes	No	No phone service	DSL	Yes	Yes	Yes	Yes	Yes	No	Two year	No	Credit card (automatic)	56.05	No
3331	7644-OMVMY	Male	Yes	Yes	Yes	No	No	No internet service	No internet service	No internet service	No internet service	No internet service	No internet service	Two year	No	Mailed check	19.85	No
3826	3213-VVOLG	Male	Yes	Yes	Yes	Yes	No	No internet service	No internet service	No internet service	No internet service	No internet service	No internet service	Two year	No	Mailed check	25.35	No
4380	2520-SGTTA	Female	Yes	Yes	Yes	No	No	No internet service	No internet service	No internet service	No internet service	No internet service	No internet service	Two year	No	Mailed check	20.00	No
5218	2923-ARZLG	Male	Yes	Yes	Yes	No	No	No internet service	No internet service	No internet service	No internet service	No internet service	No internet service	One year	Yes	Mailed check	19.70	No
6670	4075-WKNIU	Female	Yes	Yes	Yes	Yes	DSL	No	Yes	Yes	Yes	Yes	No	Two year	No	Mailed check	73.35	No
6754	2775-SEFEE	Male	No	Yes	Yes	Yes	DSL	Yes	Yes	No	Yes	No	No	Two year	Yes	Bank transfer (automatic)	61.90	No

	tenure_bin	Churn	MonthlyCharges
0	0-12 months	No	46.766169
1	0-12 months	Yes	66.493973
2	12-24 months	No	54.449589
3	12-24 months	Yes	78.509014
4	24-36 months	No	60.415414
5	24-36 months	Yes	84.266389
6	36-48 months	No	61.977229
7	36-48 months	Yes	84.790000
8	48-60 months	No	67.579143
9	48-60 months	Yes	88.182500
10	60-72 months	No	74.166415
11	60-72 months	Yes	97.317742

	tenure	MonthlyCharges	TotalCharges	MonthlyCharges_log	gender_Male	Partner_Yes	PhoneService_Yes	InternetService_Fiber optic	OnlineSecurity_Yes	OnlineBackup_Yes	DeviceProtection_Yes	TechSupport_Yes	Contract_One year	PaperlessBilling_Yes	PaymentMethod_Electronic check	PaymentMethod_Mailed check
0	1	29.85	29.85	3.396185	0	1	0	0	0	1	0	0	0	1	1	0
1	34	56.95	1889.50	4.042174	1	0	1	0	1	0	1	0	1	0	0	1
2	2	53.85	108.15	3.986202	1	0	1	0	1	1	0	0	0	1	0	1
3	45	42.30	1840.75	3.744787	1	0	0	0	1	0	1	1	1	0	0	0
4	2	70.70	151.65	4.258446	0	0	1	1	0	0	0	0	0	1	1	0

	MonthlyCharges	tenure	label
2	53.85	2	0
4	70.70	2	2
5	99.65	8	2
8	104.80	28	1
13	103.70	49	1