Monday, May 8, 2017

Python Case Study - Predicting Telecom Customer Churn using Logistic Regression, Support Vector Machine & Random Forest


In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
telcochurn=pd.read_csv("PATH to the File with .csv file extension")
In [4]:
telcochurn.head()
Out[4]:
Account Length
International Plan
Voice Mail Plan
Num of Voice mail Messages
Total Day Minutes
Total Day Calls
Total day Charge
Total Eve Minutes
Total Eve Calls
Total Eve Charge
Total Night Minutes
Total Night Calls
Total Night Charge
Total International Minutes
Total Intl Calls
Total Intl Charge
Number Customer Service calls
Churn
0
128
0
1
25
265.1
110
45.07
197.4
99
16.78
244.7
91
11.01
10.0
3
2.70
1
0
1
107
0
1
26
161.6
123
27.47
195.5
103
16.62
254.4
103
11.45
13.7
3
3.70
1
0
2
137
0
0
0
243.4
114
41.38
121.2
110
10.30
162.6
104
7.32
12.2
5
3.29
0
0
3
84
1
0
0
299.4
71
50.90
61.9
88
5.26
196.9
89
8.86
6.6
7
1.78
2
0
4
75
1
0
0
166.7
113
28.34
148.3
122
12.61
186.9
121
8.41
10.1
3
2.73
3
0
In [5]:
telcochurn.columns
Out[5]:
Index(['Account Length', 'International Plan', 'Voice Mail Plan',
       'Num of Voice mail Messages', 'Total Day Minutes', 'Total Day Calls',
       'Total day Charge', 'Total Eve Minutes', 'Total Eve Calls',
       'Total Eve Charge', 'Total Night Minutes', 'Total Night Calls ',
       'Total Night Charge', 'Total International Minutes',
       'Total Intl  Calls', 'Total Intl Charge',
       'Number Customer Service calls ', 'Churn'],
      dtype='object')
In [13]:
feature_cols=telcochurn[['Account Length', 'International Plan', 'Voice Mail Plan',
       'Num of Voice mail Messages', 'Total Day Minutes', 'Total Day Calls',
       'Total day Charge', 'Total Eve Minutes', 'Total Eve Calls',
       'Total Eve Charge', 'Total Night Minutes', 'Total Night Calls ',
       'Total Night Charge', 'Total International Minutes',
       'Total Intl  Calls', 'Total Intl Charge',
       'Number Customer Service calls ']]
In [15]:
from sklearn.linear_model import LogisticRegression
In [16]:
logreg = LogisticRegression()
In [17]:
logreg.fit(feature_cols,telcochurn.Churn)
Out[17]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
In [18]:
logreg.coef_
Out[18]:
array([[  5.68650239e-04,   2.01019309e+00,  -1.76778749e+00,
          2.38957929e-02,   1.87588040e-02,  -7.72983903e-04,
         -3.71064682e-02,   5.35826051e-03,  -4.93385777e-03,
          4.24187392e-03,   3.19262545e-03,  -4.74561073e-03,
         -7.37187223e-03,   6.27536279e-02,  -8.03434746e-02,
          1.62911927e-02,   4.85563396e-01]])
In [19]:
logreg.intercept_
Out[19]:
array([-6.26440601])
In [20]:
telcochurn["predicted_class"]=logreg.predict(feature_cols)
In [21]:
telcochurn = pd.concat( [telcochurn, pd.DataFrame( logreg.predict_proba(feature_cols ) )], axis = 1) 
In [40]:
telcochurn.head()
Out[40]:
Account Length
International Plan
Voice Mail Plan
Num of Voice mail Messages
Total Day Minutes
Total Day Calls
Total day Charge
Total Eve Minutes
Total Eve Calls
Total Eve Charge
...
Total Night Calls
Total Night Charge
Total International Minutes
Total Intl Calls
Total Intl Charge
Number Customer Service calls
Churn
predicted_class
0
1
0
128
0
1
25
265.1
110
45.07
197.4
99
16.78
...
91
11.01
10.0
3
2.70
1
0
0
0.910662
0.089338
1
107
0
1
26
161.6
123
27.47
195.5
103
16.62
...
103
11.45
13.7
3
3.70
1
0
0
0.968313
0.031687
2
137
0
0
0
243.4
114
41.38
121.2
110
10.30
...
104
7.32
12.2
5
3.29
0
0
0
0.937304
0.062696
3
84
1
0
0
299.4
71
50.90
61.9
88
5.26
...
89
8.86
6.6
7
1.78
2
0
1
0.406369
0.593631
4
75
1
0
0
166.7
113
28.34
148.3
122
12.61
...
121
8.41
10.1
3
2.73
3
0
0
0.530660
0.469340
5 rows × 21 columns
In [23]:
from sklearn import metrics
In [24]:
cm=metrics.confusion_matrix(telcochurn.Churn,telcochurn.predicted_class)
In [25]:
cm
Out[25]:
array([[4201,   92],
       [ 583,  124]])
In [26]:
import seaborn as sn
In [28]:
sn.heatmap(cm,annot=True,fmt='.2f');


In [29]:
score=metrics.accuracy_score(telcochurn.Churn,telcochurn.predicted_class)
In [30]:
score
Out[30]:
0.86499999999999999
In [32]:
from sklearn.cross_validation import cross_val_score
C:\Users\rajeshdgr8\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
In [33]:
scores=cross_val_score(logreg,feature_cols,telcochurn.Churn,cv=10,scoring='accuracy')
In [34]:
scores
Out[34]:
array([ 0.86227545,  0.87225549,  0.85229541,  0.862     ,  0.864     ,
        0.868     ,  0.856     ,  0.86172345,  0.85971944,  0.8757515 ])
In [35]:
scores.mean()
Out[35]:
0.86340207360829435
In [37]:
print(metrics.classification_report(telcochurn.Churn,telcochurn.predicted_class))
             precision    recall  f1-score   support

          0       0.88      0.98      0.93      4293
          1       0.57      0.18      0.27       707

avg / total       0.84      0.86      0.83      5000

In [38]:
auc_score = metrics.roc_auc_score(telcochurn.Churn,telcochurn.predicted_class)
In [39]:
auc_score
Out[39]:
0.57697936610073108
In [41]:
import statsmodels.api as sm
In [42]:
In [50]:
logit=sm.Logit(telcochurn.Churn,feature_cols)
In [56]:
result=logit.fit()
Optimization terminated successfully.
         Current function value: 0.338345
         Iterations 7
In [59]:
result.summary()
Out[59]:
Logit Regression Results
Dep. Variable:
Churn
No. Observations:
5000
Model:
Logit
Df Residuals:
4983
Method:
MLE
Df Model:
16
Date:
Mon, 08 May 2017
Pseudo R-squ.:
0.1697
Time:
19:17:19
Log-Likelihood:
-1691.7
converged:
True
LL-Null:
-2037.5
LLR p-value:
8.251e-137

coef
std err
z
P>|z|
[0.025
0.975]
Account Length
-0.0019
0.001
-1.762
0.078
-0.004
0.000
International Plan
2.0031
0.118
16.928
0.000
1.771
2.235
Voice Mail Plan
-2.0193
0.463
-4.364
0.000
-2.926
-1.112
Num of Voice mail Messages
0.0310
0.014
2.145
0.032
0.003
0.059
Total Day Minutes
4.0780
2.604
1.566
0.117
-1.025
9.181
Total Day Calls
-0.0111
0.002
-5.468
0.000
-0.015
-0.007
Total day Charge
-23.9328
15.317
-1.563
0.118
-53.953
6.088
Total Eve Minutes
-0.0236
1.304
-0.018
0.986
-2.579
2.532
Total Eve Calls
-0.0154
0.002
-7.504
0.000
-0.019
-0.011
Total Eve Charge
0.2991
15.341
0.019
0.984
-29.768
30.367
Total Night Minutes
0.1664
0.696
0.239
0.811
-1.197
1.530
Total Night Calls
-0.0147
0.002
-7.127
0.000
-0.019
-0.011
Total Night Charge
-3.7082
15.459
-0.240
0.810
-34.007
26.591
Total International Minutes
0.4588
4.214
0.109
0.913
-7.801
8.719
Total Intl Calls
-0.1106
0.020
-5.520
0.000
-0.150
-0.071
Total Intl Charge
-1.6785
15.609
-0.108
0.914
-32.271
28.914
Number Customer Service calls
0.4315
0.031
13.821
0.000
0.370
0.493
In [60]:
from sklearn import svm
In [64]:
SVMmodel = svm.SVC() 
In [65]:
SVMmodel.fit(feature_cols,telcochurn.Churn)
Out[65]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [67]:
telcochurn["SVMpredictedclass"]=SVMmodel.predict(feature_cols)
In [68]:
 
Out[68]:
Account Length
International Plan
Voice Mail Plan
Num of Voice mail Messages
Total Day Minutes
Total Day Calls
Total day Charge
Total Eve Minutes
Total Eve Calls
Total Eve Charge
...
Total Night Charge
Total International Minutes
Total Intl Calls
Total Intl Charge
Number Customer Service calls
Churn
predicted_class
0
1
SVMpredictedclass
0
128
0
1
25
265.1
110
45.07
197.4
99
16.78
...
11.01
10.0
3
2.70
1
0
0
0.910662
0.089338
0
1
107
0
1
26
161.6
123
27.47
195.5
103
16.62
...
11.45
13.7
3
3.70
1
0
0
0.968313
0.031687
0
2
137
0
0
0
243.4
114
41.38
121.2
110
10.30
...
7.32
12.2
5
3.29
0
0
0
0.937304
0.062696
0
3
84
1
0
0
299.4
71
50.90
61.9
88
5.26
...
8.86
6.6
7
1.78
2
0
1
0.406369
0.593631
0
4
75
1
0
0
166.7
113
28.34
148.3
122
12.61
...
8.41
10.1
3
2.73
3
0
0
0.530660
0.469340
0
5 rows × 22 columns
In [69]:
svmcm=metrics.confusion_matrix(telcochurn.Churn,telcochurn.SVMpredictedclass)
In [70]:
svmcm
Out[70]:
array([[4293,    0],
       [   0,  707]])
In [71]:
sn.heatmap(svmcm,annot=True,fmt='.2f');
Description: 
In [72]:
svmscore=metrics.accuracy_score(telcochurn.Churn,telcochurn.SVMpredictedclass)
In [73]:
svmscore
Out[73]:
1.0
In [74]:
svmscores=cross_val_score(SVMmodel,feature_cols,telcochurn.Churn,cv=10,scoring='accuracy')
In [75]:
svmscores
Out[75]:
array([ 0.85828343,  0.85828343,  0.85828343,  0.858     ,  0.858     ,
        0.858     ,  0.858     ,  0.85971944,  0.85971944,  0.85971944])
In [76]:
svmscores.mean()
Out[76]:
0.8586008616034464
In [77]:
print(metrics.classification_report(telcochurn.Churn,telcochurn.SVMpredictedclass))
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      4293
          1       1.00      1.00      1.00       707

avg / total       1.00      1.00      1.00      5000

In [78]:
auc_score = metrics.roc_auc_score(telcochurn.Churn,telcochurn.SVMpredictedclass)
In [79]:
auc_score
Out[79]:
1.0
In [83]:
from sklearn.ensemble import RandomForestClassifier
In [84]:
rf = RandomForestClassifier(n_estimators=100)
In [85]:
rf.fit(feature_cols,telcochurn.Churn)
Out[85]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [86]:
telcochurn["Randfpredictedclass"]=rf.predict(feature_cols)
In [87]:
telcochurn.head()
Out[87]:
Account Length
International Plan
Voice Mail Plan
Num of Voice mail Messages
Total Day Minutes
Total Day Calls
Total day Charge
Total Eve Minutes
Total Eve Calls
Total Eve Charge
...
Total International Minutes
Total Intl Calls
Total Intl Charge
Number Customer Service calls
Churn
predicted_class
0
1
SVMpredictedclass
Randfpredictedclass
0
128
0
1
25
265.1
110
45.07
197.4
99
16.78
...
10.0
3
2.70
1
0
0
0.910662
0.089338
0
0
1
107
0
1
26
161.6
123
27.47
195.5
103
16.62
...
13.7
3
3.70
1
0
0
0.968313
0.031687
0
0
2
137
0
0
0
243.4
114
41.38
121.2
110
10.30
...
12.2
5
3.29
0
0
0
0.937304
0.062696
0
0
3
84
1
0
0
299.4
71
50.90
61.9
88
5.26
...
6.6
7
1.78
2
0
1
0.406369
0.593631
0
0
4
75
1
0
0
166.7
113
28.34
148.3
122
12.61
...
10.1
3
2.73
3
0
0
0.530660
0.469340
0
0
5 rows × 23 columns
In [88]:
RFcm=metrics.confusion_matrix(telcochurn.Churn,telcochurn.Randfpredictedclass)
In [89]:
RFcm
Out[89]:
array([[4293,    0],
       [   0,  707]])
In [ ]:
 

No comments:

Post a Comment