In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
telcochurn=pd.read_csv("PATH to the File with .csv file extension")
In [4]:
telcochurn.head()
Out[4]:
Account Length
|
International Plan
|
Voice Mail Plan
|
Num of Voice mail Messages
|
Total Day Minutes
|
Total Day Calls
|
Total day Charge
|
Total Eve Minutes
|
Total Eve Calls
|
Total Eve Charge
|
Total Night Minutes
|
Total Night Calls
|
Total Night Charge
|
Total International Minutes
|
Total Intl Calls
|
Total Intl Charge
|
Number Customer Service calls
|
Churn
|
|
0
|
128
|
0
|
1
|
25
|
265.1
|
110
|
45.07
|
197.4
|
99
|
16.78
|
244.7
|
91
|
11.01
|
10.0
|
3
|
2.70
|
1
|
0
|
1
|
107
|
0
|
1
|
26
|
161.6
|
123
|
27.47
|
195.5
|
103
|
16.62
|
254.4
|
103
|
11.45
|
13.7
|
3
|
3.70
|
1
|
0
|
2
|
137
|
0
|
0
|
0
|
243.4
|
114
|
41.38
|
121.2
|
110
|
10.30
|
162.6
|
104
|
7.32
|
12.2
|
5
|
3.29
|
0
|
0
|
3
|
84
|
1
|
0
|
0
|
299.4
|
71
|
50.90
|
61.9
|
88
|
5.26
|
196.9
|
89
|
8.86
|
6.6
|
7
|
1.78
|
2
|
0
|
4
|
75
|
1
|
0
|
0
|
166.7
|
113
|
28.34
|
148.3
|
122
|
12.61
|
186.9
|
121
|
8.41
|
10.1
|
3
|
2.73
|
3
|
0
|
In [5]:
telcochurn.columns
Out[5]:
Index(['Account Length', 'International
Plan', 'Voice Mail Plan',
'Num of Voice mail Messages', 'Total Day Minutes', 'Total Day Calls',
'Total day Charge', 'Total Eve Minutes', 'Total Eve Calls',
'Total Eve Charge', 'Total Night Minutes', 'Total Night Calls ',
'Total Night Charge', 'Total International Minutes',
'Total Intl Calls', 'Total Intl
Charge',
'Number Customer Service calls ', 'Churn'],
dtype='object')
In [13]:
feature_cols=telcochurn[['Account Length', 'International Plan', 'Voice Mail Plan',
'Num of Voice mail Messages', 'Total Day Minutes', 'Total Day Calls',
'Total day Charge', 'Total Eve Minutes', 'Total Eve Calls',
'Total Eve Charge', 'Total Night Minutes', 'Total Night Calls ',
'Total Night Charge', 'Total International Minutes',
'Total Intl Calls', 'Total Intl Charge',
'Number Customer Service calls ']]
In [15]:
from sklearn.linear_model import LogisticRegression
In [16]:
logreg = LogisticRegression()
In [17]:
logreg.fit(feature_cols,telcochurn.Churn)
Out[17]:
LogisticRegression(C=1.0,
class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)
In [18]:
logreg.coef_
Out[18]:
array([[
5.68650239e-04,
2.01019309e+00, -1.76778749e+00,
2.38957929e-02,
1.87588040e-02, -7.72983903e-04,
-3.71064682e-02,
5.35826051e-03, -4.93385777e-03,
4.24187392e-03,
3.19262545e-03, -4.74561073e-03,
-7.37187223e-03,
6.27536279e-02, -8.03434746e-02,
1.62911927e-02,
4.85563396e-01]])
In [19]:
logreg.intercept_
Out[19]:
array([-6.26440601])
In [20]:
telcochurn["predicted_class"]=logreg.predict(feature_cols)
In [21]:
telcochurn = pd.concat( [telcochurn, pd.DataFrame( logreg.predict_proba(feature_cols ) )], axis = 1)
In [40]:
telcochurn.head()
Out[40]:
Account
Length
|
International
Plan
|
Voice
Mail Plan
|
Num
of Voice mail Messages
|
Total
Day Minutes
|
Total
Day Calls
|
Total
day Charge
|
Total
Eve Minutes
|
Total
Eve Calls
|
Total
Eve Charge
|
...
|
Total
Night Calls
|
Total
Night Charge
|
Total
International Minutes
|
Total
Intl Calls
|
Total
Intl Charge
|
Number
Customer Service calls
|
Churn
|
predicted_class
|
0
|
1
|
|
0
|
128
|
0
|
1
|
25
|
265.1
|
110
|
45.07
|
197.4
|
99
|
16.78
|
...
|
91
|
11.01
|
10.0
|
3
|
2.70
|
1
|
0
|
0
|
0.910662
|
0.089338
|
1
|
107
|
0
|
1
|
26
|
161.6
|
123
|
27.47
|
195.5
|
103
|
16.62
|
...
|
103
|
11.45
|
13.7
|
3
|
3.70
|
1
|
0
|
0
|
0.968313
|
0.031687
|
2
|
137
|
0
|
0
|
0
|
243.4
|
114
|
41.38
|
121.2
|
110
|
10.30
|
...
|
104
|
7.32
|
12.2
|
5
|
3.29
|
0
|
0
|
0
|
0.937304
|
0.062696
|
3
|
84
|
1
|
0
|
0
|
299.4
|
71
|
50.90
|
61.9
|
88
|
5.26
|
...
|
89
|
8.86
|
6.6
|
7
|
1.78
|
2
|
0
|
1
|
0.406369
|
0.593631
|
4
|
75
|
1
|
0
|
0
|
166.7
|
113
|
28.34
|
148.3
|
122
|
12.61
|
...
|
121
|
8.41
|
10.1
|
3
|
2.73
|
3
|
0
|
0
|
0.530660
|
0.469340
|
5 rows × 21 columns
In [23]:
from sklearn import metrics
In [24]:
cm=metrics.confusion_matrix(telcochurn.Churn,telcochurn.predicted_class)
In [25]:
cm
Out[25]:
array([[4201, 92],
[ 583, 124]])
In [26]:
import seaborn as sn
In [28]:
sn.heatmap(cm,annot=True,fmt='.2f');
In [29]:
score=metrics.accuracy_score(telcochurn.Churn,telcochurn.predicted_class)
In [30]:
score
Out[30]:
0.86499999999999999
In [32]:
from sklearn.cross_validation import cross_val_score
C:\Users\rajeshdgr8\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44:
DeprecationWarning: This module was deprecated in version 0.18 in favor of the
model_selection module into which all the refactored classes and functions are
moved. Also note that the interface of the new CV iterators are different from
that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)
In [33]:
scores=cross_val_score(logreg,feature_cols,telcochurn.Churn,cv=10,scoring='accuracy')
In [34]:
scores
Out[34]:
array([ 0.86227545, 0.87225549,
0.85229541, 0.862 ,
0.864 ,
0.868 , 0.856
, 0.86172345, 0.85971944,
0.8757515 ])
In [35]:
scores.mean()
Out[35]:
0.86340207360829435
In [37]:
print(metrics.classification_report(telcochurn.Churn,telcochurn.predicted_class))
precision recall
f1-score support
0 0.88 0.98
0.93 4293
1 0.57 0.18
0.27 707
avg / total 0.84
0.86 0.83 5000
In [38]:
auc_score = metrics.roc_auc_score(telcochurn.Churn,telcochurn.predicted_class)
In [39]:
auc_score
Out[39]:
0.57697936610073108
In [41]:
import statsmodels.api as sm
In [42]:
In [50]:
logit=sm.Logit(telcochurn.Churn,feature_cols)
In [56]:
result=logit.fit()
Optimization terminated successfully.
Current function value: 0.338345
Iterations 7
In [59]:
result.summary()
Out[59]:
Logit Regression
Results
|
|||
Dep. Variable:
|
Churn
|
No. Observations:
|
5000
|
Model:
|
Logit
|
Df Residuals:
|
4983
|
Method:
|
MLE
|
Df Model:
|
16
|
Date:
|
Mon, 08 May 2017
|
Pseudo R-squ.:
|
0.1697
|
Time:
|
19:17:19
|
Log-Likelihood:
|
-1691.7
|
converged:
|
True
|
LL-Null:
|
-2037.5
|
LLR p-value:
|
8.251e-137
|
||
coef
|
std err
|
z
|
P>|z|
|
[0.025
|
0.975]
|
|
Account Length
|
-0.0019
|
0.001
|
-1.762
|
0.078
|
-0.004
|
0.000
|
International Plan
|
2.0031
|
0.118
|
16.928
|
0.000
|
1.771
|
2.235
|
Voice Mail Plan
|
-2.0193
|
0.463
|
-4.364
|
0.000
|
-2.926
|
-1.112
|
Num of Voice mail
Messages
|
0.0310
|
0.014
|
2.145
|
0.032
|
0.003
|
0.059
|
Total Day Minutes
|
4.0780
|
2.604
|
1.566
|
0.117
|
-1.025
|
9.181
|
Total Day Calls
|
-0.0111
|
0.002
|
-5.468
|
0.000
|
-0.015
|
-0.007
|
Total day Charge
|
-23.9328
|
15.317
|
-1.563
|
0.118
|
-53.953
|
6.088
|
Total Eve Minutes
|
-0.0236
|
1.304
|
-0.018
|
0.986
|
-2.579
|
2.532
|
Total Eve Calls
|
-0.0154
|
0.002
|
-7.504
|
0.000
|
-0.019
|
-0.011
|
Total Eve Charge
|
0.2991
|
15.341
|
0.019
|
0.984
|
-29.768
|
30.367
|
Total Night Minutes
|
0.1664
|
0.696
|
0.239
|
0.811
|
-1.197
|
1.530
|
Total Night Calls
|
-0.0147
|
0.002
|
-7.127
|
0.000
|
-0.019
|
-0.011
|
Total Night Charge
|
-3.7082
|
15.459
|
-0.240
|
0.810
|
-34.007
|
26.591
|
Total International
Minutes
|
0.4588
|
4.214
|
0.109
|
0.913
|
-7.801
|
8.719
|
Total Intl Calls
|
-0.1106
|
0.020
|
-5.520
|
0.000
|
-0.150
|
-0.071
|
Total Intl Charge
|
-1.6785
|
15.609
|
-0.108
|
0.914
|
-32.271
|
28.914
|
Number Customer
Service calls
|
0.4315
|
0.031
|
13.821
|
0.000
|
0.370
|
0.493
|
In [60]:
from sklearn import svm
In [64]:
SVMmodel = svm.SVC()
In [65]:
SVMmodel.fit(feature_cols,telcochurn.Churn)
Out[65]:
SVC(C=1.0, cache_size=200,
class_weight=None, coef0=0.0,
decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
In [67]:
telcochurn["SVMpredictedclass"]=SVMmodel.predict(feature_cols)
In [68]:
Out[68]:
Account
Length
|
International
Plan
|
Voice
Mail Plan
|
Num
of Voice mail Messages
|
Total
Day Minutes
|
Total
Day Calls
|
Total
day Charge
|
Total
Eve Minutes
|
Total
Eve Calls
|
Total
Eve Charge
|
...
|
Total
Night Charge
|
Total
International Minutes
|
Total
Intl Calls
|
Total
Intl Charge
|
Number
Customer Service calls
|
Churn
|
predicted_class
|
0
|
1
|
SVMpredictedclass
|
|
0
|
128
|
0
|
1
|
25
|
265.1
|
110
|
45.07
|
197.4
|
99
|
16.78
|
...
|
11.01
|
10.0
|
3
|
2.70
|
1
|
0
|
0
|
0.910662
|
0.089338
|
0
|
1
|
107
|
0
|
1
|
26
|
161.6
|
123
|
27.47
|
195.5
|
103
|
16.62
|
...
|
11.45
|
13.7
|
3
|
3.70
|
1
|
0
|
0
|
0.968313
|
0.031687
|
0
|
2
|
137
|
0
|
0
|
0
|
243.4
|
114
|
41.38
|
121.2
|
110
|
10.30
|
...
|
7.32
|
12.2
|
5
|
3.29
|
0
|
0
|
0
|
0.937304
|
0.062696
|
0
|
3
|
84
|
1
|
0
|
0
|
299.4
|
71
|
50.90
|
61.9
|
88
|
5.26
|
...
|
8.86
|
6.6
|
7
|
1.78
|
2
|
0
|
1
|
0.406369
|
0.593631
|
0
|
4
|
75
|
1
|
0
|
0
|
166.7
|
113
|
28.34
|
148.3
|
122
|
12.61
|
...
|
8.41
|
10.1
|
3
|
2.73
|
3
|
0
|
0
|
0.530660
|
0.469340
|
0
|
5 rows × 22 columns
In [69]:
svmcm=metrics.confusion_matrix(telcochurn.Churn,telcochurn.SVMpredictedclass)
In [70]:
svmcm
Out[70]:
array([[4293, 0],
[ 0, 707]])
In [71]:
sn.heatmap(svmcm,annot=True,fmt='.2f');
In [72]:
svmscore=metrics.accuracy_score(telcochurn.Churn,telcochurn.SVMpredictedclass)
In [73]:
svmscore
Out[73]:
1.0
In [74]:
svmscores=cross_val_score(SVMmodel,feature_cols,telcochurn.Churn,cv=10,scoring='accuracy')
In [75]:
svmscores
Out[75]:
array([ 0.85828343, 0.85828343,
0.85828343, 0.858 ,
0.858 ,
0.858 , 0.858
, 0.85971944, 0.85971944,
0.85971944])
In [76]:
svmscores.mean()
Out[76]:
0.8586008616034464
In [77]:
print(metrics.classification_report(telcochurn.Churn,telcochurn.SVMpredictedclass))
precision recall
f1-score support
0 1.00 1.00
1.00 4293
1 1.00 1.00
1.00 707
avg / total 1.00
1.00 1.00 5000
In [78]:
auc_score = metrics.roc_auc_score(telcochurn.Churn,telcochurn.SVMpredictedclass)
In [79]:
auc_score
Out[79]:
1.0
In [83]:
from sklearn.ensemble import RandomForestClassifier
In [84]:
rf = RandomForestClassifier(n_estimators=100)
In [85]:
rf.fit(feature_cols,telcochurn.Churn)
Out[85]:
RandomForestClassifier(bootstrap=True,
class_weight=None, criterion='gini',
max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_split=1e-07,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=1,
oob_score=False, random_state=None,
verbose=0, warm_start=False)
In [86]:
telcochurn["Randfpredictedclass"]=rf.predict(feature_cols)
In [87]:
telcochurn.head()
Out[87]:
Account
Length
|
International
Plan
|
Voice
Mail Plan
|
Num
of Voice mail Messages
|
Total
Day Minutes
|
Total
Day Calls
|
Total
day Charge
|
Total
Eve Minutes
|
Total
Eve Calls
|
Total
Eve Charge
|
...
|
Total
International Minutes
|
Total
Intl Calls
|
Total
Intl Charge
|
Number
Customer Service calls
|
Churn
|
predicted_class
|
0
|
1
|
SVMpredictedclass
|
Randfpredictedclass
|
|
0
|
128
|
0
|
1
|
25
|
265.1
|
110
|
45.07
|
197.4
|
99
|
16.78
|
...
|
10.0
|
3
|
2.70
|
1
|
0
|
0
|
0.910662
|
0.089338
|
0
|
0
|
1
|
107
|
0
|
1
|
26
|
161.6
|
123
|
27.47
|
195.5
|
103
|
16.62
|
...
|
13.7
|
3
|
3.70
|
1
|
0
|
0
|
0.968313
|
0.031687
|
0
|
0
|
2
|
137
|
0
|
0
|
0
|
243.4
|
114
|
41.38
|
121.2
|
110
|
10.30
|
...
|
12.2
|
5
|
3.29
|
0
|
0
|
0
|
0.937304
|
0.062696
|
0
|
0
|
3
|
84
|
1
|
0
|
0
|
299.4
|
71
|
50.90
|
61.9
|
88
|
5.26
|
...
|
6.6
|
7
|
1.78
|
2
|
0
|
1
|
0.406369
|
0.593631
|
0
|
0
|
4
|
75
|
1
|
0
|
0
|
166.7
|
113
|
28.34
|
148.3
|
122
|
12.61
|
...
|
10.1
|
3
|
2.73
|
3
|
0
|
0
|
0.530660
|
0.469340
|
0
|
0
|
5 rows × 23 columns
In [88]:
RFcm=metrics.confusion_matrix(telcochurn.Churn,telcochurn.Randfpredictedclass)
In [89]:
RFcm
Out[89]:
array([[4293, 0],
[ 0, 707]])
In [ ]: