DataAdvanceR Labs: 2017

In [ ]:

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:

telcochurn=pd.read_csv("PATH to the File with .csv file extension")

In [4]:

telcochurn.head()

Out[4]:

	Account Length	International Plan	Voice Mail Plan	Num of Voice mail Messages	Total Day Minutes	Total Day Calls	Total day Charge	Total Eve Minutes	Total Eve Calls	Total Eve Charge	Total Night Minutes	Total Night Calls	Total Night Charge	Total International Minutes	Total Intl Calls	Total Intl Charge	Number Customer Service calls	Churn
0	128	0	1	25	265.1	110	45.07	197.4	99	16.78	244.7	91	11.01	10.0	3	2.70	1	0
1	107	0	1	26	161.6	123	27.47	195.5	103	16.62	254.4	103	11.45	13.7	3	3.70	1	0
2	137	0	0	0	243.4	114	41.38	121.2	110	10.30	162.6	104	7.32	12.2	5	3.29	0	0
3	84	1	0	0	299.4	71	50.90	61.9	88	5.26	196.9	89	8.86	6.6	7	1.78	2	0
4	75	1	0	0	166.7	113	28.34	148.3	122	12.61	186.9	121	8.41	10.1	3	2.73	3	0

In [5]:

telcochurn.columns

Out[5]:

Index(['Account Length', 'International Plan', 'Voice Mail Plan',

'Num of Voice mail Messages', 'Total Day Minutes', 'Total Day Calls',

'Total day Charge', 'Total Eve Minutes', 'Total Eve Calls',

'Total Eve Charge', 'Total Night Minutes', 'Total Night Calls ',

'Total Night Charge', 'Total International Minutes',

'Total Intl Calls', 'Total Intl Charge',

'Number Customer Service calls ', 'Churn'],

dtype='object')

In [13]:

feature_cols=telcochurn[['Account Length', 'International Plan', 'Voice Mail Plan',

       'Num of Voice mail Messages', 'Total Day Minutes', 'Total Day Calls',

       'Total day Charge', 'Total Eve Minutes', 'Total Eve Calls',

       'Total Eve Charge', 'Total Night Minutes', 'Total Night Calls ',

       'Total Night Charge', 'Total International Minutes',

       'Total Intl  Calls', 'Total Intl Charge',

       'Number Customer Service calls ']]

In [15]:

from sklearn.linear_model import LogisticRegression

In [16]:

logreg = LogisticRegression()

In [17]:

logreg.fit(feature_cols,telcochurn.Churn)

Out[17]:

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,

intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,

penalty='l2', random_state=None, solver='liblinear', tol=0.0001,

verbose=0, warm_start=False)

In [18]:

logreg.coef_

Out[18]:

array([[ 5.68650239e-04, 2.01019309e+00, -1.76778749e+00,

2.38957929e-02, 1.87588040e-02, -7.72983903e-04,

-3.71064682e-02, 5.35826051e-03, -4.93385777e-03,

4.24187392e-03, 3.19262545e-03, -4.74561073e-03,

-7.37187223e-03, 6.27536279e-02, -8.03434746e-02,

1.62911927e-02, 4.85563396e-01]])

In [19]:

logreg.intercept_

Out[19]:

array([-6.26440601])

In [20]:

telcochurn["predicted_class"]=logreg.predict(feature_cols)

In [21]:

telcochurn = pd.concat( [telcochurn, pd.DataFrame( logreg.predict_proba(feature_cols ) )], axis = 1)

In [40]:

telcochurn.head()

Out[40]:

	Account Length	International Plan	Voice Mail Plan	Num of Voice mail Messages	Total Day Minutes	Total Day Calls	Total day Charge	Total Eve Minutes	Total Eve Calls	Total Eve Charge	...	Total Night Calls	Total Night Charge	Total International Minutes	Total Intl Calls	Total Intl Charge	Number Customer Service calls	Churn	predicted_class	0	1
0	128	0	1	25	265.1	110	45.07	197.4	99	16.78	...	91	11.01	10.0	3	2.70	1	0	0	0.910662	0.089338
1	107	0	1	26	161.6	123	27.47	195.5	103	16.62	...	103	11.45	13.7	3	3.70	1	0	0	0.968313	0.031687
2	137	0	0	0	243.4	114	41.38	121.2	110	10.30	...	104	7.32	12.2	5	3.29	0	0	0	0.937304	0.062696
3	84	1	0	0	299.4	71	50.90	61.9	88	5.26	...	89	8.86	6.6	7	1.78	2	0	1	0.406369	0.593631
4	75	1	0	0	166.7	113	28.34	148.3	122	12.61	...	121	8.41	10.1	3	2.73	3	0	0	0.530660	0.469340

5 rows × 21 columns

In [23]:

from sklearn import metrics

In [24]:

cm=metrics.confusion_matrix(telcochurn.Churn,telcochurn.predicted_class)

In [25]:

cm

Out[25]:

array([[4201, 92],

[ 583, 124]])

In [26]:

import seaborn as sn

In [28]:

sn.heatmap(cm,annot=True,fmt='.2f');

In [29]:

score=metrics.accuracy_score(telcochurn.Churn,telcochurn.predicted_class)

In [30]:

score

Out[30]:

0.86499999999999999

In [32]:

from sklearn.cross_validation import cross_val_score

C:\Users\rajeshdgr8\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.

"This module will be removed in 0.20.", DeprecationWarning)

In [33]:

scores=cross_val_score(logreg,feature_cols,telcochurn.Churn,cv=10,scoring='accuracy')

In [34]:

scores

Out[34]:

array([ 0.86227545, 0.87225549, 0.85229541, 0.862 , 0.864 ,

0.868 , 0.856 , 0.86172345, 0.85971944, 0.8757515 ])

In [35]:

scores.mean()

Out[35]:

0.86340207360829435

In [37]:

print(metrics.classification_report(telcochurn.Churn,telcochurn.predicted_class))

precision recall f1-score support

0 0.88 0.98 0.93 4293

1 0.57 0.18 0.27 707

avg / total 0.84 0.86 0.83 5000

In [38]:

auc_score = metrics.roc_auc_score(telcochurn.Churn,telcochurn.predicted_class)

In [39]:

auc_score

Out[39]:

0.57697936610073108

In [41]:

import statsmodels.api as sm

In [42]:

In [50]:

logit=sm.Logit(telcochurn.Churn,feature_cols)

In [56]:

result=logit.fit()

Optimization terminated successfully.

Current function value: 0.338345

Iterations 7

In [59]:

result.summary()

Out[59]:

Logit Regression Results
Dep. Variable:	Churn	No. Observations:	5000
Model:	Logit	Df Residuals:	4983
Method:	MLE	Df Model:	16
Date:	Mon, 08 May 2017	Pseudo R-squ.:	0.1697
Time:	19:17:19	Log-Likelihood:	-1691.7
converged:	True	LL-Null:	-2037.5
		LLR p-value:	8.251e-137

	coef	std err	z	P>\|z\|	[0.025	0.975]
Account Length	-0.0019	0.001	-1.762	0.078	-0.004	0.000
International Plan	2.0031	0.118	16.928	0.000	1.771	2.235
Voice Mail Plan	-2.0193	0.463	-4.364	0.000	-2.926	-1.112
Num of Voice mail Messages	0.0310	0.014	2.145	0.032	0.003	0.059
Total Day Minutes	4.0780	2.604	1.566	0.117	-1.025	9.181
Total Day Calls	-0.0111	0.002	-5.468	0.000	-0.015	-0.007
Total day Charge	-23.9328	15.317	-1.563	0.118	-53.953	6.088
Total Eve Minutes	-0.0236	1.304	-0.018	0.986	-2.579	2.532
Total Eve Calls	-0.0154	0.002	-7.504	0.000	-0.019	-0.011
Total Eve Charge	0.2991	15.341	0.019	0.984	-29.768	30.367
Total Night Minutes	0.1664	0.696	0.239	0.811	-1.197	1.530
Total Night Calls	-0.0147	0.002	-7.127	0.000	-0.019	-0.011
Total Night Charge	-3.7082	15.459	-0.240	0.810	-34.007	26.591
Total International Minutes	0.4588	4.214	0.109	0.913	-7.801	8.719
Total Intl Calls	-0.1106	0.020	-5.520	0.000	-0.150	-0.071
Total Intl Charge	-1.6785	15.609	-0.108	0.914	-32.271	28.914
Number Customer Service calls	0.4315	0.031	13.821	0.000	0.370	0.493

In [60]:

from sklearn import svm

In [64]:

SVMmodel = svm.SVC()

In [65]:

SVMmodel.fit(feature_cols,telcochurn.Churn)

Out[65]:

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,

decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',

max_iter=-1, probability=False, random_state=None, shrinking=True,

tol=0.001, verbose=False)

In [67]:

telcochurn["SVMpredictedclass"]=SVMmodel.predict(feature_cols)

In [68]:

Out[68]:

	Account Length	International Plan	Voice Mail Plan	Num of Voice mail Messages	Total Day Minutes	Total Day Calls	Total day Charge	Total Eve Minutes	Total Eve Calls	Total Eve Charge	...	Total Night Charge	Total International Minutes	Total Intl Calls	Total Intl Charge	Number Customer Service calls	Churn	predicted_class	0	1	SVMpredictedclass
0	128	0	1	25	265.1	110	45.07	197.4	99	16.78	...	11.01	10.0	3	2.70	1	0	0	0.910662	0.089338	0
1	107	0	1	26	161.6	123	27.47	195.5	103	16.62	...	11.45	13.7	3	3.70	1	0	0	0.968313	0.031687	0
2	137	0	0	0	243.4	114	41.38	121.2	110	10.30	...	7.32	12.2	5	3.29	0	0	0	0.937304	0.062696	0
3	84	1	0	0	299.4	71	50.90	61.9	88	5.26	...	8.86	6.6	7	1.78	2	0	1	0.406369	0.593631	0
4	75	1	0	0	166.7	113	28.34	148.3	122	12.61	...	8.41	10.1	3	2.73	3	0	0	0.530660	0.469340	0

5 rows × 22 columns

In [69]:

svmcm=metrics.confusion_matrix(telcochurn.Churn,telcochurn.SVMpredictedclass)

In [70]:

svmcm

Out[70]:

array([[4293, 0],

[ 0, 707]])

In [71]:

sn.heatmap(svmcm,annot=True,fmt='.2f');

In [72]:

svmscore=metrics.accuracy_score(telcochurn.Churn,telcochurn.SVMpredictedclass)

In [73]:

svmscore

Out[73]:

1.0

In [74]:

svmscores=cross_val_score(SVMmodel,feature_cols,telcochurn.Churn,cv=10,scoring='accuracy')

In [75]:

svmscores

Out[75]:

array([ 0.85828343, 0.85828343, 0.85828343, 0.858 , 0.858 ,

0.858 , 0.858 , 0.85971944, 0.85971944, 0.85971944])

In [76]:

svmscores.mean()

Out[76]:

0.8586008616034464

In [77]:

print(metrics.classification_report(telcochurn.Churn,telcochurn.SVMpredictedclass))

precision recall f1-score support

0 1.00 1.00 1.00 4293

1 1.00 1.00 1.00 707

avg / total 1.00 1.00 1.00 5000

In [78]:

auc_score = metrics.roc_auc_score(telcochurn.Churn,telcochurn.SVMpredictedclass)

In [79]:

auc_score

Out[79]:

1.0

In [83]:

from sklearn.ensemble import RandomForestClassifier

In [84]:

rf = RandomForestClassifier(n_estimators=100)

In [85]:

rf.fit(feature_cols,telcochurn.Churn)

Out[85]:

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',

max_depth=None, max_features='auto', max_leaf_nodes=None,

min_impurity_split=1e-07, min_samples_leaf=1,

min_samples_split=2, min_weight_fraction_leaf=0.0,

n_estimators=100, n_jobs=1, oob_score=False, random_state=None,

verbose=0, warm_start=False)

In [86]:

telcochurn["Randfpredictedclass"]=rf.predict(feature_cols)

In [87]:

telcochurn.head()

Out[87]:

	Account Length	International Plan	Voice Mail Plan	Num of Voice mail Messages	Total Day Minutes	Total Day Calls	Total day Charge	Total Eve Minutes	Total Eve Calls	Total Eve Charge	...	Total International Minutes	Total Intl Calls	Total Intl Charge	Number Customer Service calls	Churn	predicted_class	0	1	SVMpredictedclass	Randfpredictedclass
0	128	0	1	25	265.1	110	45.07	197.4	99	16.78	...	10.0	3	2.70	1	0	0	0.910662	0.089338	0	0
1	107	0	1	26	161.6	123	27.47	195.5	103	16.62	...	13.7	3	3.70	1	0	0	0.968313	0.031687	0	0
2	137	0	0	0	243.4	114	41.38	121.2	110	10.30	...	12.2	5	3.29	0	0	0	0.937304	0.062696	0	0
3	84	1	0	0	299.4	71	50.90	61.9	88	5.26	...	6.6	7	1.78	2	0	1	0.406369	0.593631	0	0
4	75	1	0	0	166.7	113	28.34	148.3	122	12.61	...	10.1	3	2.73	3	0	0	0.530660	0.469340	0	0

5 rows × 23 columns

In [88]:

RFcm=metrics.confusion_matrix(telcochurn.Churn,telcochurn.Randfpredictedclass)

In [89]:

RFcm

Out[89]:

array([[4293, 0],

[ 0, 707]])

In [ ]:

DataAdvanceR Labs

Monday, May 8, 2017

Python Case Study - Predicting Telecom Customer Churn using Logistic Regression, Support Vector Machine & Random Forest