Wednesday, October 17, 2018

Python for Data Science - Basics, Descriptive Statistics, Hypothesis Testing, & Regression Models


Python PGDM 201719 nmims
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
# Load the mtcars dataset into a DataFrame.
# NOTE(review): hard-coded absolute local path — this breaks on any other
# machine. Prefer a configurable DATA_DIR (pathlib.Path) relative to the project.
mtcars=pd.read_csv("/Users/Rajesh Prabhakar/Desktop/Datasets/mtcars.csv")
In [4]:
# The first commands to run on data import:
#     dfname.head() First 5 rows of data
#     dfname.tail() Last 5 rows of data
#     dfname.shape  First no of rows & second no of columns
#     dfname.dtypes Individual data type of each variable
In [5]:
# Preview the first 5 rows of the DataFrame.
mtcars.head()
#head() # Function is predefined & parentheses (round brackets) are mandatory
Out[5]:
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
0 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
3 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
4 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
In [6]:
mtcars.tail() # last five rows of dataframe
Out[6]:
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
27 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
28 Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.5 0 1 5 4
29 Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.5 0 1 5 6
30 Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.6 0 1 5 8
31 Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.6 1 1 4 2
In [7]:
mtcars.shape # Number of rows and Number of columns
Out[7]:
(32, 12)
In [8]:
mtcars.dtypes
Out[8]:
Unnamed: 0     object
mpg           float64
cyl             int64
disp          float64
hp              int64
drat          float64
wt            float64
qsec          float64
vs              int64
am              int64
gear            int64
carb            int64
dtype: object
In [9]:
# The following are common Big Data data types:
#     a) Numerical - Integer & Float
#     b) String or Character or Text (" ")
#     c) Boolean or Logical (True,False)
#     d) Images - jpeg, png, tiff, satellite, 
#     e) Geo Spatial - latitude, longitudes, GPS, maps
#     f) Internet of Things (Sensor based Data like Temperature)
#     g) Social Media - Updates, Videos, Pictures, Tweets, Networks
#     h) High Frequency Data - very fast updating stock prices, currency exchange rates
#     i) Streaming Data - Tweets, Videos, Pics, Traffic
#     j) Traffic Data - Web or Road Traffic
In [10]:
# BIG DATA ANALYTICS - DESCRIPTIVE ANALYTICS - UNIVARIATE STATISTICS
# MEASURES OF CENTRAL TENDENCY - MEAN, MEDIAN, MODE
# MEASURES OF DISPERSION - RANGE, VARIANCE, STANDARD DEVIATION, QUARTILES, PERCENTILES
# MEASURES OF ASYMMETRY - SKEWNESS & KURTOSIS
# MEASURES OF RELATIONSHIP - COVARIANCE & CORRELATION (BIVARIATE STATISTICS)
In [11]:
# Summary statistics, transposed so variables are rows and the statistics
# (count/mean/std/quartiles) are columns — easier to read with many variables.
mtcars.describe().T
Out[11]:
count mean std min 25% 50% 75% max
mpg 32.0 20.090625 6.026948 10.400 15.42500 19.200 22.80 33.900
cyl 32.0 6.187500 1.785922 4.000 4.00000 6.000 8.00 8.000
disp 32.0 230.721875 123.938694 71.100 120.82500 196.300 326.00 472.000
hp 32.0 146.687500 68.562868 52.000 96.50000 123.000 180.00 335.000
drat 32.0 3.596563 0.534679 2.760 3.08000 3.695 3.92 4.930
wt 32.0 3.217250 0.978457 1.513 2.58125 3.325 3.61 5.424
qsec 32.0 17.848750 1.786943 14.500 16.89250 17.710 18.90 22.900
vs 32.0 0.437500 0.504016 0.000 0.00000 0.000 1.00 1.000
am 32.0 0.406250 0.498991 0.000 0.00000 0.000 1.00 1.000
gear 32.0 3.687500 0.737804 3.000 3.00000 4.000 4.00 5.000
carb 32.0 2.812500 1.615200 1.000 2.00000 2.000 4.00 8.000
In [12]:
# Variance of each numeric column.
# numeric_only=True is required under pandas >= 2.0: the frame contains an
# object column ('Unnamed: 0') that .var() no longer drops silently.
mtcars.var(numeric_only=True) # Variance
Out[12]:
mpg        36.324103
cyl         3.189516
disp    15360.799829
hp       4700.866935
drat        0.285881
wt          0.957379
qsec        3.193166
vs          0.254032
am          0.248992
gear        0.544355
carb        2.608871
dtype: float64
In [13]:
# Skewness of each numeric column (numeric_only=True avoids a TypeError
# on the object column under pandas >= 2.0).
mtcars.skew(numeric_only=True)
# Positive Skewness Correction - Logarithmic or Square-root transform
# Negative Skewness Correction - Exponential or Power transform
Out[13]:
mpg     0.672377
cyl    -0.192261
disp    0.420233
hp      0.799407
drat    0.292780
wt      0.465916
qsec    0.406347
vs      0.264542
am      0.400809
gear    0.582309
carb    1.157091
dtype: float64
In [14]:
# Excess kurtosis of each numeric column (0 = normal distribution).
# numeric_only=True keeps pandas >= 2.0 from raising on the object column.
mtcars.kurt(numeric_only=True) # kurtosis
Out[14]:
mpg    -0.022006
cyl    -1.762794
disp   -1.067523
hp      0.275212
drat   -0.450432
wt      0.416595
qsec    0.864931
vs     -2.063273
am     -1.966550
gear   -0.895292
carb    2.020059
dtype: float64
In [15]:
# Pairwise covariance matrix of the numeric columns.
# numeric_only=True excludes the object column explicitly (required in pandas >= 2.0).
mtcars.cov(numeric_only=True) # Covariance
Out[15]:
mpg cyl disp hp drat wt qsec vs am gear carb
mpg 36.324103 -9.172379 -633.097208 -320.732056 2.195064 -5.116685 4.509149 2.017137 1.803931 2.135685 -5.363105
cyl -9.172379 3.189516 199.660282 101.931452 -0.668367 1.367371 -1.886855 -0.729839 -0.465726 -0.649194 1.520161
disp -633.097208 199.660282 15360.799829 6721.158669 -47.064019 107.684204 -96.051681 -44.377621 -36.564012 -50.802621 79.068750
hp -320.732056 101.931452 6721.158669 4700.866935 -16.451109 44.192661 -86.770081 -24.987903 -8.320565 -6.358871 83.036290
drat 2.195064 -0.668367 -47.064019 -16.451109 0.285881 -0.372721 0.087141 0.118649 0.190151 0.275988 -0.078407
wt -5.116685 1.367371 107.684204 44.192661 -0.372721 0.957379 -0.305482 -0.273661 -0.338105 -0.421081 0.675790
qsec 4.509149 -1.886855 -96.051681 -86.770081 0.087141 -0.305482 3.193166 0.670565 -0.204960 -0.280403 -1.894113
vs 2.017137 -0.729839 -44.377621 -24.987903 0.118649 -0.273661 0.670565 0.254032 0.042339 0.076613 -0.463710
am 1.803931 -0.465726 -36.564012 -8.320565 0.190151 -0.338105 -0.204960 0.042339 0.248992 0.292339 0.046371
gear 2.135685 -0.649194 -50.802621 -6.358871 0.275988 -0.421081 -0.280403 0.076613 0.292339 0.544355 0.326613
carb -5.363105 1.520161 79.068750 83.036290 -0.078407 0.675790 -1.894113 -0.463710 0.046371 0.326613 2.608871
In [16]:
# Pairwise Pearson correlation matrix of the numeric columns.
# numeric_only=True excludes the object column explicitly (required in pandas >= 2.0).
mtcars.corr(numeric_only=True) # Correlation
Out[16]:
mpg cyl disp hp drat wt qsec vs am gear carb
mpg 1.000000 -0.852162 -0.847551 -0.776168 0.681172 -0.867659 0.418684 0.664039 0.599832 0.480285 -0.550925
cyl -0.852162 1.000000 0.902033 0.832447 -0.699938 0.782496 -0.591242 -0.810812 -0.522607 -0.492687 0.526988
disp -0.847551 0.902033 1.000000 0.790949 -0.710214 0.887980 -0.433698 -0.710416 -0.591227 -0.555569 0.394977
hp -0.776168 0.832447 0.790949 1.000000 -0.448759 0.658748 -0.708223 -0.723097 -0.243204 -0.125704 0.749812
drat 0.681172 -0.699938 -0.710214 -0.448759 1.000000 -0.712441 0.091205 0.440278 0.712711 0.699610 -0.090790
wt -0.867659 0.782496 0.887980 0.658748 -0.712441 1.000000 -0.174716 -0.554916 -0.692495 -0.583287 0.427606
qsec 0.418684 -0.591242 -0.433698 -0.708223 0.091205 -0.174716 1.000000 0.744535 -0.229861 -0.212682 -0.656249
vs 0.664039 -0.810812 -0.710416 -0.723097 0.440278 -0.554916 0.744535 1.000000 0.168345 0.206023 -0.569607
am 0.599832 -0.522607 -0.591227 -0.243204 0.712711 -0.692495 -0.229861 0.168345 1.000000 0.794059 0.057534
gear 0.480285 -0.492687 -0.555569 -0.125704 0.699610 -0.583287 -0.212682 0.206023 0.794059 1.000000 0.274073
carb -0.550925 0.526988 0.394977 0.749812 -0.090790 0.427606 -0.656249 -0.569607 0.057534 0.274073 1.000000
In [17]:
# DATA VISUALIZATIONS:UNIVARIATE
#     LINE PLOTS
#     COLUMN BARS - VERTICAL & HORIZONTAL, STACKED
#     PIE CHARTS
# ADVANCED VISUALIZATIONS FOR DATA SCIENCE:
#     HISTOGRAM - Detect SKEWNESS, KURTOSIS, MISSING VALUES & OUTLIERS
#     BOXPLOT - Detect OUTLIERS, SKEWNESS, KURTOSIS, MISSING VALUES
#     DENSITY CURVES - Detect SKEWNESS, KURTOSIS, MISSING VALUES & OUTLIERS
# THE ABOVE THREE GRAPHS ARE A MUST FOR THE TARGET VARIABLE IF IT IS NUMERICAL & CONTINUOUS

# OUTLIERS ARE EXTREME VALUES THAT FALL OUTSIDE THE NORMAL RANGE
#  LOWER OUTLIERS - Q1 - 1.5*INTER QUARTILE RANGE (Q3-Q1)
#  UPPER OUTLIERS - Q3 + 1.5*INTER QUARTILE RANGE (Q3-Q1)
    
In [18]:
mtcars.mpg.describe()  # Descriptive Statistics of mileage per gallon variable
Out[18]:
count    32.000000
mean     20.090625
std       6.026948
min      10.400000
25%      15.425000
50%      19.200000
75%      22.800000
max      33.900000
Name: mpg, dtype: float64
In [19]:
mtcars[['mpg','hp','wt']].describe() # Descriptive statistics of 3 columns
# Double Square brackets for selection of multiple variables
Out[19]:
mpg hp wt
count 32.000000 32.000000 32.000000
mean 20.090625 146.687500 3.217250
std 6.026948 68.562868 0.978457
min 10.400000 52.000000 1.513000
25% 15.425000 96.500000 2.581250
50% 19.200000 123.000000 3.325000
75% 22.800000 180.000000 3.610000
max 33.900000 335.000000 5.424000
In [20]:
mtcars.plot(kind='scatter',x='mpg',y='drat')
# kind= 'hist' or 'line' or 'bar' or 'box' or 'density' or 'pie' or 'scatter'
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x26fa4443da0>
In [21]:
# Stacked univariate views of mpg: a horizontal boxplot (top panel) to spot
# outliers, and a density curve (bottom panel) to judge skewness/kurtosis.
plt.subplot(211)
mtcars.mpg.plot(kind='box',vert=False)
plt.subplot(212)
mtcars.mpg.plot(kind='density')
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x26fa51074e0>
In [22]:
# DATA MANIPULATION
# DO DESCRIBE OF MPG COLUMN
# CREATE A NEW VARIABLE "MILEAGETYPE" BASED ON MPG - HIGHMILEAGE(>22.80), 
# MEDIUMMILEAGE(>19.20 & <22.80) ,& lOW MILEAGE (<19.20)
In [23]:
mtcars.mpg.describe()
Out[23]:
count    32.000000
mean     20.090625
std       6.026948
min      10.400000
25%      15.425000
50%      19.200000
75%      22.800000
max      33.900000
Name: mpg, dtype: float64
In [24]:
# Derive a categorical mileage band from mpg using the quartile cut-offs
# seen in describe(): >=22.80 high, >=19.20 medium, otherwise low.
mileagetype=[
    "Highmileage" if mpg>=22.80
    else "Mediummileage" if mpg>=19.20
    else "Lowmileage"
    for mpg in mtcars.mpg
]
mtcars['mileagetype']=mileagetype
In [25]:
# Frequency counts of the newly created variable.
# pd.value_counts(...) is deprecated (removed in pandas 3.0); the Series
# method is the supported spelling.
mtcars.mileagetype.value_counts() # frequency counts of newly created variable
Out[25]:
Lowmileage       15
Highmileage       9
Mediummileage     8
Name: mileagetype, dtype: int64
In [26]:
# CREATE A NEW VARIABLE "SPEEDTYPE" BASED ON "HP" - HIGH, MEDIUM & LOW SPEED
In [27]:
# Derive a categorical speed band from horsepower: >=180 high,
# >=123 medium, otherwise low.
speedtype=[
    "highspeed" if hp>=180
    else "mediumspeed" if hp>=123
    else "lowspeed"
    for hp in mtcars.hp
]
mtcars['speedtype']=speedtype
In [28]:
# Frequency counts of speedtype.
# Series.value_counts() replaces the deprecated pd.value_counts(...).
mtcars.speedtype.value_counts()
Out[28]:
lowspeed       15
highspeed      10
mediumspeed     7
Name: speedtype, dtype: int64
In [29]:
# Derive a categorical weight band from wt: >=3.61 heavy, >=3.32 medium,
# otherwise low.
weighttype=[
    "heavywt" if wt>=3.61
    else "mediumwt" if wt>=3.32
    else "lowwt"
    for wt in mtcars.wt
]
mtcars['weighttype']=weighttype
In [30]:
# Frequency counts of weighttype.
# Series.value_counts() replaces the deprecated pd.value_counts(...).
mtcars.weighttype.value_counts()
Out[30]:
lowwt       16
mediumwt     8
heavywt      8
Name: weighttype, dtype: int64
In [31]:
# Frequency counts of transmission type.
# Series.value_counts() replaces the deprecated pd.value_counts(...).
mtcars.am.value_counts()
# 0 means Automatic
# 1 means Manual
Out[31]:
0    19
1    13
Name: am, dtype: int64
In [32]:
# groupby function is a pivot function
# Is the Average mileage of Automatic & Manual Cars is Same or Equal
In [33]:
mtcars.mpg.groupby(mtcars.am).mean()
Out[33]:
am
0    17.147368
1    24.392308
Name: mpg, dtype: float64
In [34]:
mtcars.mpg.groupby(mtcars.am).var()
Out[34]:
am
0    14.699298
1    38.025769
Name: mpg, dtype: float64
In [35]:
# NULL - THERE IS NO SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE EQUAL
# ALTERNATE - THERE IS SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE NOT EQUAL

# IF p-value IS LESS THAN 0.05, REJECT NULL & ACCEPT THE ALTERNATE
# IF p-value IS GREATER THAN 0.05, FAIL TO REJECT NULL & REJECT THE ALTERNATE
# 0.05 MEANS 95% CONFIDENCE LEVEL OR 5% ALPHA OR ERROR
In [36]:
automatic=mtcars[mtcars.am==0]
manual=mtcars[mtcars.am==1]
# Subsetting or splitting the DataFrame into 2 dataframes:
# the first holds only automatic cars, the second only manual cars
In [37]:
from scipy import stats as st
In [38]:
# Scipy Stats package is for Statistical Modelling in Python
In [39]:
# Welch's two-sample t-test on mean mpg (equal_var=False does not assume
# equal group variances — appropriate here since the groupby variances differ).
st.ttest_ind(automatic.mpg,manual.mpg,equal_var=False)
# Since p-value is less than 0.05, REJECT NULL
Out[39]:
Ttest_indResult(statistic=-3.767123145144923, pvalue=0.0013736383330710345)
In [40]:
# Is the Average weight of Automatic & manual car is equal
# groupby wt with am - mean
# groupby wt with am - variance
# frame null & alternate hypothesis
# conduct the relevant hypothesis test
In [41]:
mtcars.wt.groupby(mtcars.am).mean()
Out[41]:
am
0    3.768895
1    2.411000
Name: wt, dtype: float64
In [42]:
mtcars.wt.groupby(mtcars.am).var()
Out[42]:
am
0    0.604351
1    0.380666
Name: wt, dtype: float64
In [43]:
# NULL - THERE IS NO SIGNIFICANT DIFFERENCE IN AVERAGE weight OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE EQUAL
# ALTERNATE - THERE IS SIGNIFICANT DIFFERENCE IN AVERAGE weight OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE NOT EQUAL
In [44]:
# Welch's two-sample t-test on mean weight of automatic vs manual cars.
st.ttest_ind(automatic.wt,manual.wt,equal_var=False)
# since p-value is less than 0.05 Reject Null Hypothesis
Out[44]:
Ttest_indResult(statistic=5.4939049392100916, pvalue=6.2720199100801419e-06)
In [45]:
# ONLY TWO GROUPS - 2 SAMPLE INDEPENDENT TTEST
# MORE THAN 2 GROUPS - ANOVA SINGLE FACTOR
# Is the average mpg of heavyweight,mediumwt & lowweight cars is equal

# NULL - THERE IS NO SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF HEAVY,MEDIUM &
# lOW WEIGHT CARS - ALL MEANS ARE EQUAL
# ALTERNATE - THERE IS SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF HEAVY,MEDIUM &
# lOW WEIGHT CARS - ALL MEANS ARE NOT EQUAL
In [46]:
mtcars.mpg.groupby(mtcars.weighttype).mean()
Out[46]:
weighttype
heavywt     14.6125
lowwt       24.5125
mediumwt    16.7250
Name: mpg, dtype: float64
In [47]:
heavywt=mtcars[mtcars.weighttype=='heavywt']
medwt=mtcars[mtcars.weighttype=='mediumwt']
lowwt=mtcars[mtcars.weighttype=='lowwt']
In [48]:
st.f_oneway(heavywt.mpg,medwt.mpg,lowwt.mpg)
Out[48]:
F_onewayResult(statistic=19.3396723713151, pvalue=4.604228434964327e-06)
In [49]:
# Is the Average MPG of ALL TYPES of GEARS CARS is Equal
In [50]:
mtcars.mpg.groupby(mtcars.gear).mean()
Out[50]:
gear
3    16.106667
4    24.533333
5    21.380000
Name: mpg, dtype: float64
In [51]:
g3=mtcars[mtcars.gear==3]
g4=mtcars[mtcars.gear==4]
g5=mtcars[mtcars.gear==5]
In [52]:
st.f_oneway(g3.mpg,g4.mpg,g5.mpg)
# Since p-value less than 0.05 Reject Null
Out[52]:
F_onewayResult(statistic=10.900719688660931, pvalue=0.00029482799285719474)
In [53]:
# BOTH VARIABLES ARE NON NUMERIC OR CATEGORICAL - CHI SQUARE TEST OF INDEPENDENCE
# INPUT OF CHI SQUARE TEST IS CROSS TABULATION (pd.crosstab())
In [54]:
pd.crosstab(mtcars.mileagetype,mtcars.weighttype)
Out[54]:
weighttype heavywt lowwt mediumwt
mileagetype
Highmileage 0 9 0
Lowmileage 7 1 7
Mediummileage 1 6 1
In [55]:
 st.chi2_contingency(pd.crosstab(mtcars.mileagetype,mtcars.weighttype))
# NULL - There is no relationship or association between both variables
# Alternate - There is a relationship or association between both variables
# Returned tuple: (chi2 statistic, p-value, degrees of freedom, expected counts)
# Since the p-value (second item) is less than 0.05, Reject Null
Out[55]:
(22.266666666666666, 0.00017735151269303971, 4, array([[ 2.25,  4.5 ,  2.25],
        [ 3.75,  7.5 ,  3.75],
        [ 2.  ,  4.  ,  2.  ]]))
In [56]:
# Is there relationship or association between am & cyl?
In [57]:
pd.crosstab(mtcars.am,mtcars.cyl)
Out[57]:
cyl 4 6 8
am
0 3 4 12
1 8 3 2
In [58]:
st.chi2_contingency(pd.crosstab(mtcars.am,mtcars.cyl))
Out[58]:
(8.7407329512592682,
 0.012646605046107276,
 2,
 array([[ 6.53125,  4.15625,  8.3125 ],
        [ 4.46875,  2.84375,  5.6875 ]]))
In [59]:
mtcars.dtypes
Out[59]:
Unnamed: 0      object
mpg            float64
cyl              int64
disp           float64
hp               int64
drat           float64
wt             float64
qsec           float64
vs               int64
am               int64
gear             int64
carb             int64
mileagetype     object
speedtype       object
weighttype      object
dtype: object
In [60]:
objectcols=mtcars.select_dtypes(include=['object'])
# Select only Object or Character Columns
In [61]:
numbercols=mtcars.select_dtypes(include=['number'])
# Select only Numerical Columns both float and int
In [62]:
from sklearn.preprocessing import LabelEncoder
In [63]:
le=LabelEncoder() # maps each distinct string in a column to an integer code
# NOTE(review): LabelEncoder is documented for encoding target labels; for
# feature columns OrdinalEncoder/OneHotEncoder are the intended tools — the
# integer codes used here impose an arbitrary ordering on the categories.
In [64]:
# Apply a fresh label-encoding fit to every object column
# (each column receives its own independent integer coding).
dummyobjectcols=objectcols.apply(le.fit_transform)
In [65]:
dummyobjectcols1=dummyobjectcols.drop('Unnamed: 0',axis=1)
In [66]:
mtcarsdf=pd.concat([numbercols,dummyobjectcols1],axis=1) # side by side merging 
In [67]:
mtcarsdf.head()
Out[67]:
mpg cyl disp hp drat wt qsec vs am gear carb mileagetype speedtype weighttype
0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 2 1 1
1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 2 1 1
2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 0 1 1
3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 2 1 1
4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 1 2 2
In [68]:
y=mtcarsdf.mpg
X=mtcarsdf.drop('mpg',axis=1)
# y is the Dependent Variable & X is Independent Variables in one matrix
In [69]:
from sklearn.linear_model import LinearRegression
In [70]:
LinReg=LinearRegression() # Assigning function to short name
In [71]:
LinRegmodel=LinReg.fit(X,y) # Fitting the model
In [72]:
LinRegmodel.score(X,y) # Checking the R Square
Out[72]:
0.90190896150328803
In [73]:
LinRegmodel.intercept_ # check intercept
Out[73]:
23.394352161958377
In [74]:
# Pair each feature name with its fitted coefficient — iterating a
# DataFrame yields its column names, which matches the order of coef_.
print(list(zip(X,LinRegmodel.coef_))) # check coefficients
[('cyl', 0.11649547289275182), ('disp', 0.015345016104492612), ('hp', -0.024956839717795803), ('drat', -0.14157799177590169), ('wt', -4.7944069590527967), ('qsec', 0.62394421207779338), ('vs', 2.0824828303494693), ('am', 2.3304302288736207), ('gear', -0.072299642384175133), ('carb', 0.58255390328537637), ('mileagetype', -1.4499234491818989), ('speedtype', 0.90917343394414107), ('weighttype', -1.7557182557494166)]
In [75]:
# mpg=23.3943+0.1164*cyl+0.0153*disp-0.024*hp-0.1415*drat-4.7944*wt+0.623*qsec...
In [76]:
mpgpredicted=LinRegmodel.predict(X) # predicted mpg for all observations
In [77]:
# Root Mean Square Error: square root of the mean squared residual,
# computed on the training data.
np.sqrt(((y-mpgpredicted)**2).mean()) # Root Mean Square Error
Out[77]:
1.8578813371870724
In [78]:
resid=y-mpgpredicted # Residuals
In [79]:
np.sqrt(np.mean(resid**2)) # Root Mean Square Error Formula
Out[79]:
1.8578813371870724
In [80]:
from sklearn.tree import DecisionTreeRegressor
In [81]:
DecTree=DecisionTreeRegressor()
In [82]:
DecTreemodel=DecTree.fit(X,y)
In [83]:
# Training-set R-squared. A perfect 1.0 here signals overfitting: an
# unpruned decision tree memorizes the training sample — there is no
# held-out test set in this notebook to measure real predictive skill.
DecTreemodel.score(X,y)
Out[83]:
1.0
In [84]:
DecTreepredict=DecTreemodel.predict(X)
In [85]:
# Training-set RMSE of the tree — 0.0 for the same reason R-squared is 1.0:
# the unpruned tree reproduces the training targets exactly.
np.sqrt(np.mean((y-DecTreepredict)**2))
Out[85]:
0.0
In [86]:
from sklearn.ensemble import RandomForestRegressor
In [87]:
# n_estimators = number of trees in the forest.
# random_state pins the bootstrap sampling and feature selection so the
# fit is reproducible across runs (the original was unseeded, so its
# reported score/RMSE change on every re-run).
RF=RandomForestRegressor(n_estimators=1000,random_state=42) # n_estimators=num of trees
In [88]:
RFmodel=RF.fit(X,y)
In [89]:
RFmodel.score(X,y)
Out[89]:
0.97853711558602552
In [90]:
RFpredict=RFmodel.predict(X)
In [91]:
np.sqrt(np.mean((y-RFpredict)**2))
Out[91]:
0.86905517355764206
In [ ]: