Wednesday, October 17, 2018

Python for Data Science - Basics, Descriptive Statistics, Hypothesis Testing, & Regression Models


Python PGDM 201719 nmims
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
# Load the mtcars dataset into a DataFrame.
# NOTE(review): hard-coded absolute local path — this breaks on any other
# machine. Prefer a configurable DATA_DIR (pathlib.Path) relative to the project.
mtcars=pd.read_csv("/Users/Rajesh Prabhakar/Desktop/Datasets/mtcars.csv")
In [4]:
# The first commands to run on data import:
#     dfname.head() First 5 rows of data
#     dfname.tail() Last 5 rows of data
#     dfname.shape  First no of rows & second no of columns
#     dfname.dtypes Individual data type of each variable
In [5]:
# Preview the first 5 rows of the DataFrame.
mtcars.head()
#head() # Function is predefined & parentheses (round brackets) are mandatory
Out[5]:
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
0 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
3 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
4 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
In [6]:
mtcars.tail() # last five rows of dataframe
Out[6]:
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
27 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
28 Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.5 0 1 5 4
29 Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.5 0 1 5 6
30 Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.6 0 1 5 8
31 Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.6 1 1 4 2
In [7]:
mtcars.shape # Number of rows and Number of columns
Out[7]:
(32, 12)
In [8]:
mtcars.dtypes
Out[8]:
Unnamed: 0     object
mpg           float64
cyl             int64
disp          float64
hp              int64
drat          float64
wt            float64
qsec          float64
vs              int64
am              int64
gear            int64
carb            int64
dtype: object
In [9]:
# The following are common Big Data data types:
#     a) Numerical - Integer & Float
#     b) String or Character or Text (" ")
#     c) Boolean or Logical (True,False)
#     d) Images - jpeg, png, tiff, satellite, 
#     e) Geo Spatial - latitude, longitudes, GPS, maps
#     f) Internet of Things (Sensor based Data like Temperature)
#     g) Social Media - Updates, Videos, Pictures, Tweets, Networks
#     h) High Frequency Data - very fast updating stock prices, currency exchange rates
#     i) Streaming Data - Tweets, Videos, Pics, Traffic
#     j) Traffic Data - Web or Road Traffic
In [10]:
# BIG DATA ANALYTICS - DESCRIPTIVE ANALYTICS - UNIVARIATE STATISTICS
# MEASURES OF CENTRAL TENDENCY - MEAN, MEDIAN, MODE
# MEASURES OF DISPERSION - RANGE, VARIANCE, STANDARD DEVIATION, QUARTILES, PERCENTILES
# MEASURES OF ASYMMETRY - SKEWNESS & KURTOSIS
# MEASURES OF RELATIONSHIP - COVARIANCE & CORRELATION (BIVARIATE STATISTICS)
In [11]:
# Summary statistics, transposed so variables are rows and the statistics
# (count/mean/std/quartiles) are columns — easier to read with many variables.
mtcars.describe().T
Out[11]:
count mean std min 25% 50% 75% max
mpg 32.0 20.090625 6.026948 10.400 15.42500 19.200 22.80 33.900
cyl 32.0 6.187500 1.785922 4.000 4.00000 6.000 8.00 8.000
disp 32.0 230.721875 123.938694 71.100 120.82500 196.300 326.00 472.000
hp 32.0 146.687500 68.562868 52.000 96.50000 123.000 180.00 335.000
drat 32.0 3.596563 0.534679 2.760 3.08000 3.695 3.92 4.930
wt 32.0 3.217250 0.978457 1.513 2.58125 3.325 3.61 5.424
qsec 32.0 17.848750 1.786943 14.500 16.89250 17.710 18.90 22.900
vs 32.0 0.437500 0.504016 0.000 0.00000 0.000 1.00 1.000
am 32.0 0.406250 0.498991 0.000 0.00000 0.000 1.00 1.000
gear 32.0 3.687500 0.737804 3.000 3.00000 4.000 4.00 5.000
carb 32.0 2.812500 1.615200 1.000 2.00000 2.000 4.00 8.000
In [12]:
# Variance of each numeric column.
# numeric_only=True is required under pandas >= 2.0: the frame contains an
# object column ('Unnamed: 0') that .var() no longer drops silently.
mtcars.var(numeric_only=True) # Variance
Out[12]:
mpg        36.324103
cyl         3.189516
disp    15360.799829
hp       4700.866935
drat        0.285881
wt          0.957379
qsec        3.193166
vs          0.254032
am          0.248992
gear        0.544355
carb        2.608871
dtype: float64
In [13]:
# Skewness of each numeric column (numeric_only=True avoids a TypeError
# on the object column under pandas >= 2.0).
mtcars.skew(numeric_only=True)
# Positive Skewness Correction - Logarithmic or Square-root transform
# Negative Skewness Correction - Exponential or Power transform
Out[13]:
mpg     0.672377
cyl    -0.192261
disp    0.420233
hp      0.799407
drat    0.292780
wt      0.465916
qsec    0.406347
vs      0.264542
am      0.400809
gear    0.582309
carb    1.157091
dtype: float64
In [14]:
# Excess kurtosis of each numeric column (0 = normal distribution).
# numeric_only=True keeps pandas >= 2.0 from raising on the object column.
mtcars.kurt(numeric_only=True) # kurtosis
Out[14]:
mpg    -0.022006
cyl    -1.762794
disp   -1.067523
hp      0.275212
drat   -0.450432
wt      0.416595
qsec    0.864931
vs     -2.063273
am     -1.966550
gear   -0.895292
carb    2.020059
dtype: float64
In [15]:
# Pairwise covariance matrix of the numeric columns.
# numeric_only=True excludes the object column explicitly (required in pandas >= 2.0).
mtcars.cov(numeric_only=True) # Covariance
Out[15]:
mpg cyl disp hp drat wt qsec vs am gear carb
mpg 36.324103 -9.172379 -633.097208 -320.732056 2.195064 -5.116685 4.509149 2.017137 1.803931 2.135685 -5.363105
cyl -9.172379 3.189516 199.660282 101.931452 -0.668367 1.367371 -1.886855 -0.729839 -0.465726 -0.649194 1.520161
disp -633.097208 199.660282 15360.799829 6721.158669 -47.064019 107.684204 -96.051681 -44.377621 -36.564012 -50.802621 79.068750
hp -320.732056 101.931452 6721.158669 4700.866935 -16.451109 44.192661 -86.770081 -24.987903 -8.320565 -6.358871 83.036290
drat 2.195064 -0.668367 -47.064019 -16.451109 0.285881 -0.372721 0.087141 0.118649 0.190151 0.275988 -0.078407
wt -5.116685 1.367371 107.684204 44.192661 -0.372721 0.957379 -0.305482 -0.273661 -0.338105 -0.421081 0.675790
qsec 4.509149 -1.886855 -96.051681 -86.770081 0.087141 -0.305482 3.193166 0.670565 -0.204960 -0.280403 -1.894113
vs 2.017137 -0.729839 -44.377621 -24.987903 0.118649 -0.273661 0.670565 0.254032 0.042339 0.076613 -0.463710
am 1.803931 -0.465726 -36.564012 -8.320565 0.190151 -0.338105 -0.204960 0.042339 0.248992 0.292339 0.046371
gear 2.135685 -0.649194 -50.802621 -6.358871 0.275988 -0.421081 -0.280403 0.076613 0.292339 0.544355 0.326613
carb -5.363105 1.520161 79.068750 83.036290 -0.078407 0.675790 -1.894113 -0.463710 0.046371 0.326613 2.608871
In [16]:
# Pairwise Pearson correlation matrix of the numeric columns.
# numeric_only=True excludes the object column explicitly (required in pandas >= 2.0).
mtcars.corr(numeric_only=True) # Correlation
Out[16]:
mpg cyl disp hp drat wt qsec vs am gear carb
mpg 1.000000 -0.852162 -0.847551 -0.776168 0.681172 -0.867659 0.418684 0.664039 0.599832 0.480285 -0.550925
cyl -0.852162 1.000000 0.902033 0.832447 -0.699938 0.782496 -0.591242 -0.810812 -0.522607 -0.492687 0.526988
disp -0.847551 0.902033 1.000000 0.790949 -0.710214 0.887980 -0.433698 -0.710416 -0.591227 -0.555569 0.394977
hp -0.776168 0.832447 0.790949 1.000000 -0.448759 0.658748 -0.708223 -0.723097 -0.243204 -0.125704 0.749812
drat 0.681172 -0.699938 -0.710214 -0.448759 1.000000 -0.712441 0.091205 0.440278 0.712711 0.699610 -0.090790
wt -0.867659 0.782496 0.887980 0.658748 -0.712441 1.000000 -0.174716 -0.554916 -0.692495 -0.583287 0.427606
qsec 0.418684 -0.591242 -0.433698 -0.708223 0.091205 -0.174716 1.000000 0.744535 -0.229861 -0.212682 -0.656249
vs 0.664039 -0.810812 -0.710416 -0.723097 0.440278 -0.554916 0.744535 1.000000 0.168345 0.206023 -0.569607
am 0.599832 -0.522607 -0.591227 -0.243204 0.712711 -0.692495 -0.229861 0.168345 1.000000 0.794059 0.057534
gear 0.480285 -0.492687 -0.555569 -0.125704 0.699610 -0.583287 -0.212682 0.206023 0.794059 1.000000 0.274073
carb -0.550925 0.526988 0.394977 0.749812 -0.090790 0.427606 -0.656249 -0.569607 0.057534 0.274073 1.000000
In [17]:
# DATA VISUALIZATIONS:UNIVARIATE
#     LINE PLOTS
#     COLUMN BARS - VERTICAL & HORIZONTAL, STACKED
#     PIE CHARTS
# ADVANCED VISUALIZATIONS FOR DATA SCIENCE:
#     HISTOGRAM - Detect SKEWNESS, KURTOSIS, MISSING VALUES & OUTLIERS
#     BOXPLOT - Detect OUTLIERS, SKEWNESS, KURTOSIS, MISSING VALUES
#     DENSITY CURVES - Detect SKEWNESS, KURTOSIS, MISSING VALUES & OUTLIERS
# THE ABOVE THREE GRAPHS ARE A MUST FOR THE TARGET VARIABLE IF IT IS NUMERICAL & CONTINUOUS

# OUTLIERS ARE EXTREME VALUES THAT FALL OUTSIDE THE NORMAL RANGE
#  LOWER OUTLIERS - Q1 - 1.5*INTER QUARTILE RANGE (Q3-Q1)
#  UPPER OUTLIERS - Q3 + 1.5*INTER QUARTILE RANGE (Q3-Q1)
    
In [18]:
mtcars.mpg.describe()  # Descriptive Statistics of mileage per gallon variable
Out[18]:
count    32.000000
mean     20.090625
std       6.026948
min      10.400000
25%      15.425000
50%      19.200000
75%      22.800000
max      33.900000
Name: mpg, dtype: float64
In [19]:
mtcars[['mpg','hp','wt']].describe() # Descriptive statistics of 3 columns
# Double Square brackets for selection of multiple variables
Out[19]:
mpg hp wt
count 32.000000 32.000000 32.000000
mean 20.090625 146.687500 3.217250
std 6.026948 68.562868 0.978457
min 10.400000 52.000000 1.513000
25% 15.425000 96.500000 2.581250
50% 19.200000 123.000000 3.325000
75% 22.800000 180.000000 3.610000
max 33.900000 335.000000 5.424000
In [20]:
mtcars.plot(kind='scatter',x='mpg',y='drat')
# kind= 'hist' or 'line' or 'bar' or 'box' or 'density' or 'pie' or 'scatter'
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x26fa4443da0>
In [21]:
# Stacked univariate views of mpg: a horizontal boxplot (top panel) to spot
# outliers, and a density curve (bottom panel) to judge skewness/kurtosis.
plt.subplot(211)
mtcars.mpg.plot(kind='box',vert=False)
plt.subplot(212)
mtcars.mpg.plot(kind='density')
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x26fa51074e0>
In [22]:
# DATA MANIPULATION
# DO DESCRIBE OF MPG COLUMN
# CREATE A NEW VARIABLE "MILEAGETYPE" BASED ON MPG - HIGHMILEAGE(>22.80), 
# MEDIUMMILEAGE(>19.20 & <22.80) ,& lOW MILEAGE (<19.20)
In [23]:
mtcars.mpg.describe()
Out[23]:
count    32.000000
mean     20.090625
std       6.026948
min      10.400000
25%      15.425000
50%      19.200000
75%      22.800000
max      33.900000
Name: mpg, dtype: float64
In [24]:
# Derive a categorical mileage band from mpg using the quartile cut-offs
# seen in describe(): >=22.80 high, >=19.20 medium, otherwise low.
mileagetype=[
    "Highmileage" if mpg>=22.80
    else "Mediummileage" if mpg>=19.20
    else "Lowmileage"
    for mpg in mtcars.mpg
]
mtcars['mileagetype']=mileagetype
In [25]:
# Frequency counts of the newly created variable.
# pd.value_counts(...) is deprecated (removed in pandas 3.0); the Series
# method is the supported spelling.
mtcars.mileagetype.value_counts() # frequency counts of newly created variable
Out[25]:
Lowmileage       15
Highmileage       9
Mediummileage     8
Name: mileagetype, dtype: int64
In [26]:
# CREATE A NEW VARIABLE "SPEEDTYPE" BASED ON "HP" - HIGH, MEDIUM & LOW SPEED
In [27]:
# Derive a categorical speed band from horsepower: >=180 high,
# >=123 medium, otherwise low.
speedtype=[
    "highspeed" if hp>=180
    else "mediumspeed" if hp>=123
    else "lowspeed"
    for hp in mtcars.hp
]
mtcars['speedtype']=speedtype
In [28]:
# Frequency counts of speedtype.
# Series.value_counts() replaces the deprecated pd.value_counts(...).
mtcars.speedtype.value_counts()
Out[28]:
lowspeed       15
highspeed      10
mediumspeed     7
Name: speedtype, dtype: int64
In [29]:
# Derive a categorical weight band from wt: >=3.61 heavy, >=3.32 medium,
# otherwise low.
weighttype=[
    "heavywt" if wt>=3.61
    else "mediumwt" if wt>=3.32
    else "lowwt"
    for wt in mtcars.wt
]
mtcars['weighttype']=weighttype
In [30]:
# Frequency counts of weighttype.
# Series.value_counts() replaces the deprecated pd.value_counts(...).
mtcars.weighttype.value_counts()
Out[30]:
lowwt       16
mediumwt     8
heavywt      8
Name: weighttype, dtype: int64
In [31]:
# Frequency counts of transmission type.
# Series.value_counts() replaces the deprecated pd.value_counts(...).
mtcars.am.value_counts()
# 0 means Automatic
# 1 means Manual
Out[31]:
0    19
1    13
Name: am, dtype: int64
In [32]:
# groupby function is a pivot function
# Is the Average mileage of Automatic & Manual Cars is Same or Equal
In [33]:
mtcars.mpg.groupby(mtcars.am).mean()
Out[33]:
am
0    17.147368
1    24.392308
Name: mpg, dtype: float64
In [34]:
mtcars.mpg.groupby(mtcars.am).var()
Out[34]:
am
0    14.699298
1    38.025769
Name: mpg, dtype: float64
In [35]:
# NULL - THERE IS NO SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE EQUAL
# ALTERNATE - THERE IS SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE NOT EQUAL

# IF p-value IS LESS THAN 0.05, REJECT NULL & ACCEPT THE ALTERNATE
# IF p-value IS GREATER THAN 0.05, FAIL TO REJECT NULL & REJECT THE ALTERNATE
# 0.05 MEANS 95% CONFIDENCE LEVEL OR 5% ALPHA OR ERROR
In [36]:
automatic=mtcars[mtcars.am==0]
manual=mtcars[mtcars.am==1]
# Subsetting or splitting the DataFrame into 2 dataframes:
# the first holds only automatic cars, the second only manual cars
In [37]:
from scipy import stats as st
In [38]:
# Scipy Stats package is for Statistical Modelling in Python
In [39]:
# Welch's two-sample t-test on mean mpg (equal_var=False does not assume
# equal group variances — appropriate here since the groupby variances differ).
st.ttest_ind(automatic.mpg,manual.mpg,equal_var=False)
# Since p-value is less than 0.05, REJECT NULL
Out[39]:
Ttest_indResult(statistic=-3.767123145144923, pvalue=0.0013736383330710345)
In [40]:
# Is the Average weight of Automatic & manual car is equal
# groupby wt with am - mean
# groupby wt with am - variance
# frame null & alternate hypothesis
# conduct the relevant hypothesis test
In [41]:
mtcars.wt.groupby(mtcars.am).mean()
Out[41]:
am
0    3.768895
1    2.411000
Name: wt, dtype: float64
In [42]:
mtcars.wt.groupby(mtcars.am).var()
Out[42]:
am
0    0.604351
1    0.380666
Name: wt, dtype: float64
In [43]:
# NULL - THERE IS NO SIGNIFICANT DIFFERENCE IN AVERAGE weight OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE EQUAL
# ALTERNATE - THERE IS SIGNIFICANT DIFFERENCE IN AVERAGE weight OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE NOT EQUAL
In [44]:
# Welch's two-sample t-test on mean weight of automatic vs manual cars.
st.ttest_ind(automatic.wt,manual.wt,equal_var=False)
# since p-value is less than 0.05 Reject Null Hypothesis
Out[44]:
Ttest_indResult(statistic=5.4939049392100916, pvalue=6.2720199100801419e-06)
In [45]:
# ONLY TWO GROUPS - 2 SAMPLE INDEPENDENT TTEST
# MORE THAN 2 GROUPS - ANOVA SINGLE FACTOR
# Is the average mpg of heavyweight,mediumwt & lowweight cars is equal

# NULL - THERE IS NO SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF HEAVY,MEDIUM &
# lOW WEIGHT CARS - ALL MEANS ARE EQUAL
# ALTERNATE - THERE IS SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF HEAVY,MEDIUM &
# lOW WEIGHT CARS - ALL MEANS ARE NOT EQUAL
In [46]:
mtcars.mpg.groupby(mtcars.weighttype).mean()
Out[46]:
weighttype
heavywt     14.6125
lowwt       24.5125
mediumwt    16.7250
Name: mpg, dtype: float64
In [47]:
heavywt=mtcars[mtcars.weighttype=='heavywt']
medwt=mtcars[mtcars.weighttype=='mediumwt']
lowwt=mtcars[mtcars.weighttype=='lowwt']
In [48]:
st.f_oneway(heavywt.mpg,medwt.mpg,lowwt.mpg)
Out[48]:
F_onewayResult(statistic=19.3396723713151, pvalue=4.604228434964327e-06)
In [49]:
# Is the Average MPG of ALL TYPES of GEARS CARS is Equal
In [50]:
mtcars.mpg.groupby(mtcars.gear).mean()
Out[50]:
gear
3    16.106667
4    24.533333
5    21.380000
Name: mpg, dtype: float64
In [51]:
g3=mtcars[mtcars.gear==3]
g4=mtcars[mtcars.gear==4]
g5=mtcars[mtcars.gear==5]
In [52]:
st.f_oneway(g3.mpg,g4.mpg,g5.mpg)
# Since p-value less than 0.05 Reject Null
Out[52]:
F_onewayResult(statistic=10.900719688660931, pvalue=0.00029482799285719474)
In [53]:
# BOTH VARIABLES ARE NON NUMERIC OR CATEGORICAL - CHI SQUARE TEST OF INDEPENDENCE
# INPUT OF CHI SQUARE TEST IS CROSS TABULATION (pd.crosstab())
In [54]:
pd.crosstab(mtcars.mileagetype,mtcars.weighttype)
Out[54]:
weighttype heavywt lowwt mediumwt
mileagetype
Highmileage 0 9 0
Lowmileage 7 1 7
Mediummileage 1 6 1
In [55]:
 st.chi2_contingency(pd.crosstab(mtcars.mileagetype,mtcars.weighttype))
# NULL - There is no relationship or association between both variables
# Alternate - There is a relationship or association between both variables
# Returned tuple: (chi2 statistic, p-value, degrees of freedom, expected counts)
# Since the p-value (second item) is less than 0.05, Reject Null
Out[55]:
(22.266666666666666, 0.00017735151269303971, 4, array([[ 2.25,  4.5 ,  2.25],
        [ 3.75,  7.5 ,  3.75],
        [ 2.  ,  4.  ,  2.  ]]))
In [56]:
# Is there relationship or association between am & cyl?
In [57]:
pd.crosstab(mtcars.am,mtcars.cyl)
Out[57]:
cyl 4 6 8
am
0 3 4 12
1 8 3 2
In [58]:
st.chi2_contingency(pd.crosstab(mtcars.am,mtcars.cyl))
Out[58]:
(8.7407329512592682,
 0.012646605046107276,
 2,
 array([[ 6.53125,  4.15625,  8.3125 ],
        [ 4.46875,  2.84375,  5.6875 ]]))
In [59]:
mtcars.dtypes
Out[59]:
Unnamed: 0      object
mpg            float64
cyl              int64
disp           float64
hp               int64
drat           float64
wt             float64
qsec           float64
vs               int64
am               int64
gear             int64
carb             int64
mileagetype     object
speedtype       object
weighttype      object
dtype: object
In [60]:
objectcols=mtcars.select_dtypes(include=['object'])
# Select only Object or Character Columns
In [61]:
numbercols=mtcars.select_dtypes(include=['number'])
# Select only Numerical Columns both float and int
In [62]:
from sklearn.preprocessing import LabelEncoder
In [63]:
le=LabelEncoder() # maps each distinct string in a column to an integer code
# NOTE(review): LabelEncoder is documented for encoding target labels; for
# feature columns OrdinalEncoder/OneHotEncoder are the intended tools — the
# integer codes used here impose an arbitrary ordering on the categories.
In [64]:
# Apply a fresh label-encoding fit to every object column
# (each column receives its own independent integer coding).
dummyobjectcols=objectcols.apply(le.fit_transform)
In [65]:
dummyobjectcols1=dummyobjectcols.drop('Unnamed: 0',axis=1)
In [66]:
mtcarsdf=pd.concat([numbercols,dummyobjectcols1],axis=1) # side by side merging 
In [67]:
mtcarsdf.head()
Out[67]:
mpg cyl disp hp drat wt qsec vs am gear carb mileagetype speedtype weighttype
0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 2 1 1
1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 2 1 1
2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 0 1 1
3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 2 1 1
4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 1 2 2
In [68]:
y=mtcarsdf.mpg
X=mtcarsdf.drop('mpg',axis=1)
# y is the Dependent Variable & X is Independent Variables in one matrix
In [69]:
from sklearn.linear_model import LinearRegression
In [70]:
LinReg=LinearRegression() # Assigning function to short name
In [71]:
LinRegmodel=LinReg.fit(X,y) # Fitting the model
In [72]:
LinRegmodel.score(X,y) # Checking the R Square
Out[72]:
0.90190896150328803
In [73]:
LinRegmodel.intercept_ # check intercept
Out[73]:
23.394352161958377
In [74]:
# Pair each feature name with its fitted coefficient — iterating a
# DataFrame yields its column names, which matches the order of coef_.
print(list(zip(X,LinRegmodel.coef_))) # check coefficients
[('cyl', 0.11649547289275182), ('disp', 0.015345016104492612), ('hp', -0.024956839717795803), ('drat', -0.14157799177590169), ('wt', -4.7944069590527967), ('qsec', 0.62394421207779338), ('vs', 2.0824828303494693), ('am', 2.3304302288736207), ('gear', -0.072299642384175133), ('carb', 0.58255390328537637), ('mileagetype', -1.4499234491818989), ('speedtype', 0.90917343394414107), ('weighttype', -1.7557182557494166)]
In [75]:
# mpg=23.3943+0.1164*cyl+0.0153*disp-0.024*hp-0.1415*drat-4.7944*wt+0.623*qsec...
In [76]:
mpgpredicted=LinRegmodel.predict(X) # predicted mpg for all observations
In [77]:
# Root Mean Square Error: square root of the mean squared residual,
# computed on the training data.
np.sqrt(((y-mpgpredicted)**2).mean()) # Root Mean Square Error
Out[77]:
1.8578813371870724
In [78]:
resid=y-mpgpredicted # Residuals
In [79]:
np.sqrt(np.mean(resid**2)) # Root Mean Square Error Formula
Out[79]:
1.8578813371870724
In [80]:
from sklearn.tree import DecisionTreeRegressor
In [81]:
DecTree=DecisionTreeRegressor()
In [82]:
DecTreemodel=DecTree.fit(X,y)
In [83]:
# Training-set R-squared. A perfect 1.0 here signals overfitting: an
# unpruned decision tree memorizes the training sample — there is no
# held-out test set in this notebook to measure real predictive skill.
DecTreemodel.score(X,y)
Out[83]:
1.0
In [84]:
DecTreepredict=DecTreemodel.predict(X)
In [85]:
# Training-set RMSE of the tree — 0.0 for the same reason R-squared is 1.0:
# the unpruned tree reproduces the training targets exactly.
np.sqrt(np.mean((y-DecTreepredict)**2))
Out[85]:
0.0
In [86]:
from sklearn.ensemble import RandomForestRegressor
In [87]:
# n_estimators = number of trees in the forest.
# random_state pins the bootstrap sampling and feature selection so the
# fit is reproducible across runs (the original was unseeded, so its
# reported score/RMSE change on every re-run).
RF=RandomForestRegressor(n_estimators=1000,random_state=42) # n_estimators=num of trees
In [88]:
RFmodel=RF.fit(X,y)
In [89]:
RFmodel.score(X,y)
Out[89]:
0.97853711558602552
In [90]:
RFpredict=RFmodel.predict(X)
In [91]:
np.sqrt(np.mean((y-RFpredict)**2))
Out[91]:
0.86905517355764206
In [ ]: