In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
# NOTE(review): hardcoded absolute local path — breaks on any other machine;
# prefer a configurable data directory (e.g. pathlib.Path)
mtcars=pd.read_csv("/Users/Rajesh Prabhakar/Desktop/Datasets/mtcars.csv")
In [4]:
# The first commands to run on data import:
# dfname.head() First 5 rows of data
# dfname.tail() Last 5 rows of data
# dfname.shape First no of rows & Second no of columns
# dfname.dtypes Individual Data Type of each variable
In [5]:
mtcars.head() # first five rows of the dataframe
#head() is a predefined function; round parentheses are required to call it
Out[5]:
In [6]:
mtcars.tail() # last five rows of dataframe
Out[6]:
In [7]:
mtcars.shape # Number of rows and Number of columns
Out[7]:
In [8]:
mtcars.dtypes
Out[8]:
In [9]:
# The following Big Data Data Types:
# a) Numerical - Integer & Float
# b) String or Character or Text (" ")
# c) Boolean or Logical (True,False)
# d) Images - jpeg, png, tiff, satellite,
# e) Geo Spatial - latitude, longitudes, GPS, maps
# f) Internet of Things (Sensor based Data like Temperature)
# g) Social Media - Updates, Videos, Pictures, Tweets, Networks
# h) High Frequency Data - very fast updating stock prices, currency exchange rates
# i) Streaming Data - Tweets, Videos, Pics, Traffic
# j) Traffic Data - Web or Road Traffic
In [10]:
# BIG DATA ANALYTICS - DESCRIPTIVE ANALYTICS - UNIVARIATE STATISTICS
# MEASURES OF CENTRAL TENDENCY - MEAN, MEDIAN, MODE
# MEASURES OF DISPERSION - RANGE, VARIANCE, STANDARD DEVIATION, QUARTILES, PERCENTILES
# MEASURES OF ASYMMETRY - SKEWNESS & KURTOSIS
# MEASURES OF RELATIONSHIP - COVARIANCE & CORRELATION (BIVARIATE STATISTICS)
In [11]:
mtcars.describe().transpose()
Out[11]:
In [12]:
# Variance of each numeric column. numeric_only=True skips the string
# car-name column ('Unnamed: 0'); pandas >= 2.0 raises a TypeError on it otherwise.
mtcars.var(numeric_only=True) # Variance
Out[12]:
In [13]:
# Skewness of each numeric column; numeric_only=True skips the string
# car-name column (pandas >= 2.0 raises a TypeError without it).
mtcars.skew(numeric_only=True)
# Positive Skewness Correction - Logarithmic or Square root
# Negative Skewness Correction - Exponential or Power
Out[13]:
In [14]:
# Kurtosis of each numeric column; numeric_only=True skips the string
# car-name column (pandas >= 2.0 raises a TypeError without it).
mtcars.kurt(numeric_only=True) # kurtosis
Out[14]:
In [15]:
# Pairwise covariance of the numeric columns; numeric_only=True skips the
# string car-name column (required in pandas >= 2.0).
mtcars.cov(numeric_only=True) # Covariance
Out[15]:
In [16]:
# Pairwise correlation of the numeric columns; numeric_only=True skips the
# string car-name column (required in pandas >= 2.0).
mtcars.corr(numeric_only=True) # Correlation
Out[16]:
In [17]:
# DATA VISUALIZATIONS:UNIVARIATE
# LINE PLOTS
# COLUMN BARS - VERTICAL & HORIZONTAL, STACKED
# PIE CHARTS
# ADVANCED VISUALIZATIONS FOR DATA SCIENCE:
# HISTOGRAM - Detect SKEWNESS, KURTOSIS, MISSING VALUES & OUTLIERS
# BOXPLOT - Detect OUTLIERS, SKEWNESS, KURTOSIS, MISSING VALUES
# DENSITY CURVES - Detect SKEWNESS, KURTOSIS, MISSING VALUES & OUTLIERS
# THE ABOVE THREE GRAPHS ARE A MUST FOR THE TARGET VARIABLE IF IT IS NUMERICAL & CONTINUOUS
# OUTLIERS ARE EXTREME VALUES THAT FALL OUTSIDE THE NORMAL RANGE
# LOWER OUTLIERS - Q1 - 1.5*INTER QUARTILE RANGE (Q3-Q1)
# UPPER OUTLIERS - Q3 + 1.5*INTER QUARTILE RANGE (Q3-Q1)
In [18]:
mtcars.mpg.describe() # Descriptive Statistics of mileage per gallon variable
Out[18]:
In [19]:
mtcars[['mpg','hp','wt']].describe() # Descriptive statistics of 3 columns
# Double Square brackets for selection of multiple variables
Out[19]:
In [20]:
# Scatter plot of rear-axle ratio (drat) against mileage (mpg)
mtcars.plot(kind='scatter',x='mpg',y='drat')
# kind= 'hist' or 'line' or 'bar' or 'box' or 'density' or 'pie' or 'scatter'
Out[20]:
In [21]:
# Two stacked panels of the same variable: boxplot (top) and density (bottom)
plt.subplot(211) # 2 rows, 1 column, panel 1
mtcars.mpg.plot(kind='box',vert=False)
plt.subplot(212) # panel 2
mtcars.mpg.plot(kind='density')
Out[21]:
In [22]:
# DATA MANIPULATION
# DO DESCRIBE OF MPG COLUMN
# CREATE A NEW VARIABLE "MILEAGETYPE" BASED ON MPG - HIGHMILEAGE(>22.80),
# MEDIUMMILEAGE(>19.20 & <22.80), & LOWMILEAGE(<19.20)
In [23]:
mtcars.mpg.describe()
Out[23]:
In [24]:
# Derive a categorical "mileagetype" column from mpg using the cut-points
# taken from mtcars.mpg.describe() above (Q3 ~ 22.80, median ~ 19.20).
# Original row-wise list-append loop (whose indentation was lost in the
# export) replaced by a small helper applied with Series.map.
def mileage_category(mpg):
    """Bucket a single mpg value into High/Medium/Low mileage."""
    if mpg >= 22.80:
        return "Highmileage"
    if mpg >= 19.20:
        return "Mediummileage"
    return "Lowmileage"

mtcars['mileagetype'] = mtcars.mpg.map(mileage_category)
In [25]:
# Top-level pd.value_counts() is deprecated (removed in pandas 3.0);
# call the Series method directly instead.
mtcars.mileagetype.value_counts() # frequency counts of newly created variable
Out[25]:
In [26]:
# CREATE A NEW VARIABLE "SPEEDTYPE" BASED ON "HP" - HIGH, MEDIUM & LOW SPEED
In [27]:
# Derive a categorical "speedtype" column from horsepower (hp).
# Original row-wise list-append loop (indentation lost in the export)
# replaced by a small helper applied with Series.map.
def speed_category(hp):
    """Bucket a single hp value: >=180 high, >=123 medium, else low."""
    if hp >= 180:
        return "highspeed"
    if hp >= 123:
        return "mediumspeed"
    return "lowspeed"

mtcars['speedtype'] = mtcars.hp.map(speed_category)
In [28]:
# Top-level pd.value_counts() is deprecated; use the Series method.
mtcars.speedtype.value_counts()
Out[28]:
In [29]:
# Derive a categorical "weighttype" column from weight (wt, 1000 lbs).
# Original row-wise list-append loop (indentation lost in the export)
# replaced by a small helper applied with Series.map.
def weight_category(wt):
    """Bucket a single wt value: >=3.61 heavy, >=3.32 medium, else low."""
    if wt >= 3.61:
        return "heavywt"
    if wt >= 3.32:
        return "mediumwt"
    return "lowwt"

mtcars['weighttype'] = mtcars.wt.map(weight_category)
In [30]:
# Top-level pd.value_counts() is deprecated; use the Series method.
mtcars.weighttype.value_counts()
Out[30]:
In [31]:
# Top-level pd.value_counts() is deprecated; use the Series method.
mtcars.am.value_counts()
# 0 means Automatic
# 1 means Manual
Out[31]:
In [32]:
# groupby function is a pivot function
# Is the average mileage of Automatic & Manual cars the same?
In [33]:
# Average mpg per transmission type (am: 0 = automatic, 1 = manual)
mtcars.groupby('am')['mpg'].mean()
Out[33]:
In [34]:
# Variance of mpg within each transmission group
mtcars.groupby('am')['mpg'].var()
Out[34]:
In [35]:
# NULL - THERE IS NO SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE EQUAL
# ALTERNATE - THERE IS SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE NOT EQUAL
# IF p-value IS LESS THAN 0.05, REJECT NULL & ACCEPT THE ALTERNATE
# IF p-value IS GREATER THAN 0.05, FAIL TO REJECT NULL & REJECT THE ALTERNATE
# 0.05 MEANS 95% CONFIDENCE LEVEL OR 5% ALPHA OR ERROR
In [36]:
automatic=mtcars[mtcars.am==0]
manual=mtcars[mtcars.am==1]
# Subsetting or splitting the DataFrame into 2 dataframes
# First one is only automatic cars and second only manual cars
In [37]:
from scipy import stats as st
In [38]:
# Scipy Stats package is for Statistical Modelling in Python
In [39]:
# Welch's two-sample t-test; equal_var=False means equal group variances are not assumed
st.ttest_ind(automatic.mpg,manual.mpg,equal_var=False)
# Since p-value is less than 0.05, REJECT NULL
Out[39]:
In [40]:
# Is the Average weight of Automatic & manual car is equal
# groupby wt with am - mean
# groupby wt with am - variance
# frame null & alternate hypothesis
# conduct the relevant hypothesis test
In [41]:
# Average weight per transmission type (am: 0 = automatic, 1 = manual)
mtcars.groupby('am')['wt'].mean()
Out[41]:
In [42]:
# Variance of weight within each transmission group
mtcars.groupby('am')['wt'].var()
Out[42]:
In [43]:
# NULL - THERE IS NO SIGNIFICANT DIFFERENCE IN AVERAGE weight OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE EQUAL
# ALTERNATE - THERE IS SIGNIFICANT DIFFERENCE IN AVERAGE weight OF AUTOMATIC &
# MANUAL CARS - BOTH MEANS ARE NOT EQUAL
In [44]:
# Welch's two-sample t-test on weight (unequal group variances assumed)
st.ttest_ind(automatic.wt,manual.wt,equal_var=False)
# since p-value is less than 0.05 Reject Null Hypothesis
Out[44]:
In [45]:
# ONLY TWO GROUPS - 2 SAMPLE INDEPENDENT TTEST
# MORE THAN 2 GROUPS - ANOVA SINGLE FACTOR
# Is the average mpg of heavyweight, mediumweight & lowweight cars equal?
# NULL - THERE IS NO SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF HEAVY, MEDIUM &
# LOW WEIGHT CARS - ALL MEANS ARE EQUAL
# ALTERNATE - THERE IS SIGNIFICANT DIFFERENCE IN AVERAGE MPG OF HEAVY, MEDIUM &
# LOW WEIGHT CARS - ALL MEANS ARE NOT EQUAL
In [46]:
mtcars.mpg.groupby(mtcars.weighttype).mean()
Out[46]:
In [47]:
# One subset per weight bucket, as inputs for the one-way ANOVA below
heavywt = mtcars.loc[mtcars.weighttype == 'heavywt']
medwt = mtcars.loc[mtcars.weighttype == 'mediumwt']
lowwt = mtcars.loc[mtcars.weighttype == 'lowwt']
In [48]:
# One-way ANOVA: are the mean mpg values of the three weight groups equal?
st.f_oneway(heavywt.mpg,medwt.mpg,lowwt.mpg)
Out[48]:
In [49]:
# Is the Average MPG of ALL TYPES of GEARS CARS is Equal
In [50]:
mtcars.mpg.groupby(mtcars.gear).mean()
Out[50]:
In [51]:
# One subset per gear count (3, 4, 5) for the one-way ANOVA below
g3 = mtcars.loc[mtcars.gear == 3]
g4 = mtcars.loc[mtcars.gear == 4]
g5 = mtcars.loc[mtcars.gear == 5]
In [52]:
st.f_oneway(g3.mpg,g4.mpg,g5.mpg)
# Since p-value less than 0.05 Reject Null
Out[52]:
In [53]:
# BOTH VARIABLES ARE NON NUMERIC OR CATEGORICAL - CHI SQUARE TEST OF INDEPENDENCE
# INPUT OF CHI SQUARE TEST IS CROSS TABULATION (pd.crosstab())
In [54]:
pd.crosstab(mtcars.mileagetype,mtcars.weighttype)
Out[54]:
In [55]:
st.chi2_contingency(pd.crosstab(mtcars.mileagetype,mtcars.weighttype))
# NULL - There is no relationship or association between both variables
# Alternate - There is relationship or association between both variables
# Second item in output is p-value
# Since p-value is less than 0.05 Reject Null
Out[55]:
In [56]:
# Is there relationship or association between am & cyl?
In [57]:
pd.crosstab(mtcars.am,mtcars.cyl)
Out[57]:
In [58]:
st.chi2_contingency(pd.crosstab(mtcars.am,mtcars.cyl))
Out[58]:
In [59]:
mtcars.dtypes
Out[59]:
In [60]:
objectcols=mtcars.select_dtypes(include=['object'])
# Select only Object or Character Columns
In [61]:
numbercols=mtcars.select_dtypes(include=['number'])
# Select only Numerical Columns both float and int
In [62]:
from sklearn.preprocessing import LabelEncoder
In [63]:
le=LabelEncoder() # LabelEncoder maps each category to an integer code
# NOTE(review): this is label (integer) encoding, not one-hot dummy encoding
In [64]:
dummyobjectcols=objectcols.apply(le.fit_transform) # fit_transform is refit independently on each column
In [65]:
dummyobjectcols1=dummyobjectcols.drop('Unnamed: 0',axis=1) # drop the encoded car-name column (an identifier, not a feature)
In [66]:
mtcarsdf=pd.concat([numbercols,dummyobjectcols1],axis=1) # side by side merging
In [67]:
mtcarsdf.head()
Out[67]:
In [68]:
y=mtcarsdf.mpg # target variable
X=mtcarsdf.drop('mpg',axis=1) # all remaining columns as predictors
# y is the Dependent Variable & X is Independent Variables in one matrix
In [69]:
from sklearn.linear_model import LinearRegression
In [70]:
LinReg=LinearRegression() # Assigning function to short name
In [71]:
LinRegmodel=LinReg.fit(X,y) # Fitting the model
In [72]:
LinRegmodel.score(X,y) # Checking the R Square
Out[72]:
In [73]:
LinRegmodel.intercept_ # check intercept
Out[73]:
In [74]:
print(list(zip(X,LinRegmodel.coef_))) # check coefficients; iterating X yields its column names
In [75]:
# mpg=23.3943+0.1164*cyl+0.0153*disp-0.024*hp-0.1415*drat-4.7944*wt+0.623*qsec...
In [76]:
mpgpredicted=LinRegmodel.predict(X) # predicted mpg for all observations
In [77]:
np.sqrt(np.mean((y-mpgpredicted)**2)) # Root Mean Square Error
Out[77]:
In [78]:
resid=y-mpgpredicted # Residuals
In [79]:
np.sqrt(np.mean(resid**2)) # Root Mean Square Error Formula
Out[79]:
In [80]:
from sklearn.tree import DecisionTreeRegressor
In [81]:
DecTree=DecisionTreeRegressor()
In [82]:
DecTreemodel=DecTree.fit(X,y)
In [83]:
DecTreemodel.score(X,y)
Out[83]:
In [84]:
DecTreepredict=DecTreemodel.predict(X)
In [85]:
np.sqrt(np.mean((y-DecTreepredict)**2))
Out[85]:
In [86]:
from sklearn.ensemble import RandomForestRegressor
In [87]:
RF=RandomForestRegressor(n_estimators=1000) # n_estimators=num of trees
# NOTE(review): no random_state set, so results vary between runs — consider fixing a seed for reproducibility
In [88]:
RFmodel=RF.fit(X,y)
In [89]:
RFmodel.score(X,y)
Out[89]:
In [90]:
RFpredict=RFmodel.predict(X)
In [91]:
np.sqrt(np.mean((y-RFpredict)**2))
Out[91]:
In [ ]:
No comments:
Post a Comment