Sunday, October 14, 2018

Apache Spark PySpark Case Study - Used Car Sales Regression Techniques

UsedCarSalesRegressionPySpark
In [67]:
sc
Out[67]:

SparkContext (Spark UI)
Version: v2.3.1
Master: local[*]
AppName: PySparkShell
In [68]:
usedcar=spark.read.csv("data/usedcarsales.csv",header=True,inferSchema=True)
In [69]:
# NOTES FOR IMPORT
# UPLOAD THE DATA VIA THE MyData TAB (TOP RIGHT CORNER) FIRST; ONCE UPLOADED, THE FILE APPEARS IN THE DATA FOLDER
# RIGHT-CLICK THE FILE, PICK "COPY PATH" (SECOND OPTION FROM THE BOTTOM), AND PASTE IT IN DOUBLE QUOTES
# DO NOT FORGET TO SET THE header & inferSchema ARGUMENTS TO True, ELSE EVERY COLUMN WILL BE IMPORTED AS A STRING
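If inferSchema proves slow on bigger files, the schema can also be declared up front. This is a hedged sketch only (column names and types taken from the printSchema output below), not part of the original run:

from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Sketch: an explicit schema avoids the extra pass over the data that inferSchema makes
schema = StructType([
    StructField("_c0", IntegerType(), True),
    StructField("Price", IntegerType(), True),
    StructField("Age", IntegerType(), True),
    StructField("KM", IntegerType(), True),
    StructField("FuelType", StringType(), True),
    StructField("HP", IntegerType(), True),
    StructField("MetColor", IntegerType(), True),
    StructField("Automatic", IntegerType(), True),
    StructField("CC", IntegerType(), True),
    StructField("Doors", IntegerType(), True),
    StructField("Weight", IntegerType(), True),
    StructField("AutoType", StringType(), True),
    StructField("MetColorType", StringType(), True),
])
usedcar = spark.read.csv("data/usedcarsales.csv", header=True, schema=schema)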
In [70]:
usedcar.printSchema()  # FOR CHECKING THE INDIVIDUAL DATA TYPES. A STANDARD COMMAND TO RUN ON ANY DATA
root
 |-- _c0: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- KM: integer (nullable = true)
 |-- FuelType: string (nullable = true)
 |-- HP: integer (nullable = true)
 |-- MetColor: integer (nullable = true)
 |-- Automatic: integer (nullable = true)
 |-- CC: integer (nullable = true)
 |-- Doors: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- AutoType: string (nullable = true)
 |-- MetColorType: string (nullable = true)

In [71]:
usedcar.describe().toPandas().transpose()
# BASIC DESCRIPTIVE STATISTICS - COUNT, MEAN, STANDARD DEVIATION, MIN & MAX
Out[71]:
              count  mean                  stddev                min       max
_c0           1436   718.5                 414.6818057257878     1         1436
Price         1436   10730.824512534818    3626.9645849102403    4350      32500
Age           1436   55.94707520891365     18.599988342806252    1         80
KM            1436   68533.25974930362     37506.448872189554    1         243000
FuelType      1436   None                  None                  CNG       Petrol
HP            1436   101.50208913649026    14.981079675567686    69        192
MetColor      1436   0.674791086350975     0.46861604925657424   0         1
Automatic     1436   0.055710306406685235  0.22944133861584212   0         1
CC            1436   1566.8279944289693    187.1824363291632     1300      2000
Doors         1436   4.0334261838440115    0.9526766046325884    2         5
Weight        1436   1072.4596100278552    52.64112048693142     1000      1615
AutoType      1436   None                  None                  Auto      Manual
MetColorType  1436   None                  None                  Metcolor  NonMetcolor
In [72]:
# THE COMMANDS BELOW SPLIT THE COLUMN NAMES INTO TWO LISTS, NUMERIC & STRING
# THIS IS DONE SO CLEANING AND PREPROCESSING (E.G. MISSING VALUES) CAN BE HANDLED PER TYPE
# NUMERIC MISSING VALUES - IMPUTE WITH MEAN OR MEDIAN, OR CAP AT MIN OR MAX
# STRING OR CATEGORICAL MISSING VALUES - IMPUTE WITH THE MODE (MOST FREQUENT LABEL); SEE THE SKETCH BELOW
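This particular dataset has no missing values (every count in describe() is 1436), so nothing is imputed here. As a hedged sketch only, the strategies above map to spark.ml like this (Imputer is available from Spark 2.2; the output column names are made up for illustration):

from pyspark.ml.feature import Imputer
# Sketch: mean-impute numeric columns (strategy="median" is the other option)
imputer = Imputer(inputCols=["KM", "HP"], outputCols=["KM_imp", "HP_imp"], strategy="mean")
usedcar_imp = imputer.fit(usedcar).transform(usedcar)

# Sketch: mode-impute a categorical column by filling with its most frequent label
mode_fuel = usedcar.groupBy("FuelType").count().orderBy("count", ascending=False).first()[0]
usedcar_imp = usedcar_imp.fillna({"FuelType": mode_fuel})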
In [73]:
integercols=[item[0] for item in usedcar.dtypes if item[1]=='int'] # for integer - "int" not "integer"
In [74]:
stringcols=[item[0] for item in usedcar.dtypes if item[1]=='string']
In [75]:
# StringIndexer encodes a string column of labels to a column of label indices.
# The indices are in [0, numLabels), ordered by label frequencies,
# so the most frequent label gets index 0.
# THE STRINGINDEXER BELOW IS A STANDARD INDEXER AND CAN BE APPLIED TO ANY DATA VIA A LIST COMPREHENSION
# DO THE STRING INDEXING ONLY ONCE, AND NEVER STRING-INDEX NUMERICAL DATA
# SPLIT OUT THE STRING COLUMNS FIRST AND RUN THE INDEXER ONLY ON THEM
In [76]:
from pyspark.ml.feature import StringIndexer 
In [77]:
stringindex=[StringIndexer(inputCol=c,outputCol="stringindex_"+c) 
             for c in stringcols] # stringcols=['FuelType','AutoType','MetColorType']
In [78]:
# THE STRINGINDEXER ABOVE RUNS ON THE THREE STRING COLS AND CREATES A NEW COLUMN OF LABEL INDICES FOR EACH
# (THESE ARE INDICES, NOT DUMMY VARIABLES - ONE-HOT DUMMIES WOULD NEED OneHotEncoder)
# THE NEW COLUMNS ARE NAMED stringindex_ + THE RELEVANT COLUMN NAME
# THE ORIGINAL STRING COLUMNS ARE RETAINED AS-IS & THE NEW COLUMNS ARE APPENDED AT THE END
In [79]:
from pyspark.ml import Pipeline
In [80]:
# A Pipeline is specified as a sequence of stages, and each stage is either a Transformer or an Estimator. 
# These stages are run in order, and the input DataFrame is transformed as it passes through each stage.
# For Transformer stages, the transform() method is called on the DataFrame. 
# For Estimator stages, the fit() method is called to produce a Transformer (which becomes part of the 
# PipelineModel, or fitted Pipeline), 
# and that Transformer’s transform() method is called on the DataFrame.
In [81]:
pipeline=Pipeline(stages=stringindex)
In [82]:
pipelinemodel=pipeline.fit(usedcar)
In [83]:
usedcar1=pipelinemodel.transform(usedcar)
In [84]:
# CREATE THE INDEXERS, WRAP THEM IN A PIPELINE WITH A SHORT NAME, THEN FIT THE PIPELINE ON THE DATAFRAME
# AFTER fit(), transform() MUST BE CALLED - fit() ONLY PRODUCES THE PipelineModel, IT DOES NOT ADD THE NEW COLUMNS
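To verify the frequency ordering (index 0.0 on the most frequent label), a quick hedged check on the transformed frame:

# Sketch: each distinct label maps to exactly one index, most frequent first
usedcar1.groupBy("FuelType", "stringindex_FuelType").count().orderBy("stringindex_FuelType").show()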
In [85]:
usedcar1.printSchema() # OBSERVE THE LAST 3 COLUMNS SPECIFICALLY WITH STRINGINDEX PREFIX
root
 |-- _c0: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- KM: integer (nullable = true)
 |-- FuelType: string (nullable = true)
 |-- HP: integer (nullable = true)
 |-- MetColor: integer (nullable = true)
 |-- Automatic: integer (nullable = true)
 |-- CC: integer (nullable = true)
 |-- Doors: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- AutoType: string (nullable = true)
 |-- MetColorType: string (nullable = true)
 |-- stringindex_FuelType: double (nullable = false)
 |-- stringindex_AutoType: double (nullable = false)
 |-- stringindex_MetColorType: double (nullable = false)

In [86]:
doublecols=[item[0] for item in usedcar1.dtypes if item[1]=='double']
integercols=[item[0] for item in usedcar1.dtypes if item[1]=='int']
In [87]:
finalcols=integercols+doublecols # THIS IS A LIST, NOT A DATA FRAME
In [88]:
usedcar2=usedcar1.select(finalcols) # THIS STEP CREATES THE DATAFRAME
In [89]:
# THE TWO STEPS ABOVE RESELECT THE REQUIRED COLUMNS AND CREATE THE NEW DATA FRAME.
In [90]:
usedcar2.printSchema()
root
 |-- _c0: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- KM: integer (nullable = true)
 |-- HP: integer (nullable = true)
 |-- MetColor: integer (nullable = true)
 |-- Automatic: integer (nullable = true)
 |-- CC: integer (nullable = true)
 |-- Doors: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- stringindex_FuelType: double (nullable = false)
 |-- stringindex_AutoType: double (nullable = false)
 |-- stringindex_MetColorType: double (nullable = false)

In [91]:
from pyspark.ml.linalg import DenseVector
In [92]:
# A dense vector is backed by a double array representing its entry values, 
# while a sparse vector is backed by two parallel arrays: indices and values. 
# For example, a vector (1.0, 0.0, 3.0) can be represented in dense format as [1.0, 0.0, 3.0]
# in sparse format as (3, [0, 2], [1.0, 3.0]), where 3 is the size of the vector.

# TO RUN THE MACHINE LEARNING ALGORITHMS, ALL PREDICTORS MUST BE ASSEMBLED INTO A SINGLE VECTOR COLUMN
# BY DEFAULT spark.ml EXPECTS THE COLUMNS features (INDEPENDENT VARIABLES) & label (DEPENDENT VARIABLE);
# BOTH NAMES CAN BE OVERRIDDEN VIA featuresCol / labelCol
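As a hedged illustration of the two formats from the example above (pyspark.ml.linalg.Vectors is the standard constructor):

from pyspark.ml.linalg import Vectors
# Dense stores every entry; sparse stores (size, indices, values) for the non-zeros
dense = Vectors.dense([1.0, 0.0, 3.0])
sparse = Vectors.sparse(3, [0, 2], [1.0, 3.0])
print(dense)   # [1.0,0.0,3.0]
print(sparse)  # (3,[0,2],[1.0,3.0])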
In [93]:
usedcardf=usedcar2.rdd.map(lambda x:(DenseVector(x[2:11]),x[1])).toDF(['features','label'])
# DROPPING COLUMN 0 (THE SERIAL-NUMBER COLUMN _c0), TAKING x[1] (Price) AS THE LABEL,
# AND DROPPING THE LAST TWO COLUMNS stringindex_AutoType & stringindex_MetColorType
# AS THEY DUPLICATE THE EXISTING Automatic AND MetColor COLUMNS
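The same features/label frame can be built without dropping to the RDD API. A hedged sketch with VectorAssembler, the more common spark.ml route (usedcardf_alt is an illustrative name):

from pyspark.ml.feature import VectorAssembler
# Sketch: assemble the nine predictors into a single 'features' vector column
featcols = ['Age', 'KM', 'HP', 'MetColor', 'Automatic', 'CC', 'Doors', 'Weight', 'stringindex_FuelType']
assembler = VectorAssembler(inputCols=featcols, outputCol='features')
usedcardf_alt = assembler.transform(usedcar2).withColumnRenamed('Price', 'label').select('features', 'label')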
In [94]:
usedcardf.show(5)
+--------------------+-----+
|            features|label|
+--------------------+-----+
|[23.0,46986.0,90....|13500|
|[23.0,72937.0,90....|13750|
|[24.0,41711.0,90....|13950|
|[26.0,48000.0,90....|14950|
|[30.0,38500.0,90....|13750|
+--------------------+-----+
only showing top 5 rows

In [95]:
from pyspark.ml.regression import LinearRegression
In [96]:
LinReg=LinearRegression(featuresCol='features',labelCol='label')
In [97]:
LinRegmodel=LinReg.fit(usedcardf)
In [98]:
LinRegmodel.intercept
Out[98]:
-6635.867399994437
In [99]:
print("Coefficients: "+ str(LinRegmodel.coefficients))
Coefficients: [-122.91984669359219,-0.016023361057631247,29.859896169290355,38.513561851052636,179.38304407378993,-1.3684020490676259,-65.39933191930807,23.051496825370904,-296.0000936943593]
In [100]:
LinRegmodel.summary.r2
Out[100]:
0.8657147053312059
In [101]:
LinRegmodel.summary.rootMeanSquaredError
Out[101]:
1328.6367897244356
In [102]:
LinRegmodel.summary.pValues
Out[102]:
[0.0,
 0.0,
 0.0,
 0.6118420013048862,
 0.2543674984358093,
 1.5863129689153155e-05,
 0.09815739330353734,
 0.0,
 0.06941297616856668,
 4.5791082037283104e-10]
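The fitted model can also score rows the same way the tree models below do. A hedged sketch (LinRegpredict is an illustrative name):

# Sketch: transform() appends a 'prediction' column alongside the label
LinRegpredict = LinRegmodel.transform(usedcardf)
LinRegpredict.select('label', 'prediction').show(5)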
In [103]:
from pyspark.ml.regression import DecisionTreeRegressor
In [104]:
DecTree=DecisionTreeRegressor(featuresCol='features',labelCol='label')
In [105]:
DecTreemodel=DecTree.fit(usedcardf)
In [106]:
DecTreemodelpredict=DecTreemodel.transform(usedcardf)
In [107]:
from pyspark.ml.evaluation import RegressionEvaluator
In [108]:
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
In [109]:
rmse = evaluator.evaluate(DecTreemodelpredict)
print("Root Mean Squared Error (RMSE) on usedcardf data = %g" % rmse)
Root Mean Squared Error (RMSE) on usedcardf data = 1118.17
In [110]:
from pyspark.ml.regression import RandomForestRegressor
In [111]:
# Random forests are ensembles of decision trees. 
# Random forests combine many decision trees in order to reduce the risk of overfitting. 
# The spark.ml implementation supports random forests for binary and multiclass classification 
# and for regression, using both continuous and categorical features.
In [112]:
RF=RandomForestRegressor(featuresCol='features',labelCol='label',numTrees=1000)
In [113]:
Rfmodel=RF.fit(usedcardf)
In [114]:
Rfmodelpredict=Rfmodel.transform(usedcardf)
In [115]:
Rfrmse = evaluator.evaluate(Rfmodelpredict)
print("Root Mean Squared Error (RMSE) on usedcardf data = %g" % Rfrmse)
Root Mean Squared Error (RMSE) on usedcardf data = 1145.37
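Random forests also report how much each feature contributed to the splits. A hedged sketch (indices follow the feature-vector order: Age, KM, HP, MetColor, Automatic, CC, Doors, Weight, stringindex_FuelType):

# Sketch: featureImportances is a vector indexed by position in 'features'
print(Rfmodel.featureImportances)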
In [116]:
from pyspark.ml.regression import GBTRegressor
In [117]:
# Gradient-Boosted Trees (GBTs) are ensembles of decision trees. 
# GBTs iteratively train decision trees in order to minimize a loss function. 
# The spark.ml implementation supports GBTs for binary classification and for regression, 
# using both continuous and categorical features.
In [118]:
GBTree=GBTRegressor(featuresCol='features',labelCol='label')
In [119]:
GBTreemodel=GBTree.fit(usedcardf)
In [120]:
GBTreemodelpredict=GBTreemodel.transform(usedcardf)
In [121]:
GBTrmse = evaluator.evaluate(GBTreemodelpredict)
print("Root Mean Squared Error (RMSE) on usedcardf data = %g" % GBTrmse)
Root Mean Squared Error (RMSE) on usedcardf data = 878.067
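Note that all three RMSEs above are computed on the same rows the models were fit on, which flatters flexible learners like GBTs. A hedged sketch of a holdout comparison (split weights and seed are arbitrary):

# Sketch: hold out 30% of the rows, fit on the rest, evaluate on unseen data
train, test = usedcardf.randomSplit([0.7, 0.3], seed=42)
GBTholdout = GBTree.fit(train)
print(evaluator.evaluate(GBTholdout.transform(test)))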