Regression: Pyspark - SQL

December 27, 2021

# In[2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook. It is bound to
# `spark` because the data-loading cell reads through `spark.read`; binding
# it only to another name leaves that cell with a NameError.
spark = (
    SparkSession.builder
    .appName("Python Spark regression example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
sp = spark  # keep the original short name working

[3]: df = spark.read.format('csv').options(header='true',inferschema='true').
# NOTE(review): the statement above is cut off in this export -- it ends on a
# dangling '.', so the call that names the CSV file (presumably `.load(<path>)`)
# is missing. Restore it from the original notebook before running.

[4]: import pandas as pd

pd.DataFrame(df.take(3), columns=df.columns)

[4]: TV Radio Newspaper Sales

0 230.1 37.8 69.2 22.1
1 44.5 39.3 45.1 10.4
2 17.2 45.9 69.3 9.3

[5]: df.describe().toPandas()

[5]: summary TV Radio Newspaper \

0 count 200 200 200
1 mean 147.0425 23.264000000000024 30.553999999999995
2 stddev 85.85423631490805 14.846809176168728 21.77862083852283
3 min 0.7 0.0 0.3
4 max 296.4 49.6 114.0

0 200
1 14.022500000000003
2 5.217456565710477
3 1.6
4 27.0

[6]: df.printSchema()


|-- TV: double (nullable = true)
|-- Radio: double (nullable = true)
|-- Newspaper: double (nullable = true)
|-- Sales: double (nullable = true)

[7]: df.show(5)

| TV|Radio|Newspaper|Sales|
|230.1| 37.8| 69.2| 22.1|
| 44.5| 39.3| 45.1| 10.4|
| 17.2| 45.9| 69.3| 9.3|
|151.5| 41.3| 58.5| 18.5|
|180.8| 10.8| 58.4| 12.9|
only showing top 5 rows

# In[32]:
# NOTE(review): transData is not defined in the visible cells (the prompts
# jump from [7] to [32]). Judging by the table printed after this cell, it
# presumably packs TV/Radio/Newspaper into a `features` vector and exposes
# Sales as `label` -- confirm against the original notebook.
transformed = transData(df)

# Display the first rows; this is the call that prints the table below.
transformed.show(5)


| features|label|
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]| 9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
only showing top 5 rows

# In[34]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated
# as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(transformed)

# Every column here has far more than 4 distinct values, so the indexed
# vector is expected to equal the raw feature vector.
data = featureIndexer.transform(transformed)
data.show(5)

| features|label| indexedFeatures|
|[230.1,37.8,69.2]| 22.1|[230.1,37.8,69.2]|
| [44.5,39.3,45.1]| 10.4| [44.5,39.3,45.1]|
| [17.2,45.9,69.3]| 9.3| [17.2,45.9,69.3]|
|[151.5,41.3,58.5]| 18.5|[151.5,41.3,58.5]|
|[180.8,10.8,58.4]| 12.9|[180.8,10.8,58.4]|
only showing top 5 rows

[35]: # Split the data into training and test sets (20% held out for testing)
# NOTE: no seed is passed to randomSplit, so the split -- and every metric
# computed from it below -- can differ from run to run.
(trainingData, testData) = transformed.randomSplit([0.8, 0.2])

[36]: # Import LinearRegression class

from pyspark.ml.regression import LinearRegression

# Define LinearRegression algorithm

lr = LinearRegression()

[38]: import warnings

# Chain the feature indexer and the linear regression estimator in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, lr])

# Fit the pipeline on the training split only; testData stays unseen.
model = pipeline.fit(trainingData)

21/12/14 14:37:13 WARN Instrumentation: [8d638690] regParam is zero, which might

cause numerical instability and overfitting.

# In[39]:
def modelsummary(model):
    """Print an R-style coefficient table and fit statistics for a model.

    Args:
        model: A fitted regression model exposing ``coefficients``,
            ``intercept`` and a training ``summary`` that provides
            ``coefficientStandardErrors``, ``tValues``, ``pValues``,
            ``meanSquaredError``, ``rootMeanSquaredError``, ``r2`` and
            ``totalIterations``.

    Returns:
        None. The table is written to standard output, one row per
        coefficient statistic, with the intercept as the last row.
    """
    summary = model.summary
    # The intercept is appended after the coefficients so that it lines up
    # with the last entry of the per-coefficient statistics.
    estimates = [*model.coefficients, model.intercept]

    print("Note: the last rows are the information for Intercept")
    print("##", "-------------------------------------------------")
    print("##", " Estimate | Std.Error | t Values | P-value")

    rows = zip(
        estimates,
        summary.coefficientStandardErrors,
        summary.tValues,
        summary.pValues,
    )
    for estimate, std_error, t_value, p_value in rows:
        print(
            "##",
            "{:10.6f}".format(estimate),
            "{:10.6f}".format(std_error),
            "{:8.3f}".format(t_value),
            "{:10.6f}".format(p_value),
        )

    print("##", "---")
    print(
        "##",
        "Mean squared error: % .6f" % summary.meanSquaredError,
        ", RMSE: % .6f" % summary.rootMeanSquaredError,
    )
    print(
        "##",
        "Multiple R-squared: %f" % summary.r2,
        ", Total iterations: %i" % summary.totalIterations,
    )

[40]: modelsummary(model.stages[-1])

Note: the last rows are the information for Intercept

## -------------------------------------------------
## Estimate | Std.Error | t Values | P-value
## 0.044758 0.001555 28.783 0.000000
## 0.186763 0.009541 19.575 0.000000
## 0.006556 0.007003 0.936 0.350575
## 2.921133 0.343975 8.492 0.000000
## ---
## Mean squared error: 2.828389 , RMSE: 1.681782
## Multiple R-squared: 0.897012 , Total iterations: 0

[41]: # Make predictions.

predictions = model.transform(testData)

# In[42]:
from pyspark.ml.evaluation import RegressionEvaluator

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.66064

# In[43]:
import sklearn.metrics

# Collect label and prediction in a single Spark action: the pipeline is
# evaluated once instead of twice, and each label is guaranteed to stay on
# the same row as its prediction.
scored = predictions.select("label", "prediction").toPandas()
y_true = scored[["label"]]
y_pred = scored[["prediction"]]

r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))

r2_score: 0.8900904334948799

