Regression: Pyspark - SQL
Regression: Pyspark - SQL
Regression: Pyspark - SQL
[3]: df = spark.read.format('csv').options(header='true',inferschema='true').
,→load("data.csv",header=True);
[5]: # Compute summary statistics for df and convert them to a pandas DataFrame.
df.describe().toPandas()
Sales
0 200
1 14.022500000000003
2 5.217456565710477
3 1.6
4 27.0
[6]: # Print the schema of df: column names, types and nullability.
df.printSchema()
root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)
[7]: # Show the first 5 rows of df.
df.show(5)
+-----+-----+---------+-----+
| TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8| 69.2| 22.1|
| 44.5| 39.3| 45.1| 10.4|
| 17.2| 45.9| 69.3| 9.3|
|151.5| 41.3| 58.5| 18.5|
|180.8| 10.8| 58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows
[Stage 20:> (0 + 1) / 1]
+-----------------+-----+
| features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]| 9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows
2
# Fit a VectorIndexer on the assembled feature vectors. maxCategories=4 is the
# threshold VectorIndexer uses to decide which features are categorical:
# features with more than 4 distinct values are left as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(transformed)

# Add the indexedFeatures column and preview the first 5 rows
# (second argument: truncate long cell values).
data = featureIndexer.transform(transformed)
data.show(5, True)
+-----------------+-----+-----------------+
| features|label| indexedFeatures|
+-----------------+-----+-----------------+
|[230.1,37.8,69.2]| 22.1|[230.1,37.8,69.2]|
| [44.5,39.3,45.1]| 10.4| [44.5,39.3,45.1]|
| [17.2,45.9,69.3]| 9.3| [17.2,45.9,69.3]|
|[151.5,41.3,58.5]| 18.5|[151.5,41.3,58.5]|
|[180.8,10.8,58.4]| 12.9|[180.8,10.8,58.4]|
+-----------------+-----+-----------------+
only showing top 5 rows
[35]: # Split the data into training and test sets (20% held out for testing)
# NOTE(review): no seed is passed to randomSplit, so the split (and every
# metric computed from it) will differ between runs.
(trainingData, testData) = transformed.randomSplit([0.8, 0.2])
# Fit the pipeline on the training split only.
model = pipeline.fit(trainingData)
3
for i in range(len(Summary.pValues)):
print ("##",'{:10.6f}'.format(coef[i]),\
'{:10.6f}'.format(Summary.coefficientStandardErrors[i]),\
'{:8.3f}'.format(Summary.tValues[i]),\
'{:10.6f}'.format(Summary.pValues[i]))
print ("##",'---')
print ("##","Mean squared error: % .6f" \
% Summary.meanSquaredError, ", RMSE: % .6f" \
% Summary.rootMeanSquaredError )
print ("##","Multiple R-squared: %f" % Summary.r2, ", \
Total iterations: %i"% Summary.totalIterations)
[40]: # Print the summary for the last stage of the fitted pipeline model.
modelsummary(model.stages[-1])
# Score the predictions with the evaluator. The evaluator is configured outside
# this excerpt; the message below labels its result as RMSE.
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
[Stage 30:> (0 + 1) / 1]
Root Mean Squared Error (RMSE) on test data = 1.66064
import sklearn.metrics

# Coefficient of determination (R^2) of the predictions against the true
# labels. y_true and y_pred are built outside this excerpt.
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))
r2_score: 0.8900904334948799
[ ]: