PySpark Code
from pyspark import SparkConf, SparkContext

# Create (or reuse) a SparkContext with the given configuration
conf = SparkConf().setAppName("Read File")
sc = SparkContext.getOrCreate(conf=conf)
rdd = sc.parallelize((1, 4, 7, 10))
# or
listv = [1, 4, 7, 10]
rdd = sc.parallelize(listv)
# Read a text file into an RDD (one element per line)
rdd = sc.textFile("/FileStore/tables/numbers.txt")
rdd.count()
rdd.filter(function)   # function returns True for elements to keep, e.g. rdd.filter(lambda x: x == 'apple')
rdd.distinct()
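A minimal sketch tying these basics together (the sample fruits data is made up for illustration):
fruits = sc.parallelize(["apple", "banana", "apple", "cherry"])   # hypothetical sample data
only_apples = fruits.filter(lambda x: x == "apple")               # keep matching elements
print(only_apples.count())                                        # -> 2
print(fruits.distinct().collect())                                # -> ['apple', 'banana', 'cherry'] (order may vary)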
WORD COUNT
flat = rdd.flatMap(lambda x: x.split(" "))   # split each line into words
maprdd = flat.map(lambda x: (x, 1))          # convert each word (single column) into a (key, value) pair
maprdd.groupByKey().mapValues(list).collect()   # word -> list of 1s
maprdd.groupByKey().mapValues(len).collect()    # word -> count
maprdd.reduceByKey(lambda x, y: x + y).collect()   # here x and y are values of the same key
flat.countByValue()   # shortcut action: returns a dict of word -> count
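Putting the whole word-count pipeline together (the file path reuses the one above; the output shown is only illustrative):
lines = sc.textFile("/FileStore/tables/numbers.txt")
words = lines.flatMap(lambda line: line.split(" "))
counts = words.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)
print(counts.collect())   # e.g. [('spark', 3), ('rdd', 2), ...]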
rdd.saveAsTextFile('/FileStore/22march/')   # writes one part file per partition
rdd.getNumPartitions() --> 3
rdd1 = rdd.repartition(5)   # increase or decrease the number of partitions (triggers a full shuffle)
rdd1.getNumPartitions() --> 5
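A quick sketch of partition counts; coalesce() is not covered above and is shown here only as the shuffle-free way to reduce partitions:
nums = sc.parallelize(range(12), 3)              # explicitly request 3 partitions
print(nums.getNumPartitions())                   # -> 3
print(nums.repartition(5).getNumPartitions())    # -> 5 (full shuffle)
print(nums.coalesce(1).getNumPartitions())       # -> 1 (no shuffle when only decreasing)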
Average(movie ratings)
rdd1 = rdd.map(lambda x: x.split(','))                             # each line is assumed to be "movie_id,rating"
rdd2 = rdd1.map(lambda x: (x[0], (int(x[1]), 1)))                  # (k, (v1, v2)) = (movie, (rating, 1))
rdd3 = rdd2.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))   # x[0], y[0] are v1 (ratings); x[1], y[1] are v2 (counts)
rdd4 = rdd3.map(lambda x: (x[0], x[1][0] / x[1][1]))               # x[1][0] = sum of ratings, x[1][1] = number of ratings
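The same average pattern as a self-contained sketch (the three "movie_id,rating" lines are made up):
sample = sc.parallelize(["m1,4", "m1,5", "m2,3"])   # hypothetical ratings data
pairs = sample.map(lambda line: line.split(',')).map(lambda x: (x[0], (int(x[1]), 1)))
sums = pairs.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
avgs = sums.map(lambda kv: (kv[0], kv[1][0] / kv[1][1]))
print(avgs.collect())   # -> [('m1', 4.5), ('m2', 3.0)] (order may vary)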
DataFrame
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Spark DataFrames").getOrCreate()
df = spark.read.option("header",True).csv('/FileStore/tables/StudentData.csv')
df.show()
df.count()
df.printSchema()
df = spark.read.options(inferSchema='True', header='True').csv("/FileStore/tables/StudentData.csv")
CUSTOM SCHEMA
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

custom_schema = StructType([
    StructField("age", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("name", StringType(), True),
    StructField("course", StringType(), True),
    StructField("roll", StringType(), True),
    StructField("marks", IntegerType(), True),
    StructField("email", StringType(), True)
])
df = spark.read.options(header='True').schema(custom_schema).csv('/FileStore/tables/StudentData.csv')
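To confirm the declared types took effect (the output shape below is illustrative, not a captured run):
df.printSchema()
# root
#  |-- age: integer (nullable = true)
#  |-- gender: string (nullable = true)
#  ...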
select on dataframe
df.select("name","gender").show()
df.select(df.name, df.gender).show()
df.select(col("name"), col("gender")).show()
df.select('*').show()
df.select(df.columns[2:6]).show()
withColumn, filter, sort, groupBy and join on dataframe
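The notes below jump straight to filter, so first a minimal withColumn sketch (the bonus_marks name is an assumption; col is imported above):
df.withColumn("bonus_marks", col("marks") + 5).show()   # add a derived column
df.withColumn("marks", col("marks") * 2).show()         # overwrite an existing column
df.withColumnRenamed("roll", "roll_no").show()          # rename a column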
df.filter(col("course") == "DB").show()
df.filter(df.course.isin(courses_list)).show()
df.select("gender").distinct().show()
df.dropDuplicates(["gender","course"]).show()
df.sort(df.marks).show(1000)
df.sort(df.marks.desc(), df.age.asc()).show(1000)
df.orderBy(df.marks, df.age).show()
df.groupBy("gender").sum("marks").show()
df.groupBy("gender").count().show()
df.groupBy("gender").max("marks").show()
df.groupBy("gender").min("marks").show()
df.groupBy("gender").avg("marks").show()
df.groupBy("gender").mean("marks").show()
df.join(df1, df.id == df1.id, "inner").show()   # df1 is another DataFrame sharing an id column
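A self-contained join sketch with two throwaway DataFrames (names and values are made up):
students = spark.createDataFrame([(1, "Asha"), (2, "Ravi")], ["id", "name"])
scores = spark.createDataFrame([(1, 85), (3, 60)], ["id", "marks"])
students.join(scores, students.id == scores.id, "inner").show()   # only id 1 matches
students.join(scores, students.id == scores.id, "left").show()    # keeps id 2 with null marks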
To run SQL queries directly, create a temporary view and use the view name in place of a table name.
df.createOrReplaceTempView("karthick")
spark.sql("select course, gender, count(*) from karthick group by course,
gender").show()
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def get_total_salary(salary):
    return salary + 100

totalSalaryUDF = udf(lambda x: get_total_salary(x), IntegerType())
df.withColumn("total_salary", totalSalaryUDF(df.salary)).show()   # assumes df has a salary column
Spark Streaming
from pyspark.streaming import StreamingContext

ssc = StreamingContext(sc, 10)                    # micro-batch interval of 10 seconds (assumed)
rdd = ssc.textFileStream("/FileStore/tables/")    # DStream of lines from new files landing in the directory
rdd = rdd.map(lambda x: (x, 1))                   # each whole line becomes a (line, 1) pair
rdd = rdd.reduceByKey(lambda x, y: x + y)         # count identical lines within each batch
rdd.pprint()
ssc.start()
ssc.awaitTerminationOrTimeout(1000000)
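To count words rather than whole lines, split each line first (a sketch of an alternative DStream pipeline; the batch interval is assumed):
from pyspark.streaming import StreamingContext

ssc = StreamingContext(sc, 10)
lines = ssc.textFileStream("/FileStore/tables/")
words = lines.flatMap(lambda line: line.split(" "))
word_counts = words.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)
word_counts.pprint()
ssc.start()
ssc.awaitTerminationOrTimeout(1000000)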