Window Functions in PySpark
Window Functions in PySpark
findspark.init()
import pyspark
In [6]: spark=SparkSession.builder.appName('Arun_test').getOrCreate()
('Arjun','MP',30000),
('James','USA',20000),
('Kumar',"UK",35000),
('Ajay','UK',50000)]
columns = ['Name','State','Salary']
df = spark.createDataFrame(data=simpleData,schema=columns)
In [35]: df.show(truncate=False)
+-----+-----+------+
|Name |State|Salary|
+-----+-----+------+
|Arjun|MP |30000 |
|James|USA |20000 |
|Kumar|UK |35000 |
+-----+-----+------+
In [15]: df.printSchema()
root
Ranking Functions
row_number()
rank()
dense_rank()
ntile()
percent_rank()
df.withColumn('RN',row_number().over(WindFunc)).show()
+-----+-----+------+---+
| Name|State|Salary| RN|
+-----+-----+------+---+
+-----+-----+------+---+
df.withColumn('Person_Rank',rank().over(WindFunc)).show(truncate=False)
+-----+-----+------+-----------+
|Name |State|Salary|Person_Rank|
+-----+-----+------+-----------+
|James|USA |20000 |1 |
|Kumar|UK |35000 |3 |
|Arjun|MP |30000 |1 |
+-----+-----+------+-----------+
df.withColumn('Person_Dense_Rank',dense_rank().over(WindFunc)).show(truncate=False)
+-----+-----+------+-----------------+
|Name |State|Salary|Person_Dense_Rank|
+-----+-----+------+-----------------+
|James|USA |20000 |1 |
|Kumar|UK |35000 |2 |
|Arjun|MP |30000 |1 |
+-----+-----+------+-----------------+
df.withColumn('percent_rank',percent_rank().over(WindFunc)).show()
+-----+-----+------+------------+
| Name|State|Salary|percent_rank|
+-----+-----+------+------------+
+-----+-----+------+------------+
df.withColumn("ntile_values",ntile(3).over(WindFunc)).show()
+-----+-----+------+------------+
| Name|State|Salary|ntile_values|
+-----+-----+------+------------+
+-----+-----+------+------------+
In [44]: df.withColumn("ntile_values",ntile(2).over(WindFunc)).show()
+-----+-----+------+------------+
| Name|State|Salary|ntile_values|
+-----+-----+------+------------+
+-----+-----+------+------------+
In [45]: df.withColumn("ntile_values",ntile(1).over(WindFunc)).show()
+-----+-----+------+------------+
| Name|State|Salary|ntile_values|
+-----+-----+------+------------+
+-----+-----+------+------------+
df.withColumn("cume_dist_value",cume_dist().over(WindFunc)).show(truncate= False)
+-----+-----+------+------------------+
|Name |State|Salary|cume_dist_value |
+-----+-----+------+------------------+
+-----+-----+------+------------------+
df.withColumn("lag_value",lag('salary',1).over(WindFunc)).show()
+-----+-----+------+---------+
| Name|State|Salary|lag_value|
+-----+-----+------+---------+
+-----+-----+------+---------+
WindFunc_new = Window.orderBy(col('Salary').desc())
df.withColumn('lag_val',lag('salary',1).over(WindFunc_new)).show()
+-----+-----+------+-------+
| Name|State|Salary|lag_val|
+-----+-----+------+-------+
+-----+-----+------+-------+
df.withColumn("load_value",lead("salary",1).over(WindFunc)).show(truncate=False)
+-----+-----+------+----------+
|Name |State|Salary|load_value|
+-----+-----+------+----------+
+-----+-----+------+----------+
In [53]: df.withColumn('leadValue_without_partition',lead('salary',1).over(WindFunc_new)).show(truncate=False)
df.withColumn('leadValue_without_partition',lead('salary',2).over(WindFunc_new)).show(truncate=False)
+-----+-----+------+---------------------------+
|Name |State|Salary|leadValue_without_partition|
+-----+-----+------+---------------------------+
+-----+-----+------+---------------------------+
+-----+-----+------+---------------------------+
|Name |State|Salary|leadValue_without_partition|
+-----+-----+------+---------------------------+
+-----+-----+------+---------------------------+
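Please note:
row_number() is a ranking function, so it is applied over WindFunc, whose window specification includes an orderBy (mandatory for ranking functions).
The aggregate functions (min, max, avg, sum) are applied over windAgg, whose window specification has no orderBy.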
df.withColumn("row",row_number().over(WindFunc)) \
.withColumn("min",min(col('salary')).over(windAgg)) \
.withColumn('max',max(col('salary')).over(windAgg)) \
.withColumn('avg',avg('salary').over(windAgg)) \
.withColumn('sum',sum('salary').over(windAgg))\
.where(col('row') ==1) \
.select('state','min','max','avg','sum').show()
+-----+-----+-----+-------+------+
+-----+-----+-----+-------+------+
| USA|20000|20000|20000.0| 20000|
| UK|35000|50000|45000.0|135000|
| MP|30000|30000|30000.0| 30000|
+-----+-----+-----+-------+------+