Data Engineering 101 - Day 24 - SQL Vs PySpark
Data Engineering 101 - Day 24 - SQL Vs PySpark
SQL vs PySpark
80 +
comparisons
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
SELECT COLUMNS
SQL
SELECT column1, column2
1
FROM table;
PYSPARK
df.select("column1", "column2")
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
FILTER ROWS
SQL
SELECT * FROM table
2
WHERE condition;
PYSPARK
df.filter("condition")
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
AGGREGATE FUNCTIONS
SQL
SELECT AVG(column)
3
FROM table;
PYSPARK
df.select(F.avg("column"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
GROUP BY
SQL
SELECT column, COUNT(*)
4
FROM table
GROUP BY column;
PYSPARK
df.groupBy("column").count()
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
ORDER BY
SQL
SELECT *
5
FROM table
ORDER BY column ASC;
PYSPARK
df.orderBy("column",
ascending=True)
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
JOIN
SQL
SELECT * FROM table1
6
JOIN table2
ON table1.id = table2.id;
PYSPARK
df1.join(df2, df1.id == df2.id)
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
UNION
SQL
SELECT * FROM table1
7
UNION
SELECT * FROM table2;
PYSPARK
df1.union(df2).distinct()  # union() alone behaves like UNION ALL
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
LIMIT
SQL
SELECT *
8
FROM table
LIMIT 100;
PYSPARK
df.limit(100)
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
DISTINCT VALUES
SQL
SELECT DISTINCT column
9
FROM table;
PYSPARK
df.select("column").distinct()
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
ADD COLUMN
SQL
SELECT *, column1 + column2
10
AS new_column
FROM table;
PYSPARK
df.withColumn("new_column",
F.col("column1") +
F.col("column2"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
COLUMN ALIAS
SQL
SELECT column AS alias_name
11
FROM table;
PYSPARK
df.select(F.col("column").alias("alias_name"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
FILTERING ON MULTIPLE
CONDITIONS
SQL
SELECT * FROM table
12
WHERE
condition1 AND condition2;
PYSPARK
df.filter((F.col("condition1")) &
(F.col("condition2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
SUBQUERY
SQL
SELECT * FROM
13
(SELECT * FROM table
WHERE condition) AS subquery;
PYSPARK
df.filter("condition").alias("subquery")
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
BETWEEN
SQL
SELECT * FROM table
14
WHERE column
BETWEEN val1 AND val2;
PYSPARK
df.filter(F.col("column") \
.between("val1", "val2"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
LIKE
SQL
SELECT * FROM table
15
WHERE column LIKE pattern;
PYSPARK
df.filter(F.col("column") \
.like("pattern"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
CASE WHEN
SQL
SELECT CASE
16
WHEN condition THEN result1
ELSE result2 END
FROM table;
PYSPARK
df.select(F.when(F.col("condition"), "result1") \
.otherwise("result2"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
CAST COLUMN TYPE
SQL
SELECT
17
CAST(column AS datatype)
FROM table;
PYSPARK
df.select(F.col("column") \
.cast("datatype"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
COUNT DISTINCT
SQL
SELECT
18
COUNT(DISTINCT column)
FROM table;
PYSPARK
df.select(F.countDistinct("column"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
SUBSTRING
SQL
SELECT SUBSTRING(column,
19
start, length)
FROM table;
PYSPARK
df.select(F.substring("column",
start, length))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
CONCATENATE COLUMNS
SQL
SELECT
20
CONCAT(column1, column2) AS
new_column
FROM table;
PYSPARK
df.withColumn("new_column",
F.concat(F.col("column1"),
F.col("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
WINDOW FUNCTION - AVERAGE
SQL
SELECT AVG(column)
21
OVER (PARTITION BY column2)
FROM table;
PYSPARK
df.withColumn("avg", F.avg("column") \
.over(Window.partitionBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
WINDOW FUNCTION - SUM
SQL
SELECT SUM(column)
22
OVER (PARTITION BY column2)
FROM table;
PYSPARK
df.withColumn("sum", F.sum("column") \
.over(Window.partitionBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
LEAD FUNCTION
SQL
SELECT LEAD(column, 1)
23
OVER (ORDER BY column2)
FROM table;
PYSPARK
df.withColumn("lead",
F.lead("column", 1) \
.over(Window.orderBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
LAG FUNCTION
SQL
SELECT LAG(column, 1)
24
OVER (ORDER BY column2)
FROM table;
PYSPARK
df.withColumn("lag", F.lag("column", 1) \
.over(Window.orderBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
ROW COUNT
SQL
SELECT COUNT(*)
25
FROM table;
PYSPARK
df.count()
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
DROP COLUMN
SQL
ALTER TABLE table
26
DROP COLUMN column;
PYSPARK
df.drop("column")
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
RENAME COLUMN
SQL
ALTER TABLE table RENAME
27
COLUMN column1 TO column2;
PYSPARK
df.withColumnRenamed("column1", "column2")
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
CHANGE COLUMN TYPE
SQL
ALTER TABLE table
28
ALTER COLUMN column TYPE
new_type;
PYSPARK
df.withColumn("column",
df["column"] \
.cast("new_type"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
CREATE TABLE AS SELECT
SQL
CREATE TABLE new_table
29
AS SELECT * FROM table;
PYSPARK
(df.write.format("parquet") \
.saveAsTable("new_table"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
INSERT INTO SELECT
SQL
INSERT INTO table2
30
SELECT * FROM table1;
PYSPARK
(df1.write.insertInto("table2"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
CREATE TABLE WITH SELECTED COLUMNS
SQL
CREATE TABLE new_table
31
AS
SELECT column1, column2
FROM table;
PYSPARK
(df.select("column1", "column2") \
.write.format("parquet") \
.saveAsTable("new_table"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
GROUP BY WITH ALIAS
SQL
SELECT column,
32
COUNT(*) AS count
FROM table
GROUP BY column;
PYSPARK
df.groupBy("column") \
.agg(F.count("*") \
.alias("count"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
NESTED SUBQUERY
SQL
SELECT * FROM
33
(SELECT *
FROM table
WHERE condition) sub
WHERE sub.condition2;
PYSPARK
df.filter("condition") \
.alias("sub") \
.filter("sub.condition2")
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
MULTIPLE JOINS
SQL
SELECT * FROM table1
34
JOIN table2
ON table1.id = table2.id
JOIN table3
ON table1.id = table3.id;
PYSPARK
df1.join(df2, "id").join(df3, "id")
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
CROSS JOIN
SQL
SELECT *
35
FROM table1
CROSS JOIN table2;
PYSPARK
df1.crossJoin(df2)
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
HAVING CLAUSE
SQL
SELECT column,
36
COUNT(*)
FROM table
GROUP BY column
HAVING COUNT(*) > 1;
PYSPARK
df.groupBy("column") \
.count() \
.filter(F.col("count") > 1)
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
JOIN WITH TABLE ALIASES
SQL
SELECT *
37
FROM table1 t1
JOIN table2 t2
ON t1.id = t2.id;
PYSPARK
df1.alias("t1") \
.join(df2.alias("t2"), F.col("t1.id")
== F.col("t2.id"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
JOIN WITH SELECTED COLUMNS
SQL
SELECT t1.column, t2.column
38
FROM table1 t1, table2 t2
WHERE t1.id = t2.id;
PYSPARK
df1.join(df2, df1.id == df2.id) \
.select(df1.column, df2.column)
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
CASE WHEN WITH MULTIPLE CONDITIONS
SQL
SELECT CASE WHEN
39
condition THEN 'value1'
WHEN condition2 THEN 'value2' ELSE
'value3'
END
FROM table;
PYSPARK
df.select(F.when(F.col("condition"),
"value1").when(F.col("condition2"),
"value2").otherwise("value3"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
EXTRACT YEAR FROM DATE
SQL
SELECT EXTRACT(YEAR FROM
40
date_column)
FROM table;
PYSPARK
df.select(F.year(F.col("date_column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
INEQUALITY FILTERING
SQL
SELECT *
41
FROM table
WHERE column != 'value';
PYSPARK
df.filter(df.column != 'value')
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
IN LIST
SQL
SELECT *
42
FROM table
WHERE column IN ('value1',
'value2');
PYSPARK
df.filter(df.column.isin('value1',
'value2'))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
NOT IN LIST
SQL
SELECT *
43
FROM table
WHERE column NOT IN ('value1',
'value2');
PYSPARK
df.filter(~df.column.isin('value1',
'value2'))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
NULL VALUES
SQL
SELECT * FROM
44
table
WHERE column IS NULL;
PYSPARK
df.filter(df.column.isNull())
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
NOT NULL VALUES
SQL
SELECT *
45
FROM table
WHERE column IS NOT NULL;
PYSPARK
df.filter(df.column.isNotNull())
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
UPPERCASE STRING
SQL
SELECT UPPER(column)
46
FROM table;
PYSPARK
df.select(F.upper(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
LOWERCASE STRING
SQL
SELECT LOWER(column)
47
FROM table;
PYSPARK
df.select(F.lower(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
STRING LENGTH
SQL
SELECT LENGTH(column)
48
FROM table;
PYSPARK
df.select(F.length(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
TRIM STRING
SQL
SELECT TRIM(column)
49
FROM table;
PYSPARK
df.select(F.trim(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
LEFT TRIM STRING
SQL
SELECT LTRIM(column)
50
FROM table;
PYSPARK
df.select(F.ltrim(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
RIGHT TRIM STRING
SQL
SELECT RTRIM(column)
51
FROM table;
PYSPARK
df.select(F.rtrim(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
STRING REPLACE
SQL
SELECT REPLACE(column, 'find',
52
'replace')
FROM table;
PYSPARK
df.select(F.regexp_replace(df.column, 'find', 'replace'))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
SUBSTRING INDEX
SQL
SELECT
53
SUBSTRING_INDEX(column,
'delim', count)
FROM table;
PYSPARK
df.select(F.substring_index("column", "delim", count))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
DATE DIFFERENCE
SQL
SELECT DATEDIFF(date1, date2)
54
FROM table;
PYSPARK
df.select(F.datediff(F.col('date1'), F.col('date2')))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
ADD MONTHS TO DATE
SQL
SELECT
55
ADD_MONTHS(date_column,
num_months)
FROM table;
PYSPARK
df.select(F.add_months
(df.date_column, num_months))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
FIRST VALUE IN GROUP
SQL
SELECT FIRST_VALUE(column)
56
OVER (PARTITION BY column2)
FROM table;
PYSPARK
df.withColumn("first_val",
F.first("column") \
.over(Window.partitionBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
LAST VALUE IN GROUP
SQL
SELECT LAST_VALUE(column)
57
OVER (PARTITION BY column2)
FROM table;
PYSPARK
df.withColumn("last_val",
F.last("column") \
.over(Window.partitionBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
ROW NUMBER
SQL
SELECT ROW_NUMBER()
58
OVER (PARTITION BY column
ORDER BY column)
FROM table;
PYSPARK
df.withColumn("row_num",
F.row_number() \
.over(Window.partitionBy("column") \
.orderBy("column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
RANK
SQL
SELECT RANK()
59
OVER (PARTITION BY column
ORDER BY column)
FROM table;
PYSPARK
df.withColumn("rank",
F.rank().over(Window.partitionBy
("column").orderBy("column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
DENSE RANK
SQL
SELECT DENSE_RANK()
60
OVER (PARTITION BY column
ORDER BY column)
FROM table;
PYSPARK
df.withColumn("dense_rank",
F.dense_rank().over(Window.partitionBy("column").orderBy("column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
COUNT ROWS
SQL
SELECT COUNT(*)
61
FROM table;
PYSPARK
df.count()
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
MATHEMATICAL
OPERATIONS
SQL
SELECT column1 + column2
62
FROM table;
PYSPARK
df.select(F.col("column1") +
F.col("column2"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
STRING CONCATENATION
SQL
SELECT column1 || column2
63
AS new_column
FROM table;
PYSPARK
df.withColumn("new_column",
F.concat(F.col("column1"),
F.col("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
MINIMUM VALUE
SQL
SELECT MIN(column)
64
FROM table;
PYSPARK
df.select(F.min("column"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
MAXIMUM VALUE
SQL
SELECT MAX(column)
65
FROM table;
PYSPARK
df.select(F.max("column"))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
REMOVING DUPLICATES
SQL
SELECT DISTINCT *
66
FROM table;
PYSPARK
df.distinct()
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
LEFT JOIN
SQL
SELECT * FROM table1
67
LEFT JOIN table2
ON table1.id = table2.id;
PYSPARK
df1.join(df2, df1.id == df2.id,
"left")
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
RIGHT JOIN
SQL
SELECT * FROM table1
68
RIGHT JOIN table2
ON table1.id = table2.id;
PYSPARK
df1.join(df2, df1.id == df2.id,
"right")
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
FULL OUTER JOIN
SQL
SELECT * FROM table1
69
FULL OUTER
JOIN table2
ON table1.id = table2.id;
PYSPARK
df1.join(df2, df1.id == df2.id,
"outer")
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
FILTER GROUPS BY COUNT
SQL
SELECT column, COUNT(*)
70
FROM table
GROUP BY column
HAVING COUNT(*) > 10;
PYSPARK
df.groupBy("column") \
.count() \
.filter(F.col("count") > 10)
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
ROUND NUMBER
SQL
SELECT ROUND(column, 2)
71
FROM table;
PYSPARK
df.select(F.round("column", 2))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
CURRENT DATE
SQL
SELECT CURRENT_DATE;
72
PYSPARK
df.select(F.current_date())
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
DATE ADDITION
SQL
SELECT
73
DATE_ADD(date_column, 10)
FROM table;
PYSPARK
df.select(F.date_add(F.col("date_column"), 10))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
DATE SUBTRACTION
SQL
SELECT
74
DATE_SUB(date_column, 10)
FROM table;
PYSPARK
df.select(F.date_sub(F.col("date_column"), 10))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
YEAR FROM DATE
SQL
SELECT YEAR(date_column)
75
FROM table;
PYSPARK
df.select(F.year(F.col("date_column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
MONTH FROM DATE
SQL
SELECT MONTH(date_column)
76
FROM table;
PYSPARK
df.select(F.month(F.col("date_column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
DAY FROM DATE
SQL
SELECT DAY(date_column)
77
FROM table;
PYSPARK
df.select(F.dayofmonth(F.col("date_column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
SORTING DESCENDING
SQL
SELECT *
78
FROM table
ORDER BY column DESC;
PYSPARK
df.orderBy(F.col("column").desc())
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
GROUP BY MULTIPLE
COLUMNS
SQL
SELECT col1, col2, COUNT(*)
79
FROM table
GROUP BY col1, col2;
PYSPARK
df.groupBy("col1", "col2") \
.count()
Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK
CONDITIONAL COLUMN
UPDATE
SQL
UPDATE table
80
SET column1 = CASE
WHEN condition
THEN 'value1' ELSE 'value2' END;
PYSPARK
df.withColumn("column1",
F.when(F.col("condition"),
"value1").otherwise("value2"))
Shwetank Singh
GritSetGrow - GSGLearn.com