Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
0% found this document useful (0 votes)
107 views

Data Engineering 101 - Day 24 - SQL Vs PySpark

Books data engineering SQL

Uploaded by

sagarvshinde
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
107 views

Data Engineering 101 - Day 24 - SQL Vs PySpark

Books data engineering SQL

Uploaded by

sagarvshinde
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 82

Data Engineering 101 -

SQL vs PySpark
80 +
comparisons

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SELECT COLUMNS
SQL
SELECT column1, column2

1
FROM table;

PYSPARK
df.select("column1", "column2")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FILTER ROWS
SQL
SELECT * FROM table

2
WHERE condition;

PYSPARK
df.filter("condition")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

AGGREGATE FUNCTIONS
SQL
SELECT AVG(column)

3
FROM table;

PYSPARK
df.select(F.avg("column"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

GROUP BY
SQL
SELECT column, COUNT(*)

4
FROM table
GROUP BY column;

PYSPARK
df.groupBy("column").count()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ORDER BY
SQL
SELECT *

5
FROM table
ORDER BY column ASC;

PYSPARK
df.orderBy("column",
ascending=True)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

JOIN
SQL
SELECT * FROM table1

6
JOIN table2
ON table1.id = table2.id;

PYSPARK
df1.join(df2, df1.id == df2.id)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

UNION
SQL
SELECT * FROM table1

7
UNION
SELECT * FROM table2;

PYSPARK
df1.union(df2)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LIMIT
SQL
SELECT *

8
FROM table
LIMIT 100;

PYSPARK
df.limit(100)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DISTINCT VALUES
SQL
SELECT DISTINCT column

9
FROM table;

PYSPARK
df.select("column").distinct()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ADDING A NEW COLUMN


SQL
SELECT *, (column1 + column2)

10
AS new_column
FROM table;

PYSPARK
df.withColumn("new_column",
F.col("column1") +
F.col("column2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

COLUMN ALIAS
SQL
SELECT column AS alias_name

11
FROM table;

PYSPARK
df.select(F.col("column").alias("
alias_name"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FILTERING ON MULTIPLE
CONDITIONS
SQL
SELECT * FROM table

12
WHERE
condition1 AND condition2;

PYSPARK
df.filter((F.col("condition1")) &
(F.col("condition2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SUBQUERY
SQL
SELECT * FROM

13
(SELECT * FROM table
WHERE condition) AS subquery;

PYSPARK
df.filter("condition").alias("subq
uery")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

BETWEEN
SQL
SELECT * FROM table

14
WHERE column
BETWEEN val1 AND val2;

PYSPARK
df.filter(F.col("column") \
.between("val1", "val2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LIKE
SQL
SELECT * FROM table

15
WHERE column LIKE pattern;

PYSPARK
df.filter(F.col("column") \
.like("pattern"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CASE WHEN
SQL
SELECT CASE

16
WHEN condition THEN result1
ELSE result2 END
FROM table;

PYSPARK
df.select(F.when(F.col("conditio
n"), "result1") \
.otherwise("result2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CAST DATA TYPE


SQL
SELECT

17
CAST(column AS datatype)
FROM table;

PYSPARK
df.select(F.col("column") \
.cast("datatype"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

COUNT DISTINCT
SQL
SELECT

18
COUNT(DISTINCT column)
FROM table;

PYSPARK
df.select(F.countDistinct("colu
mn"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SUBSTRING
SQL
SELECT SUBSTRING(column,

19
start, length)
FROM table;

PYSPARK
df.select(F.substring("column",
start, length))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CONCATENATE COLUMNS
SQL
SELECT

20
CONCAT(column1, column2) AS
new_column
FROM table;

PYSPARK
df.withColumn("new_column",
F.concat(F.col("column1"),
F.col("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

AVERAGE OVER PARTITION


SQL
SELECT AVG(column)

21
OVER (PARTITION BY column2)
FROM table;

PYSPARK
df.withColumn("avg", F.avg("column") \
.over(Window.partitionBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SUM OVER PARTITION


SQL
SELECT SUM(column)

22
OVER (PARTITION BY column2)
FROM table;

PYSPARK
df.withColumn("sum", F.sum("column") \
.over(Window.partitionBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LEAD FUNCTION
SQL
SELECT LEAD(column, 1)

23
OVER (ORDER BY column2)
FROM table;

PYSPARK
df.withColumn("lead",
F.lead("column", 1) \
.over(Window.orderBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LAG FUNCTION
SQL
SELECT LAG(column, 1)

24
OVER (ORDER BY column2)
FROM table;

PYSPARK
df.withColumn("lag", F.lag("column", 1) \
.over(Window.orderBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ROW COUNT
SQL
SELECT COUNT(*)

25
FROM table;

PYSPARK
df.count()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DROP COLUMN
SQL
ALTER TABLE table

26
DROP COLUMN column;

PYSPARK
df.drop("column")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

RENAME COLUMN
SQL
ALTER TABLE table RENAME

27
COLUMN column1 TO column2;

PYSPARK
df.withColumnRenamed("colu
mn1", "column2")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CHANGE COLUMN TYPE


SQL
ALTER TABLE table

28
ALTER COLUMN column TYPE
new_type;

PYSPARK
df.withColumn("column",
df["column"] \
.cast("new_type"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CREATING A TABLE FROM


SELECT
SQL
CREATE TABLE new_table

29
AS SELECT * FROM table;

PYSPARK
(df.write.format("parquet") \
.saveAsTable("new_table"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

INSERTING SELECTED DATA


INTO TABLE
SQL
INSERT INTO table2

30
SELECT * FROM table1;

PYSPARK
(df1.write.insertInto("table2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CREATING A TABLE WITH


SPECIFIC COLUMNS
SQL
CREATE TABLE new_table

31
AS
SELECT column1, column2
FROM table;

PYSPARK
(df.select("column1", "column2") \
.write.format("parquet") \
.saveAsTable("new_table"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

AGGREGATE WITH ALIAS


SQL
SELECT column,

32
COUNT(*) AS count
FROM table
GROUP BY column;

PYSPARK
df.groupBy("column") \
.agg(F.count("*") \
.alias("count"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

NESTED SUBQUERY
SQL
SELECT * FROM

33
(SELECT *
FROM table
WHERE condition) sub
WHERE sub.condition2;

PYSPARK
df.filter("condition") \
.alias("sub") \
.filter("sub.condition2")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

MULTIPLE JOINS
SQL
SELECT * FROM table1

34
JOIN table2
ON table1.id = table2.id
JOIN table3
ON table1.id = table3.id;

PYSPARK
df1.join(df2, "id").join(df3, "id")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CROSS JOIN
SQL
SELECT *

35
FROM table1
CROSS JOIN table2;

PYSPARK
df1.crossJoin(df2)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

GROUP BY HAVING COUNT


GREATER THAN
SQL
SELECT column,

36
COUNT(*)
FROM table
GROUP BY column
HAVING COUNT(*) > 1;

PYSPARK
df.groupBy("column") \
.count() \
.filter(F.col("count") > 1)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ALIAS FOR TABLE IN JOIN


SQL
SELECT t1.*

37
FROM table1 t1
JOIN table2 t2
ON t1.id = t2.id;

PYSPARK
df1.alias("t1") \
.join(df2.alias("t2"), F.col("t1.id")
== F.col("t2.id"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SELECTING FROM MULTIPLE


TABLES
SQL
SELECT t1.column, t2.column

38
FROM table1 t1, table2 t2
WHERE t1.id = t2.id;

PYSPARK
df1.join(df2, df1.id == df2.id) \
.select(df1.column, df2.column)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CASE WHEN WITH MULTIPLE


CONDITIONS
SQL
SELECT CASE WHEN

39
condition THEN 'value1'
WHEN condition2 THEN 'value2' ELSE
'value3'
END
FROM table;

PYSPARK
df.select(F.when(F.col("condition"),
"value1").when(F.col("condition2"),
"value2").otherwise("value3"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

EXTRACTING DATE PARTS


SQL
SELECT EXTRACT(YEAR FROM

40
date_column)
FROM table;

PYSPARK
df.select(F.year(F.col("date_colu
mn")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

INEQUALITY FILTERING
SQL
SELECT *

41
FROM table
WHERE column != 'value';

PYSPARK
df.filter(df.column != 'value')

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

IN LIST
SQL
SELECT *

42
FROM table
WHERE column IN ('value1',
'value2');

PYSPARK
df.filter(df.column.isin('value1',
'value2'))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

NOT IN LIST
SQL
SELECT *

43
FROM table
WHERE column NOT IN ('value1',
'value2');

PYSPARK
df.filter(~df.column.isin('value1',
'value2'))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

NULL VALUES
SQL
SELECT * FROM

44
table
WHERE column IS NULL;

PYSPARK
df.filter(df.column.isNull())

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

NOT NULL VALUES


SQL
SELECT *

45
FROM table
WHERE column IS NOT NULL;

PYSPARK
df.filter(df.column.isNotNull())

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

STRING UPPER CASE


SQL
SELECT UPPER(column)

46
FROM table;

PYSPARK
df.select(F.upper(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

STRING LOWER CASE


SQL
SELECT LOWER(column)

47
FROM table;

PYSPARK
df.select(F.lower(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

STRING LENGTH
SQL
SELECT LENGTH(column)

48
FROM table;

PYSPARK
df.select(F.length(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

TRIM STRING
SQL
SELECT TRIM(column)

49
FROM table;

PYSPARK
df.select(F.trim(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LEFT TRIM STRING


SQL
SELECT LTRIM(column)

50
FROM table;

PYSPARK
df.select(F.ltrim(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

RIGHT TRIM STRING


SQL
SELECT RTRIM(column)

51
FROM table;

PYSPARK
df.select(F.rtrim(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

STRING REPLACE
SQL
SELECT REPLACE(column, 'find',

52
'replace')
FROM table;

PYSPARK
df.select(F.regexp_replace(df.c
olumn, 'find', 'replace'))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SUBSTRING INDEX
SQL
SELECT

53
SUBSTRING_INDEX(column,
'delim', count)
FROM table;

PYSPARK
df.select(F.expr("split(column,
'delim')[count-1]"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DATE DIFFERENCE
SQL
SELECT DATEDIFF('date1', 'date2')

54
FROM table;

PYSPARK
df.select(F.datediff(F.col('date1'
), F.col('date2')))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ADD MONTHS TO DATE


SQL
SELECT

55
ADD_MONTHS(date_column,
num_months)
FROM table;

PYSPARK
df.select(F.add_months
(df.date_column, num_months))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FIRST VALUE IN GROUP


SQL
SELECT FIRST_VALUE(column)

56
OVER (PARTITION BY column2)
FROM table;

PYSPARK
df.withColumn("first_val",
F.first("column") \
.over(Window.partitionBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LAST VALUE IN GROUP


SQL
SELECT LAST_VALUE(column)

57
OVER (PARTITION BY column2)
FROM table;

PYSPARK
df.withColumn("last_val",
F.last("column") \
.over(Window.partitionBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ROW NUMBER OVER


PARTITION
SQL
SELECT ROW_NUMBER()

58
OVER (PARTITION BY column
ORDER BY column)
FROM table;

PYSPARK
df.withColumn("row_num",
F.row_number() \
.over(Window.partitionBy("column") \
.orderBy("column")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

RANK OVER PARTITION


SQL
SELECT RANK()

59
OVER (PARTITION BY column
ORDER BY column)
FROM table;

PYSPARK
df.withColumn("rank",
F.rank().over(Window.partitionBy
("column").orderBy("column")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DENSE RANK OVER


PARTITION
SQL
SELECT DENSE_RANK()

60
OVER (PARTITION BY column
ORDER BY column)
FROM table;

PYSPARK
df.withColumn("dense_rank",
F.dense_rank().over(Window.partitio
nBy("column").orderBy("column")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

COUNT ROWS
SQL
SELECT COUNT(*)

61
FROM table;

PYSPARK
df.count()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

MATHEMATICAL
OPERATIONS
SQL
SELECT column1 + column2

62
FROM table;

PYSPARK
df.select(F.col("column1") +
F.col("column2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

STRING CONCATENATION
SQL
SELECT column1 | column2

63
AS new_column
FROM table;

PYSPARK
df.withColumn("new_column",
F.concat_ws("|",
F.col("column1"),
F.col("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FIND MINIMUM VALUE


SQL
SELECT MIN(column)

64
FROM table;

PYSPARK
df.select(F.min("column"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FIND MAXIMUM VALUE


SQL
SELECT MAX(column)

65
FROM table;

PYSPARK
df.select(F.max("column"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

REMOVING DUPLICATES
SQL
SELECT DISTINCT *

66
FROM table;

PYSPARK
df.distinct()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LEFT JOIN
SQL
SELECT * FROM table1

67
LEFT JOIN table2
ON table1.id = table2.id;

PYSPARK
df1.join(df2, df1.id == df2.id,
"left")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

RIGHT JOIN
SQL
SELECT * FROM table1

68
RIGHT JOIN table2
ON table1.id = table2.id;

PYSPARK
df1.join(df2, df1.id == df2.id,
"right")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FULL OUTER JOIN


SQL
SELECT * FROM table1

69
FULL OUTER
JOIN table2
ON table1.id = table2.id;

PYSPARK
df1.join(df2, df1.id == df2.id,
"outer")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

GROUP BY WITH HAVING


SQL
SELECT column, COUNT(*)

70
FROM table
GROUP BY column
HAVING COUNT(*) > 10;

PYSPARK
df.groupBy("column") \
.count() \
.filter(F.col("count") > 10)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ROUND DECIMAL VALUES


SQL
SELECT ROUND(column, 2)

71
FROM table;

PYSPARK
df.select(F.round("column", 2))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

GET CURRENT DATE


SQL
SELECT CURRENT_DATE();

72
PYSPARK
df.select(F.current_date())

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DATE ADDITION
SQL
SELECT

73
DATE_ADD(date_column, 10)
FROM table;

PYSPARK
df.select(F.date_add(F.col("dat
e_column"), 10))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DATE SUBTRACTION
SQL
SELECT

74
DATE_SUB(date_column, 10)
FROM table;

PYSPARK
df.select(F.date_sub(F.col("dat
e_column"), 10))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

EXTRACT YEAR FROM DATE


SQL
SELECT YEAR(date_column)

75
FROM table;

PYSPARK
df.select(F.year(F.col("date_col
umn")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

EXTRACT MONTH FROM


DATE
SQL
SELECT MONTH(date_column)

76
FROM table;

PYSPARK
df.select(F.month(F.col("date_
column")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

EXTRACT DAY FROM DATE


SQL
SELECT DAY(date_column)

77
FROM table;

PYSPARK
df.select(F.dayofmonth(F.col("d
ate_column")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SORTING DESCENDING
SQL
SELECT *

78
FROM table
ORDER BY column DESC;

PYSPARK
df.orderBy(F.col("column").desc())

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

GROUP BY MULTIPLE
COLUMNS
SQL
SELECT col1, col2, COUNT(*)

79
FROM table
GROUP BY col1, col2;

PYSPARK
df.groupBy("col1", "col2") \
.count()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CONDITIONAL COLUMN
UPDATE
SQL
UPDATE table

80
SET column1 = CASE
WHEN condition
THEN 'value1' ELSE 'value2' END;

PYSPARK
df.withColumn("column1",
F.when(F.col("condition"),
"value1").otherwise("value2"))

Shwetank Singh
GritSetGrow - GSGLearn.com

You might also like