
StrataScratch PySpark coding questions

Easy

1. Salaries Differences

# Import your libraries


from pyspark.sql.functions import col, max, abs

# Start writing code


df = db_employee.join(db_dept, db_employee['department_id'] == db_dept['id'], 'left')
df_mkt = df.filter(col('department') == 'marketing').select(max('salary').alias('mkt'))
df_eng = df.filter(col('department') == 'engineering').select(max('salary').alias('eng'))
df2 = df_mkt.join(df_eng).withColumn('salary_diff', abs(col('mkt') - col('eng'))).select('salary_diff')
# ans = abs(df_mkt['max(salary)'] - df_eng['max(salary)'])
# ans_df = pd.DataFrame([ans], ['salary'])

# To validate your solution, convert your final pySpark df to a pandas df


df2.toPandas()
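A note on the join above: df_mkt.join(df_eng) has no join condition, so Spark performs a cross join of the two single-row frames, which some configurations reject unless spark.sql.crossJoin.enabled is set. A minimal sketch of the explicit form:

# Explicit cross join; same result as df_mkt.join(df_eng) with no condition.
df2 = df_mkt.crossJoin(df_eng).withColumn('salary_diff', abs(col('mkt') - col('eng'))).select('salary_diff')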

2. Finding Updated Records

# Import your libraries


from pyspark.sql.window import Window
from pyspark.sql.functions import rank, desc, col

# Start writing code


win_spec = Window.partitionBy('first_name', 'last_name').orderBy(desc('salary'))
ms_employee_salary = (ms_employee_salary
    .withColumn('ranks', rank().over(win_spec))
    .filter(col('ranks') == 1)
    .orderBy('id')
    .select('id', 'first_name', 'last_name', 'department_id', 'salary'))

# To validate your solution, convert your final pySpark df to a pandas df


ms_employee_salary.toPandas()

3. Bikes Last Used


# Import your libraries

from pyspark.sql.functions import col, desc, rank

from pyspark.sql.window import Window

# Start writing code

win = Window.partitionBy('bike_number').orderBy(desc('end_time'))

dc_bikeshare_q1_2012 = (dc_bikeshare_q1_2012
    .withColumn('rn', rank().over(win))
    .filter(col('rn') == 1)
    .select('bike_number', 'end_time')
    .sort(desc('end_time')))

# To validate your solution, convert your final pySpark df to a pandas df

dc_bikeshare_q1_2012.toPandas()

4. Reviews of Hotel Arena


# Import your libraries
from pyspark.sql.functions import col, count

# Start writing code


hotel_reviews = (hotel_reviews
    .filter(col('hotel_name') == 'Hotel Arena')
    .groupBy('hotel_name', 'reviewer_score')
    .agg(count('reviewer_score').alias('n_reviews'))
    .select('reviewer_score', 'hotel_name', 'n_reviews'))

# To validate your solution, convert your final pySpark df to a pandas df


hotel_reviews.toPandas()

5. Count the number of movies for which Abigail Breslin was nominated for an Oscar
# Import your libraries
from pyspark.sql.functions import col, count, lower, countDistinct

# Start writing code


oscar_nominees = (oscar_nominees
    .filter(col('nominee') == 'Abigail Breslin')
    .groupBy('nominee')
    .agg(countDistinct('movie').alias('movie_cnt'))
    .select('movie_cnt'))
# oscar_nominees = oscar_nominees.filter(lower(col('nominee')).like('%abigail%'))
# To validate your solution, convert your final pySpark df to a pandas df
oscar_nominees.toPandas()

6. Find all posts which were reacted to with a heart


# Import your libraries
from pyspark.sql.functions import col

# Start writing code


facebook_reactions = facebook_reactions.filter(col('reaction') == 'heart')
facebook_posts = (facebook_posts
    .join(facebook_reactions, facebook_posts['post_id'] == facebook_reactions['post_id'], 'inner')
    .select(facebook_posts['*'])
    .distinct())
# facebook_posts = facebook_posts.select('post_id', 'poster', 'post_text', 'post_keywords', 'post_date')

# To validate your solution, convert your final pySpark df to a pandas df


facebook_posts.toPandas()

7. Popularity of Hack
# Import your libraries
import pyspark

# Start writing code


df = facebook_employees.join(facebook_hack_survey, facebook_employees['id'] == facebook_hack_survey['employee_id'], 'left')
df = df.groupBy('location').avg('popularity')

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()

8. Lyft Driver Wages


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


lyft_drivers = lyft_drivers.filter((col('yearly_salary') <= 30000) | (col('yearly_salary') >= 70000))

# To validate your solution, convert your final pySpark df to a pandas df


lyft_drivers.toPandas()
9. Find how many times each artist appeared on the Spotify ranking list
# Import your libraries
from pyspark.sql.functions import *

# Start writing code


spotify_worldwide_daily_song_ranking = (spotify_worldwide_daily_song_ranking
    .groupBy('artist')
    .agg(count('*').alias('n_occurrences'))
    .sort(desc('n_occurrences')))

# To validate your solution, convert your final pySpark df to a pandas df


spotify_worldwide_daily_song_ranking.toPandas()

10. Find the base pay for Police Captains


# Import your libraries
from pyspark.sql.functions import col

# Start writing code


sf_public_salaries = (sf_public_salaries
    .filter(col('jobtitle') == 'CAPTAIN III (POLICE DEPARTMENT)')
    .select('employeename', 'basepay'))

# To validate your solution, convert your final pySpark df to a pandas df


sf_public_salaries.toPandas()

11. Find libraries who haven't provided the email address in circulation year 2016 but their notice preference definition is set to email

# Import your libraries


from pyspark.sql.functions import *

# Start writing code


library_usage = (library_usage
    .filter((col('notice_preference_definition') == 'email')
            & (col('provided_email_address') == False)
            & (col('circulation_active_year') == '2016'))
    .select('home_library_code')
    .dropDuplicates())

# To validate your solution, convert your final pySpark df to a pandas df


library_usage.toPandas()
12. Average Salaries
# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start writing code


win_spec = Window.partitionBy('department')
employee = employee.withColumn('avg_sal', avg('salary').over(win_spec)).select('department', 'first_name', 'salary', 'avg_sal')

# To validate your solution, convert your final pySpark df to a pandas df


employee.toPandas()

13. Order Details


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


customers = customers.filter(col('first_name').isin(['Jill', 'Eva']))
customers = customers.join(orders, customers['id'] == orders['cust_id'], 'left')
# customers = customers.filter(col('first_name').isin(['Jill','Eva']))
customers = customers.orderBy('cust_id').select('order_date', 'order_details', 'total_order_cost', 'first_name')

# To validate your solution, convert your final pySpark df to a pandas df


customers.toPandas()

14. Customer Details


# Import your libraries
import pyspark

# Start writing code


customers = customers.join(orders, customers['id'] == orders['cust_id'], 'left')
customers = (customers
    .select('first_name', 'last_name', 'city', 'order_details')
    .orderBy(['first_name', 'order_details']))

# To validate your solution, convert your final pySpark df to a pandas df


customers.toPandas()
15. Number of Workers by Department Starting in April or Later
# Import your libraries
from pyspark.sql.functions import *

# Start writing code


worker = (worker
    .filter(month('joining_date') >= 4)
    .groupBy('department')
    .agg(countDistinct('worker_id')))

# To validate your solution, convert your final pySpark df to a pandas df


worker.toPandas()

16. Admin Department Employees Beginning in April or Later


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


worker = (worker
    .filter(month('joining_date') >= 4)
    .filter(col('department') == 'Admin')
    .agg(count('*')))

# To validate your solution, convert your final pySpark df to a pandas df


worker.toPandas()

17. Churro Activity Date


# Import your libraries
from pyspark.sql.functions import col

# Start writing code


los_angeles_restaurant_health_inspections = (los_angeles_restaurant_health_inspections
    .filter((col('facility_name') == 'STREET CHURROS') & (col('score') < 95))
    .select('activity_date', 'pe_description'))

# To validate your solution, convert your final pySpark df to a pandas df


los_angeles_restaurant_health_inspections.toPandas()

18. Find the most profitable company in the financial sector of the entire world along with its continent
# Import your libraries
from pyspark.sql.functions import *

# Start writing code


forbes_global_2010_2014 = (forbes_global_2010_2014
    .filter(col('sector') == 'Financials')
    .orderBy(desc('profits'))
    .select('company', 'continent')
    .limit(1))
# To validate your solution, convert your final pySpark df to a pandas df
forbes_global_2010_2014.toPandas()

19. Count the number of user events performed by MacBookPro users


# Import your libraries
from pyspark.sql.functions import col, desc

# Start writing code


playbook_events = playbook_events.filter(col('device') == 'macbook pro')
playbook_events = playbook_events.groupBy('event_name').count().sort(desc('count'))

# To validate your solution, convert your final pySpark df to a pandas df


playbook_events.toPandas()

20. Number Of Bathrooms And Bedrooms


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


airbnb_search_details = (airbnb_search_details
    .groupBy('city', 'property_type')
    .agg(avg('bedrooms').alias('n_bedrooms_avg'),
         avg('bathrooms').alias('n_bathrooms_avg')))

# To validate your solution, convert your final pySpark df to a pandas df


airbnb_search_details.toPandas()

21. Most Lucrative Products


# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Start writing code
online_orders = online_orders.withColumn('Revenue', col('cost_in_dollars') * col('units_sold'))
online_orders = online_orders.filter(year('date') == '2022').filter(month('date').between(1, 6))

online_orders = online_orders.groupBy('product_id').agg(sum('Revenue').alias('Total'))
win_spec = Window.orderBy(desc('Total'))
online_orders = online_orders.withColumn('rnk', dense_rank().over(win_spec)).filter(col('rnk') < 6).drop('rnk')
# To validate your solution, convert your final pySpark df to a pandas df
online_orders.toPandas()
22. Number of Shipments Per Month
# Import your libraries
from pyspark.sql.functions import *

# Start writing code


amazon_shipment = (amazon_shipment
    .withColumn('Shipment', concat('shipment_id', 'sub_id'))
    .withColumn('year_month', date_format('shipment_date', 'yyyy-MM'))
    .groupBy('year_month')
    .agg(count('Shipment').alias('n_ship'))
    .select('year_month', 'n_ship'))

# To validate your solution, convert your final pySpark df to a pandas df


amazon_shipment.toPandas()

23. Unique Users Per Client Per Month


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


fact_events = fact_events.groupBy('client_id',month('time_id')).agg(countDistinct('user_id'))

# To validate your solution, convert your final pySpark df to a pandas df


fact_events.toPandas()

Medium

1. Most Profitable Companies


# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start writing code


wins = Window.orderBy(desc('profits'))
df = forbes_global_2010_2014.withColumn('rnk', dense_rank().over(wins)).filter('rnk < 4').select('company', 'profits')

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()
2. Activity Rank
# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

win = Window.orderBy(desc('sent'), 'from_user')


# Start writing code
google_gmail_emails = google_gmail_emails.groupBy('from_user').agg(count('*').alias('sent'))
google_gmail_emails = google_gmail_emails.withColumn('rn', dense_rank().over(win))

# To validate your solution, convert your final pySpark df to a pandas df


google_gmail_emails.toPandas()

3. Finding User Purchases


# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

wins = Window.partitionBy('user_id').orderBy('created_at')

# Start writing code


amazon_transactions = amazon_transactions.withColumn('next_purchase', lead('created_at').over(wins))
df = amazon_transactions.withColumn('diff', datediff('next_purchase', 'created_at')).filter('diff <= 7').select('user_id').distinct()

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()

4. New Products
# Import your libraries
from pyspark.sql.functions import *

# Start writing code


df1 = car_launches.filter(col('year') == '2019').groupBy('company_name').agg(count('product_name').alias('p_19'))
df2 = car_launches.filter(col('year') == '2020').groupBy('company_name').agg(count('product_name').alias('p_20'))
df3 = (df1.join(df2, ['company_name'], 'inner')
       .withColumn('net_p', col('p_20') - col('p_19'))
       .select('company_name', 'net_p'))
# To validate your solution, convert your final pySpark df to a pandas df
df3.toPandas()

5. Top Percentile Fraud


# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

wins = Window.partitionBy('state').orderBy(desc('fraud_score'))
w2 = Window.partitionBy('state')

# Start writing code


df = (fraud_score
      .withColumn('cnt', count('fraud_score').over(w2))
      .withColumn('rnk', rank().over(wins))
      .withColumn('pct', 1.0 * col('rnk') / col('cnt'))
      .filter('pct < 0.06')
      .select('policy_num', 'state', 'claim_cost', 'fraud_score')
      .distinct())

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()

6. Acceptance Rate By Date


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


request_sent = fb_friend_requests.filter(col('action') == 'sent')
df = (fb_friend_requests
      .filter(col('action') == 'accepted')
      .withColumnRenamed('date', 'accepted_date')
      .withColumnRenamed('action', 'accepted'))
df2 = df.join(request_sent, ['user_id_sender', 'user_id_receiver'], 'right').orderBy('date')
# df = fb_friend_requests.withColumn('act', when(col('action')=='sent', 0).otherwise(1))
df2 = df2.groupBy('date').agg(count('accepted_date').alias('accep'), count('action').alias('total'))
df2 = df2.withColumn('rate', 1.0 * col('accep') / col('total')).select('date', 'rate')

# To validate your solution, convert your final pySpark df to a pandas df


df2.toPandas()
7. Popularity Percentage
# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window


# Start writing code


df = facebook_friends.select('*')

df_u = df.select('user1', 'user2').union(df.select('user2', 'user1')).toDF('user', 'frnd')
total = df_u.select('user').distinct().count()
df2 = df_u.groupBy('user').agg(countDistinct('frnd').alias('frnd'))
df2 = df2.withColumn('pct', round(100 * col('frnd') / total, 3)).sort('user').drop('frnd')

# To validate your solution, convert your final pySpark df to a pandas df


df2.toPandas()

8. Ranking Most Active Guests


# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

wins = Window.orderBy(desc('n_messages'))
# Start writing code
df = airbnb_contacts.groupBy('id_guest').agg(sum('n_messages').alias('n_messages'))
df = df.withColumn('rnk', dense_rank().over(wins))

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()

9. Spam Posts
# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start writing code
facebook_posts = facebook_posts.join(facebook_post_views, ['post_id'], 'inner')
facebook_posts = facebook_posts.withColumn('spam', when(col('post_keywords').contains('spam'), 1).otherwise(0))

facebook_posts = facebook_posts.groupBy('post_date').agg(sum('spam').alias('spam_cnt'), count('spam').alias('total'))

facebook_posts = facebook_posts.withColumn('spam_share', 100 * col('spam_cnt') / col('total')).select('post_date', 'spam_share')

# To validate your solution, convert your final pySpark df to a pandas df


facebook_posts.toPandas()

10. Find the percentage of shipable orders


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


# tot = customers.count()
# customers = customers.filter(col('address').isNotNull())
orders = orders.join(customers, orders['cust_id'] == customers['id'], 'left')
tot = orders.count()

o2 = orders.filter(col('address').isNotNull()).count()
res = 100*o2/tot
res
# To validate your solution, convert your final pySpark df to a pandas df
# orders.toPandas()
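Since StrataScratch validates a DataFrame, a hedged variant (assuming spark is the active SparkSession in the workspace) wraps the percentage in a single-row DataFrame instead of leaving it as a bare Python float:

# Hypothetical wrapper: package the scalar result so toPandas() can be used for validation.
res_df = spark.createDataFrame([(100.0 * o2 / tot,)], ['shipable_pct'])
res_df.toPandas()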

11. Income By Title and Gender


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


sf_bonus = sf_bonus.groupBy('worker_ref_id').agg(sum('bonus').alias('bonus'))
sf_employee = sf_employee.join(sf_bonus, sf_employee['id'] == sf_bonus['worker_ref_id'], 'right')
sf_employee = sf_employee.fillna({'bonus':0})
sf_employee = sf_employee.withColumn('total', (col('salary') + col('bonus')))
sf_employee = sf_employee.groupBy('employee_title','sex').agg(avg('total'))
# To validate your solution, convert your final pySpark df to a pandas df
sf_employee.toPandas()

12. Highest Energy Consumption


# Import your libraries
from pyspark.sql.functions import *
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql.window import Window

# Start writing code


df_lst = [fb_asia_energy,fb_eu_energy,fb_na_energy]
df = reduce(DataFrame.unionAll, df_lst)

df = df.groupBy('date').agg(sum('consumption').alias('consumption'))

wins = Window.orderBy(desc('consumption'))
df = df.withColumn('rnk', dense_rank().over(wins)).filter(col('rnk')==1).drop('rnk')

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()

13. Reviews of Categories


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


df = yelp_business.withColumn('categories', explode(split('categories', ';')))
df = df.groupBy('categories').agg(sum('review_count').alias('total_reviews')).sort(desc('total_reviews'))

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()

14. Top 5 States With 5 Star Businesses


# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window
wins = Window.orderBy(desc('star_bus'))

# Start writing code


df = yelp_business.filter('stars = 5')
df = df.groupBy('state').agg(count('business_id').alias('star_bus'))
df = df.withColumn('rnk', rank().over(wins)).filter('rnk < 6').drop('rnk').orderBy(desc('star_bus'), 'state')

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()

15. Find all wineries which produce wines by possessing aromas of plum, cherry, rose, or hazelnut
# Import your libraries
from pyspark.sql.functions import *

# Start writing code


df = (winemag_p1
      .filter(lower(col('description')).rlike(r'\bplum\b|\bcherry\b|\brose\b|\bhazelnut\b'))
      .select('winery')
      .distinct())
# To validate your solution, convert your final pySpark df to a pandas df
df.toPandas()

16. Highest Cost Orders


# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start writing code


wins = Window.orderBy(desc('toc'))

df = customers.join(orders, customers['id'] == orders['cust_id'], 'left')

df = df.filter('order_date BETWEEN "2019-02-01" AND "2019-05-01"')
df = df.groupBy('first_name', 'order_date').agg(sum('total_order_cost').alias('toc'))

df = df.withColumn('rnk', dense_rank().over(wins)).filter('rnk = 1').drop('rnk')

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()
17. Highest Target Under Manager
# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

wins = Window.orderBy(desc('target'))
# Start writing code
salesforce_employees = salesforce_employees.filter(col('manager_id') == 13)
salesforce_employees = (salesforce_employees
    .withColumn('rnk', rank().over(wins))
    .filter(col('rnk') == 1)
    .select('first_name', 'target'))
# To validate your solution, convert your final pySpark df to a pandas df
salesforce_employees.toPandas()

18. Highest Salary In Department


# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start writing code


wins = Window.partitionBy('department').orderBy(desc('salary'))
employee = employee.withColumn('rnk', dense_rank().over(wins)).filter('rnk = 1').select('department', 'first_name', 'salary')

# To validate your solution, convert your final pySpark df to a pandas df


employee.toPandas()

19. Employee and Manager Salaries


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


df = (employee.alias('emp')
      .join(employee.alias('mng'), col('emp.manager_id') == col('mng.id'), 'left')
      .filter('emp.salary > mng.salary')
      .select('emp.first_name', 'emp.salary'))

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()

20. Second Highest Salary


# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start writing code


win_spec = Window.orderBy(desc('salary'))
employee = employee.withColumn('rnk', dense_rank().over(win_spec)).filter(col('rnk') == 2).select('salary')

# To validate your solution, convert your final pySpark df to a pandas df


employee.toPandas()
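An equivalent sketch without a window function, starting from the original employee table: take the two highest distinct salaries, then keep the lower of the two. dense_rank above already handles duplicate salaries the same way; this is shown only as an alternative.

# Top two distinct salaries descending, then the smaller of the pair.
second_highest = employee.select('salary').distinct().orderBy(desc('salary')).limit(2).orderBy('salary').limit(1)
second_highest.toPandas()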

21. Find the number of times each word appears in drafts


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


df = google_file_store.filter(col('filename').like('%draft%')).agg(collect_list('contents').alias('contents'))
df = df.withColumn('contents', concat_ws(' ', 'contents'))
df = df.withColumn('contents', regexp_replace('contents', r'[.,]', ''))
df = df.withColumn('words', explode(split('contents', ' ')))
df = df.groupBy('words').agg(count('words').alias('word_cnt'))

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()
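One caveat with splitting on single spaces: runs of consecutive whitespace produce empty tokens that then get counted as words. A hedged guard that can be inserted before the groupBy:

df = df.filter(col('words') != '')  # drop empty tokens left by consecutive spaces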

22. Counting Instances in Text


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


df = google_file_store.agg(collect_list('contents').alias('contents'))
df = df.withColumn('contents', concat_ws(' ', 'contents'))
df = df.withColumn('contents', regexp_replace('contents', r'[.,]', ''))
df = df.withColumn('words', explode(split('contents', ' ')))
df = df.groupBy('words').agg(count('words').alias('words_cnt'))
df = df.filter(col('words').isin('bull', 'bear'))

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()
23. Customer Revenue In March
# Import your libraries
from pyspark.sql.functions import *

# Start writing code


orders = orders.filter(date_format('order_date', 'yyyy-MM') == '2019-03')
orders = orders.groupBy('cust_id').agg(sum('total_order_cost').alias('Revenue')).sort(desc('Revenue'))

# To validate your solution, convert your final pySpark df to a pandas df


orders.toPandas()

24. Find the rate of processed tickets for each type


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


df = facebook_complaints.withColumn('processed', col('processed').cast('integer'))
df = (df.groupBy('type')
      .agg(sum('processed').alias('process'), count('processed').alias('total'))
      .withColumn('rate', 1.0 * col('process') / col('total'))
      .select('type', 'rate'))
# To validate your solution, convert your final pySpark df to a pandas df
df.toPandas()

25. Number of violations


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


df = (sf_restaurant_health_violations
      .filter(col('business_name') == 'Roxanne Cafe')
      .withColumn('Year', year('inspection_date')))
df = df.groupBy('Year').agg(count('violation_id').alias('cnt')).sort('Year')
# To validate your solution, convert your final pySpark df to a pandas df
df.toPandas()

26. Classify Business Type


# Import your libraries
from pyspark.sql.functions import *
# Start writing code
df = sf_restaurant_health_violations.withColumn(
    'Business_type',
    when(lower(col('business_name')).contains('restaurant'), 'restaurant')
    .when(lower(col('business_name')).contains('cafe')
          | lower(col('business_name')).contains('café')
          | lower(col('business_name')).contains('coffee'), 'cafe')
    .when(lower(col('business_name')).contains('school'), 'school')
    .otherwise('other')
)

df = df.select('business_name','Business_type').distinct()

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()

27. Find students with a median writing score

# Import your libraries


from pyspark.sql.functions import *
from pyspark.sql.window import Window

win = Window.orderBy('sat_writing')
# Row 68 is assumed to be the middle row of this particular dataset (135 rows).
median_score = sat_scores.withColumn('rn', row_number().over(win)).filter(col('rn') == 68).select('sat_writing').first()[0]
# Start writing code
sat_scores = sat_scores.filter(col('sat_writing') == median_score).select('student_id')
print(sat_scores.count())
# To validate your solution, convert your final pySpark df to a pandas df
sat_scores.toPandas()
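A hedged alternative that avoids hard-coding the median row position: percentile_approx (available in pyspark.sql.functions since Spark 3.1) returns the exact median when given a large enough accuracy argument. A sketch, starting again from the original sat_scores table:

# Compute the median score, then filter students at that score.
median_val = sat_scores.agg(percentile_approx('sat_writing', 0.5, 1000000).alias('med')).first()['med']
students = sat_scores.filter(col('sat_writing') == median_val).select('student_id')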

28. User with Most Approved Flags


# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start writing code


# flag_review = flag_review.filter(col('reviewed_outcome') == 'APPROVED')
user_flags = user_flags.join(flag_review, ['flag_id'], 'left').filter(col('reviewed_outcome') == 'APPROVED')

user_flags = (user_flags
    .withColumn('names', concat(col('user_firstname'), lit(' '), col('user_lastname')))
    .groupBy('names')
    .agg(countDistinct('video_id').alias('fid'))
    .withColumn('rnk', dense_rank().over(Window.orderBy(desc('fid'))))
    .filter(col('rnk') == 1)
    .select('names'))
# user_flags = user_flags.filter((col('user_firstname') == 'Richard') & (col('user_lastname') == 'Hasson'))
# To validate your solution, convert your final pySpark df to a pandas df
user_flags.toPandas()

29. Flags per Video


# Import your libraries
from pyspark.sql.functions import *

# Start writing code

df = user_flags.withColumn('name', concat_ws(' ', col('user_firstname'), col('user_lastname')))
df2 = df.withColumn('flg', when(col('flag_id').isNotNull(), 1).otherwise(0)).filter(col('flg') == 1)

df2 = df2.groupBy('video_id').agg(countDistinct('name').alias('n_users'))

# To validate your solution, convert your final pySpark df to a pandas df


df2.toPandas()

30. Election Results


# Import your libraries

from pyspark.sql.functions import *


from pyspark.sql.window import Window

win = Window.partitionBy('voter').orderBy('voter')
# Start writing code
voting_results = voting_results.filter(col('candidate').isNotNull())
df = voting_results.withColumn('rnk', row_number().over(win)).select('voter', 'rnk')
df = df.groupBy('voter').agg(max('rnk').alias('n_vote')).withColumn('votes', 1.0 / col('n_vote'))

df2 = (df.join(voting_results, ['voter'], 'right')
       .groupBy('candidate')
       .agg(sum('votes').alias('total_votes'))
       .orderBy(desc('total_votes')))

win2 = Window.orderBy(desc('total_votes'))
df2 = df2.withColumn('rnk', dense_rank().over(win2)).filter('rnk < 2').select('candidate')

# To validate your solution, convert your final pySpark df to a pandas df


df2.toPandas()
Hard

1. Monthly Percentage Difference


# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

wins = Window.orderBy('month_col')

# Start writing code


df = sf_transactions.withColumn('month_col', date_format('created_at', 'yyyy-MM'))
df = df.groupBy('month_col').agg(sum('value').alias('revenue')).sort('month_col')

df = df.withColumn('prev_rev', lag('revenue', 1).over(wins))
df = df.withColumn('pct', round(100 * (col('revenue') - col('prev_rev')) / col('prev_rev'), 2)).drop('revenue', 'prev_rev')

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()

2. Premium vs Freemium
# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start writing code


df = ms_user_dimension.join(ms_acc_dimension, ['acc_id'], 'inner').join(ms_download_facts, ['user_id'], 'inner')

df = df.groupBy('date').pivot('paying_customer').sum('downloads')
df = df.filter(col('no') > col('yes'))

# To validate your solution, convert your final pySpark df to a pandas df


df.toPandas()
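A small note on the pivot: listing the expected category values up front spares Spark an extra job to discover them. Assuming paying_customer only ever takes the values 'yes' and 'no', the pivot line above could instead be written as:

df = df.groupBy('date').pivot('paying_customer', ['yes', 'no']).sum('downloads')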

3. City With Most Amenities


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


df = airbnb_search_details.withColumn('amenities', regexp_replace('amenities', r'[{}]', ''))
df = df.withColumn('amenities', explode(split('amenities',',')))
df = df.groupBy('city').agg(count('amenities').alias('city_cnt')).sort(desc('city_cnt'))
df2 = df.select(max('city_cnt')).collect()

df3 = df.filter(col('city_cnt') == df2[0][0]).select('city')

# To validate your solution, convert your final pySpark df to a pandas df


df3.toPandas()
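The max-then-filter via collect() above pulls an intermediate result to the driver; a hedged single-pipeline variant uses a rank window instead (assuming Window is imported from pyspark.sql.window):

wins = Window.orderBy(desc('city_cnt'))
df3 = df.withColumn('rnk', rank().over(wins)).filter(col('rnk') == 1).select('city')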

4. Host Popularity Rental Prices


# Import your libraries
from pyspark.sql.functions import *

# Start writing code


df = airbnb_host_searches.fillna(0, ['review_scores_rating'])
df = df.withColumn('host_id', concat('price', 'room_type', 'host_since', 'number_of_reviews'))
df = df.select('host_id', 'number_of_reviews', 'price').dropDuplicates()
df = df.withColumn('number_of_reviews', col('number_of_reviews').cast('int'))
df = df.withColumn('host_review_pop',
                   when(col('number_of_reviews') == 0, 'New')
                   .when(col('number_of_reviews').between(1, 5), 'Rising')
                   .when(col('number_of_reviews').between(6, 15), 'Trending Up')
                   .when(col('number_of_reviews').between(16, 40), 'Popular')
                   .otherwise('Hot'))

df2 = df.groupBy('host_review_pop').agg(min('price').alias('min_price'),
                                        avg('price').alias('avg_price'),
                                        max('price').alias('max_price'))
# df = df.filter(col('review_scores_rating').between(6, 15)).select('review_scores_rating').distinct()
# To validate your solution, convert your final pySpark df to a pandas df
df2.toPandas()

5. Retention Rate

# Import your libraries


from pyspark.sql.functions import *
from pyspark.sql.window import Window



# Start writing code
df = sf_events.withColumn('date_month', date_format('date', 'yyyy-MM'))
df_dec = df.filter(col('date_month').like('%2020-12%')).select('account_id', 'user_id').dropDuplicates()
df_jan = df.filter(col('date_month').like('%2021-01%')).select('account_id', 'user_id').dropDuplicates()
df_feb = df.filter(col('date_month').like('%2021-02%')).select('account_id', 'user_id').dropDuplicates()
max_date = df.groupBy('user_id').agg(max('date_month').alias('max_date'))

df_dec2 = df_dec.join(max_date, ['user_id'], 'inner')


dec_ret = df_dec2.withColumn('retention', when(col('max_date')>'2020-12', 1).otherwise(0))
dec_ret = dec_ret.groupBy('account_id').agg(mean('retention').alias('dec_retention'))
df_jan2 = df_jan.join(max_date, ['user_id'], 'inner')
jan_ret = df_jan2.withColumn('retention', when(col('max_date')>'2021-01', 1).otherwise(0))
jan_ret = jan_ret.groupBy('account_id').agg(mean('retention').alias('jan_retention'))

df_retention = dec_ret.join(jan_ret, ['account_id'], 'inner')


df_retention2 = df_retention.withColumn('retention', 1.0 * col('jan_retention') / col('dec_retention')).select('account_id', 'retention')

# To validate your solution, convert your final pySpark df to a pandas df


df_retention2.toPandas()

6. The Most Popular Client_Id Among Users Using Video and Voice Calls
# Import your libraries
from pyspark.sql.functions import *
from pyspark.sql.window import Window

event_type_msg = ['video call received', 'video call sent', 'voice call received', 'voice call sent']
# Start writing code
df = fact_events.withColumn('flag', when(col('event_type').isin(event_type_msg), 1).otherwise(0))
df2 = df.groupBy('user_id').agg(count('flag').alias('cnt_us'), sum('flag').alias('sum_us'))
# df = df.filter('flag = 1').groupBy('client_id').agg(count('user_id').alias('cnt_us'))
df2 = df2.withColumn('pct', 100 * (1.0 * col('sum_us') / col('cnt_us'))).filter('pct >= 50')
final_df = fact_events.join(df2, ['user_id'], 'inner').select(df2['*'], fact_events.client_id)

final_df = final_df.groupBy('client_id').agg(count('*').alias('cnt'))
final_df2 = final_df.select(max('cnt')).collect()
finaldf = final_df.filter(col('cnt') == final_df2[0][0]).select('client_id')

# To validate your solution, convert your final pySpark df to a pandas df


finaldf.toPandas()

THANK YOU
