Target SQL Case Study
Target SQL Case Study
-- Fetch the DDL of each table in the Target dataset, one output column per table.
-- Each scalar subquery filters INFORMATION_SCHEMA.TABLES by table_name.
-- Aliases are written on a single line: the previous hard line-wraps split them
-- in half (invalid SQL), and the first alias was misspelled "csutomers".
SELECT
  (SELECT ddl FROM Target.INFORMATION_SCHEMA.TABLES WHERE table_name = 'customers') AS customers,
  (SELECT ddl FROM Target.INFORMATION_SCHEMA.TABLES WHERE table_name = 'geolocation') AS geolocation,
  (SELECT ddl FROM Target.INFORMATION_SCHEMA.TABLES WHERE table_name = 'order_items') AS order_items,
  (SELECT ddl FROM Target.INFORMATION_SCHEMA.TABLES WHERE table_name = 'orders') AS orders,
  (SELECT ddl FROM Target.INFORMATION_SCHEMA.TABLES WHERE table_name = 'order_reviews') AS order_reviews,
  (SELECT ddl FROM Target.INFORMATION_SCHEMA.TABLES WHERE table_name = 'payments') AS payments,
  (SELECT ddl FROM Target.INFORMATION_SCHEMA.TABLES WHERE table_name = 'products') AS products,
  (SELECT ddl FROM Target.INFORMATION_SCHEMA.TABLES WHERE table_name = 'sellers') AS sellers;
Result-
-- Time span of the data: earliest and latest order purchase dates.
SELECT
  DATE(MIN(o.order_purchase_timestamp)) AS start_date,
  DATE(MAX(o.order_purchase_timestamp)) AS end_date
FROM Target.orders AS o;
-- Distinct (city, state code, state name) combinations of customers.
-- Brazil_states maps the state short code (string_field_1) to the state name
-- (string_field_0). DISTINCT applies to the whole output row.
SELECT DISTINCT
  c.customer_city AS customer_city,
  st.string_field_1 AS customer_state_short_codes,
  st.string_field_0 AS customer_states
FROM Target.customers AS c
INNER JOIN Target.Brazil_states AS st
  ON st.string_field_1 = c.customer_state;
In-Depth Exploration
Seasonality and general Trend in Target’s sales as well as in the industry
Query 1 -> MOM Trend of Revenue for the Years 2017 and 2018
-- Query 1: month-over-month revenue trend for 2017 and 2018.
-- Output per (Years, Months): order count, revenue, the month's revenue rank
-- within its year, and the percentage change versus the previous month of the
-- same year (0 for the first month of each year, which has no predecessor).
WITH trends AS (
  SELECT
    EXTRACT(YEAR FROM ord.order_purchase_timestamp) AS Years,
    EXTRACT(MONTH FROM ord.order_purchase_timestamp) AS Months,
    -- DISTINCT: the join yields one row per payment, so an order paid in
    -- several payments would otherwise be counted several times.
    COUNT(DISTINCT ord.order_id) AS Total_Orders,
    ROUND(SUM(pay.payment_value), 2) AS Revenue
  FROM Target.orders AS ord
  INNER JOIN Target.payments AS pay
    ON ord.order_id = pay.order_id
  -- Row-level filter belongs in WHERE (applied before grouping), not HAVING.
  WHERE EXTRACT(YEAR FROM ord.order_purchase_timestamp) IN (2017, 2018)
  GROUP BY
    Years,
    Months
)
SELECT
  Years,
  Months,
  Total_Orders,
  Revenue,
  DENSE_RANK() OVER (PARTITION BY Years ORDER BY Revenue DESC) AS mom_rank_revenue_year_wise,
  IFNULL(
    ROUND(100 * (Revenue / LAG(Revenue, 1) OVER (PARTITION BY Years ORDER BY Months) - 1), 2),
    0
  ) AS percentage_change_revenue
FROM trends
ORDER BY
  Years,
  Months;
Result-
QUERY 2 -> YOY Trend of Revenue for the months of Jan to Aug
-- Query 2: year-over-year revenue change for January-August, 2017 vs 2018.
-- Fix: the final SELECT previously read from an undefined relation "temp1";
-- the monthly CTE is named trends, so all totals now come from it.
WITH trends AS (
  -- Monthly revenue for 2017 and 2018 (rounded per month, as before).
  SELECT
    EXTRACT(YEAR FROM ord.order_purchase_timestamp) AS Years,
    EXTRACT(MONTH FROM ord.order_purchase_timestamp) AS Months,
    ROUND(SUM(pay.payment_value), 2) AS Revenue
  FROM Target.orders AS ord
  INNER JOIN Target.payments AS pay
    ON ord.order_id = pay.order_id
  WHERE EXTRACT(YEAR FROM ord.order_purchase_timestamp) IN (2017, 2018)
  GROUP BY
    Years,
    Months
),
jan_to_aug AS (
  -- One row holding the Jan-Aug revenue total of each year.
  SELECT
    SUM(CASE WHEN Years = 2017 THEN Revenue END) AS revenue_2017,
    SUM(CASE WHEN Years = 2018 THEN Revenue END) AS revenue_2018
  FROM trends
  WHERE Months BETWEEN 1 AND 8
)
SELECT
  ROUND(revenue_2017, 2) AS revenue_2017,
  ROUND(revenue_2018, 2) AS revenue_2018,
  ROUND(100 * (revenue_2018 / revenue_2017 - 1), 2) AS percentage_change_in_revenue
FROM jan_to_aug;
RESULT 2:-
We considered months 1-8 for this analysis as per the business requirement.
In-Depth Exploration
Problem Statement:-
[0 to 6] - Dawn
(6 to 12] - Morning
(12 to 17] - Afternoon
(17 to 23] - Night
Result-
-- Orders per year and period of day, bucketed by purchase hour:
--   DAWN [0, 6], MORNING (6, 12], AFTERNOON (12, 17], NIGHT otherwise.
-- Fix: the hour is taken from each order row directly. The previous version
-- joined orders back to the CTE on order_purchase_timestamp, which is not a
-- key: orders sharing a timestamp matched each other and inflated the counts.
WITH time_of_buying AS (
  SELECT
    ord.order_id,
    EXTRACT(YEAR FROM ord.order_purchase_timestamp) AS Years,
    EXTRACT(HOUR FROM ord.order_purchase_timestamp) AS hour_of_buying
  FROM Target.orders AS ord
)
SELECT
  Years,
  CASE
    WHEN hour_of_buying >= 0 AND hour_of_buying <= 6 THEN 'DAWN'
    WHEN hour_of_buying > 6 AND hour_of_buying <= 12 THEN 'MORNING'
    WHEN hour_of_buying > 12 AND hour_of_buying <= 17 THEN 'AFTERNOON'
    ELSE 'NIGHT'
  END AS period_of_day,
  COUNT(order_id) AS total_order_count
FROM time_of_buying
GROUP BY
  Years,
  period_of_day
ORDER BY
  Years,
  total_order_count DESC;
-- Monthly order counts per customer state, ranked within each (year, month)
-- by number of orders (rank 1 = state with the most orders that month).
-- Fix: the DESC keyword was split across two lines ("DE" / "SC"), which is
-- invalid SQL. The ORDER BY inside the CTE was dropped: it has no effect on
-- the final ordering, which the outer ORDER BY defines.
WITH Target_1 AS (
  SELECT
    EXTRACT(YEAR FROM ord.order_purchase_timestamp) AS Years,
    EXTRACT(MONTH FROM ord.order_purchase_timestamp) AS Months,
    bzs.string_field_0 AS state_name,
    COUNT(ord.order_id) AS Mom_Orders_By_States
  FROM Target.orders AS ord
  INNER JOIN Target.customers AS cust
    ON ord.customer_id = cust.customer_id
  INNER JOIN Target.Brazil_states AS bzs
    ON cust.customer_state = bzs.string_field_1
  GROUP BY
    Years,
    Months,
    bzs.string_field_0
)
SELECT
  Years,
  Months,
  state_name,
  Mom_Orders_By_States,
  DENSE_RANK() OVER (PARTITION BY Years, Months ORDER BY Mom_Orders_By_States DESC) AS Mom_by_no_of_orders
FROM Target_1
ORDER BY
  Years,
  Months,
  Mom_by_no_of_orders;
-- Distinct customers per state and each state's share of the total.
-- Fix: the column reference cust.customer_unique_id was split across two
-- lines, which is invalid SQL. The per-state aggregate now lives in a CTE so
-- the window functions read plain columns instead of nested aggregates.
-- Note: Total_Customer_of_Brazil is the sum of the per-state distinct counts,
-- so a customer_unique_id appearing in several states is counted once per state.
WITH customers_per_state AS (
  SELECT
    cust.customer_state AS state_code,
    bzs.string_field_0 AS state_name,
    COUNT(DISTINCT cust.customer_unique_id) AS total_customers_by_state
  FROM Target.customers AS cust
  INNER JOIN `Target.Brazil_states` AS bzs
    ON cust.customer_state = bzs.string_field_1
  GROUP BY
    cust.customer_state,
    bzs.string_field_0
)
SELECT
  state_code,
  state_name,
  total_customers_by_state,
  SUM(total_customers_by_state) OVER () AS Total_Customer_of_Brazil,
  ROUND(total_customers_by_state / SUM(total_customers_by_state) OVER () * 100, 2) AS contribution_percentage
FROM customers_per_state
ORDER BY
  contribution_percentage DESC;
Impact on Economy: Analyze the money movement by e-commerce by looking at order prices,
freight and others.
- Percentage increase in cost of orders from 2017 to 2018 (including months between
Jan to Aug only)
- This analysis shows the growth in the company's sales and revenue
from 2017 to 2018
-- Percentage change in total payment value between January-August 2017 and
-- January-August 2018. Each CTE yields a single rounded total; the final
-- SELECT reports both totals and the relative change between them.
WITH sum_of_2017 AS (
  SELECT
    ROUND(SUM(p.payment_value), 2) AS cost_of_orders_2017
  FROM Target.orders AS o
  INNER JOIN Target.payments AS p
    ON p.order_id = o.order_id
  WHERE EXTRACT(YEAR FROM o.order_purchase_timestamp) = 2017
    AND EXTRACT(MONTH FROM o.order_purchase_timestamp) <= 8
),
sum_of_2018 AS (
  SELECT
    ROUND(SUM(p.payment_value), 2) AS cost_of_orders_2018
  FROM Target.orders AS o
  INNER JOIN Target.payments AS p
    ON p.order_id = o.order_id
  WHERE EXTRACT(YEAR FROM o.order_purchase_timestamp) = 2018
    AND EXTRACT(MONTH FROM o.order_purchase_timestamp) <= 8
)
SELECT
  (SELECT cost_of_orders_2017 FROM sum_of_2017) AS total_cost_2017,
  (SELECT cost_of_orders_2018 FROM sum_of_2018) AS total_cost_2018,
  ROUND(
    100 * (
      (SELECT cost_of_orders_2018 FROM sum_of_2018)
      / (SELECT cost_of_orders_2017 FROM sum_of_2017)
      - 1
    ),
    2
  ) AS percentage_change_cost_of_orders;
Result-
--2) Mean & Sum of price and freight value by customer state
-- Mean and sum of item price and freight value per customer state, with the
-- state's rank by mean price (highest first) and by mean freight (lowest first).
-- Fixes:
--   * the alias mean_freight_value_by_state_rank was split across two lines,
--     which is invalid SQL;
--   * total_price / total_freight_value added: the section heading asks for
--     mean AND sum, but only the means were computed;
--   * no_of_orders now counts distinct orders; order_items has one row per
--     item, so a plain COUNT counted items rather than orders.
WITH Tempt AS (
  SELECT
    bzs.string_field_1 AS state_code,
    bzs.string_field_0 AS state_name,
    ROUND(AVG(ordit.price), 2) AS mean_price,
    ROUND(AVG(ordit.freight_value), 2) AS mean_freight_value,
    COUNT(DISTINCT ordit.order_id) AS no_of_orders,
    ROUND(SUM(ordit.price), 2) AS total_price,
    ROUND(SUM(ordit.freight_value), 2) AS total_freight_value
  FROM `Target.order_items` AS ordit
  INNER JOIN Target.orders AS ord
    ON ordit.order_id = ord.order_id
  INNER JOIN Target.customers AS cust
    ON ord.customer_id = cust.customer_id
  INNER JOIN `Target.Brazil_states` AS bzs
    ON cust.customer_state = bzs.string_field_1
  GROUP BY
    bzs.string_field_1,
    bzs.string_field_0
)
SELECT
  *,
  DENSE_RANK() OVER (ORDER BY mean_price DESC) AS mean_price_by_state_rank,
  DENSE_RANK() OVER (ORDER BY mean_freight_value ASC) AS mean_freight_value_by_state_rank
FROM Tempt;
-- Number of distinct orders for each payment-installment count.
SELECT
  p.payment_installments AS payment_installments,
  COUNT(DISTINCT p.order_id) AS no_of_orders
FROM Target.payments AS p
GROUP BY
  p.payment_installments
ORDER BY
  p.payment_installments;
Grouped data by state, with mean of freight_value, time_to_delivery,
diff_estimated_delivery
Top 5 states with highest/lowest average freight value - sort in desc/asc limit 5
Top 5 states where delivery is really fast/ not so fast compared to estimated date
-- Top 5 states with the highest average freight value.
-- NOTE(review): the original statement was cut off after the last "ON".
-- The missing join condition, delivery_data join and GROUP BY are restored
-- from the sibling delivery queries in this file; the ORDER BY / LIMIT follow
-- the "highest average freight value" item in the heading above. Confirm
-- this was the intended query.
-- Fix: time_to_delivery is now delivered date minus purchase date. The
-- arguments were previously reversed, giving negative day counts.
-- diff_estimated_delivery is estimated minus actual delivery date
-- (positive = delivered earlier than estimated).
-- Averages are taken over order_items rows, so multi-item orders weigh more.
WITH delivery_data AS (
  SELECT
    ord.order_id AS order_id,
    DATE_DIFF(DATE(ord.order_delivered_customer_date), DATE(ord.order_purchase_timestamp), DAY) AS time_to_delivery,
    DATE_DIFF(DATE(ord.order_estimated_delivery_date), DATE(ord.order_delivered_customer_date), DAY) AS diff_estimated_delivery
  FROM Target.orders AS ord
)
SELECT
  bzs.string_field_1 AS state_code,
  bzs.string_field_0 AS state_name,
  ROUND(AVG(ordit.freight_value), 2) AS mean_freight_value,
  ROUND(AVG(dvd.time_to_delivery), 2) AS mean_time_to_delivery,
  ROUND(AVG(dvd.diff_estimated_delivery), 2) AS mean_diff_estimated_delivery
FROM `Target.order_items` AS ordit
INNER JOIN Target.orders AS ord
  ON ordit.order_id = ord.order_id
INNER JOIN Target.customers AS cust
  ON ord.customer_id = cust.customer_id
INNER JOIN `Target.Brazil_states` AS bzs
  ON cust.customer_state = bzs.string_field_1
INNER JOIN delivery_data AS dvd
  ON ord.order_id = dvd.order_id
GROUP BY
  bzs.string_field_1,
  bzs.string_field_0
ORDER BY
  mean_freight_value DESC
LIMIT 5;
-- Top 5 states with the lowest average time to delivery (fastest delivery).
-- Fix: time_to_delivery is now delivered date minus purchase date. The
-- arguments were previously reversed, so every value was negative and
-- "ORDER BY ... DESC" actually returned the fastest states. With the sign
-- corrected, the sort is ascending so the same states are returned, now with
-- positive day counts. NULLS LAST keeps states without any delivered order
-- out of the top rows, as before.
-- diff_estimated_delivery is estimated minus actual delivery date
-- (positive = delivered earlier than estimated).
-- Averages are taken over order_items rows, so multi-item orders weigh more.
WITH delivery_data AS (
  SELECT
    ord.order_id AS order_id,
    DATE_DIFF(DATE(ord.order_delivered_customer_date), DATE(ord.order_purchase_timestamp), DAY) AS time_to_delivery,
    DATE_DIFF(DATE(ord.order_estimated_delivery_date), DATE(ord.order_delivered_customer_date), DAY) AS diff_estimated_delivery
  FROM Target.orders AS ord
)
SELECT
  bzs.string_field_1 AS state_code,
  bzs.string_field_0 AS state_name,
  ROUND(AVG(ordit.freight_value), 2) AS mean_freight_value,
  ROUND(AVG(dvd.time_to_delivery), 2) AS mean_time_to_delivery,
  ROUND(AVG(dvd.diff_estimated_delivery), 2) AS mean_diff_estimated_delivery
FROM `Target.order_items` AS ordit
INNER JOIN Target.orders AS ord
  ON ordit.order_id = ord.order_id
INNER JOIN Target.customers AS cust
  ON ord.customer_id = cust.customer_id
INNER JOIN `Target.Brazil_states` AS bzs
  ON cust.customer_state = bzs.string_field_1
INNER JOIN delivery_data AS dvd
  ON ord.order_id = dvd.order_id
GROUP BY
  bzs.string_field_1,
  bzs.string_field_0
ORDER BY
  mean_time_to_delivery ASC NULLS LAST
LIMIT 5;
QUERY 5/6 – Top 5 states with lowest average time to delivery (i.e., top 5 states where delivery is
really fast)
-- Top 5 states where delivery is fastest (lowest average time to delivery).
-- Fix: time_to_delivery is now delivered date minus purchase date. The
-- arguments were previously reversed, so every value was negative and
-- "ORDER BY ... DESC" actually returned the fastest states. With the sign
-- corrected, the sort is ascending so the same states are returned, now with
-- positive day counts. NULLS LAST keeps states without any delivered order
-- out of the top rows, as before.
-- diff_estimated_delivery is estimated minus actual delivery date
-- (positive = delivered earlier than estimated).
-- Averages are taken over order_items rows, so multi-item orders weigh more.
WITH delivery_data AS (
  SELECT
    ord.order_id AS order_id,
    DATE_DIFF(DATE(ord.order_delivered_customer_date), DATE(ord.order_purchase_timestamp), DAY) AS time_to_delivery,
    DATE_DIFF(DATE(ord.order_estimated_delivery_date), DATE(ord.order_delivered_customer_date), DAY) AS diff_estimated_delivery
  FROM Target.orders AS ord
)
SELECT
  bzs.string_field_1 AS state_code,
  bzs.string_field_0 AS state_name,
  ROUND(AVG(ordit.freight_value), 2) AS mean_freight_value,
  ROUND(AVG(dvd.time_to_delivery), 2) AS mean_time_to_delivery,
  ROUND(AVG(dvd.diff_estimated_delivery), 2) AS mean_diff_estimated_delivery
FROM `Target.order_items` AS ordit
INNER JOIN Target.orders AS ord
  ON ordit.order_id = ord.order_id
INNER JOIN Target.customers AS cust
  ON ord.customer_id = cust.customer_id
INNER JOIN `Target.Brazil_states` AS bzs
  ON cust.customer_state = bzs.string_field_1
INNER JOIN delivery_data AS dvd
  ON ord.order_id = dvd.order_id
GROUP BY
  bzs.string_field_1,
  bzs.string_field_0
ORDER BY
  mean_time_to_delivery ASC NULLS LAST
LIMIT 5;
QUERY 4 - Top 5 states with highest average time to delivery (Also, top 5 states where delivery is
really slow)
-- Top 5 states with the highest average time to delivery (slowest delivery).
-- Fix: time_to_delivery is now delivered date minus purchase date. The
-- arguments were previously reversed, so every value was negative and the
-- ascending sort was what surfaced the slowest states. With the sign
-- corrected, the sort is descending so the same states are returned, now
-- with positive day counts.
-- diff_estimated_delivery is estimated minus actual delivery date
-- (positive = delivered earlier than estimated).
-- Averages are taken over order_items rows, so multi-item orders weigh more.
WITH delivery_data AS (
  SELECT
    ord.order_id AS order_id,
    DATE_DIFF(DATE(ord.order_delivered_customer_date), DATE(ord.order_purchase_timestamp), DAY) AS time_to_delivery,
    DATE_DIFF(DATE(ord.order_estimated_delivery_date), DATE(ord.order_delivered_customer_date), DAY) AS diff_estimated_delivery
  FROM Target.orders AS ord
)
SELECT
  bzs.string_field_1 AS state_code,
  bzs.string_field_0 AS state_name,
  ROUND(AVG(ordit.freight_value), 2) AS mean_freight_value,
  ROUND(AVG(dvd.time_to_delivery), 2) AS mean_time_to_delivery,
  ROUND(AVG(dvd.diff_estimated_delivery), 2) AS mean_diff_estimated_delivery
FROM `Target.order_items` AS ordit
INNER JOIN Target.orders AS ord
  ON ordit.order_id = ord.order_id
INNER JOIN Target.customers AS cust
  ON ord.customer_id = cust.customer_id
INNER JOIN `Target.Brazil_states` AS bzs
  ON cust.customer_state = bzs.string_field_1
INNER JOIN delivery_data AS dvd
  ON ord.order_id = dvd.order_id
GROUP BY
  bzs.string_field_1,
  bzs.string_field_0
ORDER BY
  mean_time_to_delivery DESC
LIMIT 5;
-- Monthly number of orders per payment type, ranked within each (year, month)
-- by order count (rank 1 = most used payment type that month).
-- Fix: the DESC keyword was split across two lines ("DES" / "C"), which is
-- invalid SQL. The ORDER BY inside the CTE was dropped (it does not affect
-- the final ordering), and the rank was added to the outer ORDER BY so rows
-- within a month come back in a deterministic order.
WITH temp1 AS (
  SELECT
    EXTRACT(YEAR FROM ord.order_purchase_timestamp) AS Years,
    EXTRACT(MONTH FROM ord.order_purchase_timestamp) AS Months,
    pay.payment_type,
    -- DISTINCT: an order can have several payment rows of the same type.
    COUNT(DISTINCT ord.order_id) AS no_of_payments
  FROM Target.orders AS ord
  INNER JOIN Target.payments AS pay
    ON ord.order_id = pay.order_id
  GROUP BY
    pay.payment_type,
    Months,
    Years
)
SELECT
  Years,
  Months,
  payment_type,
  no_of_payments,
  DENSE_RANK() OVER (PARTITION BY Years, Months ORDER BY no_of_payments DESC) AS rank_by_no_of_payments
FROM temp1
ORDER BY
  Years,
  Months,
  rank_by_no_of_payments;