やりたいこと
BigQuery に GA4 のデータが蓄積されている状態で、以下のように user_id に null がある場合、同じ user_pseudo_id であれば、その直近の過去の user_id で置換をしたい。
実現方法
FIRST_VALUE、または LAST_VALUE を使用して以下のクエリで実現できた。
上記のsampleテーブル作成用クエリ
WITH sample AS (
SELECT
null AS user_id
, '123456789' AS user_pseudo_id
, '2022-08-03 10:00:00' AS event_datetime
UNION ALL
SELECT
null AS user_id
, '123456789' AS user_pseudo_id
, '2022-08-03 11:00:00' AS event_datetime
UNION ALL
SELECT
'A' AS user_id
, '123456789' AS user_pseudo_id
, '2022-08-03 12:00:00' AS event_datetime
UNION ALL
SELECT
null AS user_id
, '123456789' AS user_pseudo_id
, '2022-08-03 13:00:00' AS event_datetime
UNION ALL
SELECT
null AS user_id
, '123456789' AS user_pseudo_id
, '2022-08-03 14:00:00' AS event_datetime
UNION ALL
SELECT
'B' AS user_id
, '123456789' AS user_pseudo_id
, '2022-08-03 15:00:00' AS event_datetime
UNION ALL
SELECT
null AS user_id
, '123456789' AS user_pseudo_id
, '2022-08-03 16:00:00' AS event_datetime
UNION ALL
SELECT
null AS user_id
, '123456789' AS user_pseudo_id
, '2022-08-03 17:00:00' AS event_datetime
)
SELECT * FROM sample
SELECT
user_id
, FIRST_VALUE(user_id IGNORE NULLS)
OVER (
PARTITION BY user_pseudo_id
ORDER BY event_datetime ASC
ROWS BETWEEN 0 PRECEDING AND UNBOUNDED FOLLOWING
) AS most_recent_user_id_first_val
, LAST_VALUE(user_id IGNORE NULLS)
OVER (
PARTITION BY user_pseudo_id
ORDER BY event_datetime DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS most_recent_user_id_last_val
, user_pseudo_id
, event_datetime
FROM sample
ORDER BY event_datetime
user_id | most_recent_user_id_first_val | most_recent_user_id_last_val | user_pseudo_id | event_datetime |
---|---|---|---|---|
null | A | A | 123456789 | 2023/05/01 10:00:00 |
null | A | A | 123456789 | 2023/05/01 11:00:00 |
A | A | A | 123456789 | 2023/05/01 12:00:00 |
null | B | B | 123456789 | 2023/05/01 13:00:00 |
null | B | B | 123456789 | 2023/05/01 14:00:00 |
B | B | B | 123456789 | 2023/05/01 15:00:00 |
null | null | null | 123456789 | 2023/05/01 16:00:00 |
null | null | null | 123456789 | 2023/05/01 17:00:00 |
参考