M5 - Custom Model Building With SQL in BigQuery ML Slides
M5 - Custom Model Building With SQL in BigQuery ML Slides
Supported Models
BigQuery ML is a way to build custom models
Build a Custom Model
Build Custom Model (codeless)
Call a Pretrained Model
1 Techcrunch
2 GitHub
3 NY Times
SQL query to extract data
-- *No clusters, no indexes: this is an ad hoc query.
-- Return ten stories (arbitrary, since there is no ORDER BY) that have a
-- title longer than 10 characters and a non-empty URL.
SELECT
  url,
  title
FROM
  `bigquery-public-data.hacker_news.stories`
WHERE
  LENGTH(title) > 10
  AND LENGTH(url) > 0
LIMIT 10
Use regex to get source + train on words of title
https://console.cloud.google.com/bigquery?sq=711916710713:47df84978c64458ea04b3cb4ae5de878
Create model
-- Train a logistic-regression classifier whose label is `source` and whose
-- features are the first five words of the story title.
CREATE OR REPLACE MODEL advdata.txtclass
OPTIONS(model_type='logistic_reg',
        input_label_cols=['source'])
AS
-- Query to extract training data
WITH extracted AS (
  -- (body elided on the slide; it must expose `title` and `source`)
  ...
)
, ds AS (
  -- Append five 'NULL' tokens to every title so that OFFSET(0)..OFFSET(4)
  -- below never index past the end of a title with fewer than five words.
  SELECT
    ARRAY_CONCAT(SPLIT(title, " "),
                 ['NULL', 'NULL', 'NULL', 'NULL', 'NULL']) AS words,
    source
  FROM extracted
  WHERE (source = 'github' OR source = 'nytimes' OR source = 'techcrunch')
)
SELECT
  source,
  words[OFFSET(0)] AS word1,
  words[OFFSET(1)] AS word2,
  words[OFFSET(2)] AS word3,
  words[OFFSET(3)] AS word4,
  words[OFFSET(4)] AS word5
FROM ds
Evaluate model
-- Report the evaluation metrics of the trained text classifier.
SELECT
  *
FROM
  ML.EVALUATE(MODEL advdata.txtclass)
(BQML splits the training data and reports evaluation statistics on the
held-out set)
Predict using trained model
-- Classify four sample headlines, each supplied as its first five words.
-- Column names come from the first SELECT; the UNION ALL branches are
-- matched to them by position.
SELECT
  *
FROM
  ML.PREDICT(
    MODEL advdata.txtclass,
    (
      SELECT 'government' AS word1, 'shutdown' AS word2, 'leaves' AS word3, 'workers' AS word4, 'reeling' AS word5
      UNION ALL
      SELECT 'unlikely', 'partnership', 'in', 'house', 'gives'
      UNION ALL
      SELECT 'fitbit', 's', 'fitness', 'tracker', 'is'
      UNION ALL
      SELECT 'downloading', 'the', 'android', 'studio', 'project'
    )
  )
https://console.cloud.google.com/bigquery?sq=663413318684:4d854a43ae93416eaeb349e1fc4888cb
Demo: Train a model with BigQuery ML to
predict NYC taxi fares
Agenda
BigQuery ML for Quick Model
Building
Supported Models
Linear Classifier (Logistic regression)
DNN Classifier (alpha)
xgboost Classifier (alpha)
Linear Regression
DNN Regression (alpha)
xgboost Regression (alpha)
Train on TF, predict with BigQuery
-- Import a TensorFlow model exported to Cloud Storage so that predictions
-- can be made from BigQuery.
-- The model_path must be a single unbroken string literal: a line break
-- inside the quotes is both a syntax error and a wrong GCS path.
CREATE OR REPLACE MODEL advdata.txtclass_tf2
OPTIONS (model_type='tensorflow',
         model_path='gs://cloud-training-demos-ml/txtcls/trained_finetune_native/export/exporter/1549825580/*')
-- Return `input` together with the highest-scoring class taken from the
-- `dense_1` array.
-- NOTE(review): the outer FROM clause is not shown in this fragment;
-- presumably it reads from ML.PREDICT on advdata.txtclass_tf2 — confirm
-- against the full query.
SELECT
input,
-- The subquery numbers each element of dense_1, keeps the row with the
-- largest p, and uses that row number as a 1-based index (ORDINAL) into the
-- label list.
-- NOTE(review): ROW_NUMBER() OVER() has no ORDER BY, so the index-to-label
-- mapping relies on UNNEST yielding elements in array order — confirm.
(SELECT AS STRUCT(p, ['github', 'nytimes', 'techcrunch'][ORDINAL(s)])
prediction FROM
(SELECT p, ROW_NUMBER() OVER() AS s FROM
(SELECT * FROM UNNEST(dense_1) AS p))
ORDER BY p DESC LIMIT 1).*
-- Derive an implicit rating for every (product, user) pair from how often the
-- user bought the product: 1 for a single purchase, 2 for repeat purchases.
WITH purchases AS (
  -- One row per order line.
  SELECT
    product_id,
    user_id
  FROM operations.orders_with_lines
  CROSS JOIN UNNEST(order_lines)
),
total_purchases AS (
  -- Number of times each user bought each product.
  SELECT
    product_id,
    user_id,
    COUNT(*) AS numtimes
  FROM purchases
  GROUP BY product_id, user_id
)
SELECT
  product_id,
  user_id,
  IF(numtimes < 2, 1, 2) AS rating
FROM total_purchases
So what do we recommend for a given set of users?
-- Build two candidate sets: the ten users with the most rows in
-- orders_with_lines and the ten products that appear in the most order lines.
-- NOTE(review): the final SELECT that consumes `users` and `products` is not
-- shown in this fragment; the statement that follows in the file queries a
-- different model and does not reference either CTE — confirm against the
-- original slide.
with users AS (
SELECT
user_id, count(*) as num_orders
from operations.orders_with_lines
group by user_id
order by num_orders desc
limit 10
),
products as (
-- Unnesting order_lines makes this a count of order lines per product.
select product_id, count(*) as num_orders
from operations.orders_with_lines, unnest(order_lines)
group by product_id
order by num_orders desc
limit 10
)
-- Assign every station whose name contains 'Kennington' to a cluster and
-- drop the nearest_centroids_distance column from the output.
-- NOTE(review): `stationstats` is not defined in this fragment; presumably a
-- CTE or table defined elsewhere — confirm.
SELECT
  * EXCEPT(nearest_centroids_distance)
FROM
  ML.PREDICT(
    MODEL demos_eu.london_station_clusters,
    (
      SELECT *
      FROM stationstats
      WHERE REGEXP_CONTAINS(station_name, 'Kennington')
    )
  )
Find cluster attributes
-- Pivot the model's centroids so that each cluster becomes a single row with
-- one column per numerical feature.
WITH centroid_features AS (
  -- Collect each centroid's (feature name, rounded value) pairs into an array.
  SELECT
    centroid_id,
    ARRAY_AGG(
      STRUCT(numerical_feature AS name, ROUND(feature_value,1) AS value)
      ORDER BY centroid_id
    ) AS cluster
  FROM ML.CENTROIDS(MODEL demos_eu.london_station_clusters)
  GROUP BY centroid_id
)
SELECT
  CONCAT('Cluster#', CAST(centroid_id AS STRING)) AS centroid,
  -- Each scalar subquery looks up one named feature in the centroid's array.
  (SELECT value FROM UNNEST(cluster) WHERE name = 'duration') AS duration,
  (SELECT value FROM UNNEST(cluster) WHERE name = 'num_trips') AS num_trips,
  (SELECT value FROM UNNEST(cluster) WHERE name = 'bikes_count') AS bikes_count,
  (SELECT value FROM UNNEST(cluster) WHERE name = 'distance_from_city_center')
    AS distance_from_city_center
FROM centroid_features
ORDER BY centroid_id ASC
Visualize attributes in Data Studio ...
Use the transform clause
Same Deploy
Clients Model
serving
TRANSFORM ensures transformations are
automatically applied during ML.PREDICT
-- Variant 1 (without TRANSFORM): the feature engineering lives in the
-- training query, so the same expressions have to be repeated by whoever
-- calls ML.PREDICT.
CREATE OR REPLACE MODEL ch09edu.bicycle_model
OPTIONS(input_label_cols=['duration'],
        model_type='linear_reg')
AS
SELECT
  duration
  , start_station_name
  , CAST(EXTRACT(dayofweek from start_date) AS STRING)
      as dayofweek
  , CAST(EXTRACT(hour from start_date) AS STRING)
      as hourofday
FROM
  `bigquery-public-data.london_bicycles.cycle_hire`;

-- Variant 2 (with TRANSFORM): the training query passes the raw start_date
-- and the TRANSFORM clause derives dayofweek / hourofday from it, so the same
-- transformations are applied automatically during ML.PREDICT.
-- TRANSFORM takes a bare select list (no SELECT keyword) and is written
-- before OPTIONS, following the documented CREATE MODEL grammar.
CREATE OR REPLACE MODEL ch09edu.bicycle_model
TRANSFORM(
  * EXCEPT(start_date)
  , CAST(EXTRACT(dayofweek from start_date) AS STRING)
      as dayofweek
  , CAST(EXTRACT(hour from start_date) AS STRING)
      as hourofday
)
OPTIONS(input_label_cols=['duration'],
        model_type='linear_reg')
AS
SELECT
  duration, start_station_name, start_date
FROM
  `bigquery-public-data.london_bicycles.cycle_hire`;
-- Prediction against the model trained without TRANSFORM: the caller must
-- supply the already-engineered dayofweek / hourofday features.
-- The second argument of ML.PREDICT must be a query, so it starts with SELECT.
SELECT * FROM ML.PREDICT(MODEL ch09edu.bicycle_model,(
  SELECT
  350 AS duration
  , 'Kings Cross' AS start_station_name
  , '3' as dayofweek
  , '18' as hourofday
));

-- Prediction against the model trained with TRANSFORM: the caller supplies
-- the raw start_date and the model derives the time features itself.
SELECT * FROM ML.PREDICT(MODEL ch09edu.bicycle_model,(
  SELECT
  350 AS duration
  , 'Kings Cross' AS start_station_name
  , CURRENT_TIMESTAMP() as start_date
));
Reminder: BigQuery ML Cheatsheet
● Label = alias a column as ‘label’ or specify column in OPTIONS using input_label_cols
● Feature = passed through to the model as part of your SQL SELECT statement
SELECT
  *
FROM
  ML.FEATURE_INFO(MODEL `mydataset.mymodel`)
● Query and explore the London bicycles dataset for feature engineering
● Create a linear regression model in BQML
● Evaluate the performance of your machine learning model
● Extract your model weights
Lab
Movie Recommendations in
BigQuery ML
Objectives