From 4cf9a0ef9889e932aea9fb18c71b1226a8c0bd7b Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 31 Jan 2024 15:25:29 -0600 Subject: [PATCH 1/6] docs: Add a sample to demonstrate the evaluation results --- samples/snippets/bqml_getting_started_test.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 783f963feb..14e7a3eb45 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -91,3 +91,50 @@ def test_bqml_getting_started(random_model_id): replace=True, ) # [END bigquery_dataframes_bqml_getting_started_tutorial] + + # [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate] + import bigframes.pandas as bpd + + # WHAT IS READ_GBQ DOING?! + model = bpd.read_gbq_model( + your_model_id, # For example: "bqml_tutorial.sample_model", + ) + + # The WHERE clause — _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' — + # limits the number of tables scanned by the query. The date range scanned is + # July 1, 2017 to August 1, 2017. This is the data you're using to evaluate the predictive performance + # of the model. It was collected in the month immediately following the time + # period spanned by the training data. 
+ + df = bpd.read_gbq( + """ + SELECT GENERATE_UUID() AS rowindex, * + FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` + WHERE + _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' + """, + index_col="rowindex", + ) + transactions = df["totals"].struct.field("transactions") + label = transactions.notnull().map({True: 1, False: 0}) + operatingSystem = df["device"].struct.field("operatingSystem") + operatingSystem = operatingSystem.fillna("") + isMobile = df["device"].struct.field("isMobile") + country = df["geoNetwork"].struct.field("country").fillna("") + pageviews = df["totals"].struct.field("pageviews").fillna(0) + features = bpd.DataFrame( + { + "os": operatingSystem, + "is_mobile": isMobile, + "country": country, + "pageviews": pageviews, + } + ) + + # Some models include a convenient .score(X, y) method for evaluation with a preset accuracy metric: + model.score(features, label) + # precision recall accuracy f1_score log_loss roc_auc + # 0 0.412621 0.079143 0.985074 0.132812 0.049764 0.974285 + # [1 rows x 6 columns] + # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate] From ffcf185b48edda796f2962716f516d96deb10d50 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 1 Feb 2024 12:11:49 -0600 Subject: [PATCH 2/6] Adding comments explaining logistic regression results --- samples/snippets/bqml_getting_started_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 14e7a3eb45..7767e1c484 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -133,6 +133,22 @@ def test_bqml_getting_started(random_model_id): ) # Some models include a convenient .score(X, y) method for evaluation with a preset accuracy metric: + + # Because you performed a logistic regression, the results include the following columns: + # precision — A metric for classification models. 
Precision identifies the frequency with + # which a model was correct when predicting the positive class. + # recall — A metric for classification models that answers the following question: + # Out of all the possible positive labels, how many did the model correctly identify? + # accuracy — Accuracy is the fraction of predictions that a classification model got right. + # f1_score — A measure of the accuracy of the model. The f1 score is the harmonic average of + # the precision and recall. An f1 score's best value is 1. The worst value is 0. + # log_loss — The loss function used in a logistic regression. This is the measure of how far the + # model's predictions are from the correct labels. + # roc_auc — The area under the ROC curve. This is the probability that a classifier is more confident that + # a randomly chosen positive example + # is actually positive than that a randomly chosen negative example is positive. For more information, + # see Classification in the Machine Learning Crash Course. + model.score(features, label) # precision recall accuracy f1_score log_loss roc_auc # 0 0.412621 0.079143 0.985074 0.132812 0.049764 0.974285 From 8e5ba68172cde07f9c55b55a3ef6e0942104853a Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 5 Feb 2024 10:22:06 -0600 Subject: [PATCH 3/6] editing read_gbq explanation --- samples/snippets/bqml_getting_started_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 7767e1c484..dfb46103bb 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -95,7 +95,8 @@ def test_bqml_getting_started(random_model_id): # [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate] import bigframes.pandas as bpd - # WHAT IS READ_GBQ DOING?! + # Select model you'll use for training. 'read_gbq' accepts either a SQL query + # or a table ID. 
model = bpd.read_gbq_model( your_model_id, # For example: "bqml_tutorial.sample_model", ) From 939e223ed455ddf7266491cf4c4175461e955fa2 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 5 Feb 2024 14:02:26 -0600 Subject: [PATCH 4/6] Creating link for ML course --- samples/snippets/bqml_getting_started_test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index dfb46103bb..6f39881a8e 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -148,10 +148,15 @@ def test_bqml_getting_started(random_model_id): # roc_auc — The area under the ROC curve. This is the probability that a classifier is more confident that # a randomly chosen positive example # is actually positive than that a randomly chosen negative example is positive. For more information, - # see Classification in the Machine Learning Crash Course. + # see ['Classification']('https://developers.google.com/machine-learning/crash-course/classification/video-lecture') + # in the Machine Learning Crash Course. 
model.score(features, label) # precision recall accuracy f1_score log_loss roc_auc # 0 0.412621 0.079143 0.985074 0.132812 0.049764 0.974285 # [1 rows x 6 columns] # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate] + + # [START bigquery_dataframes_bqml_getting_started_tutorial_predict] + + # [END bigquery_dataframes_bqml_getting_started_tutorial_predict] From 832356a5fc9f7866380a9164b573d1331fa98ac6 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 5 Feb 2024 14:08:03 -0600 Subject: [PATCH 5/6] Formatting metrics --- samples/snippets/bqml_getting_started_test.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 6f39881a8e..ba0b6d1e24 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -136,16 +136,22 @@ def test_bqml_getting_started(random_model_id): # Some models include a convenient .score(X, y) method for evaluation with a preset accuracy metric: # Because you performed a logistic regression, the results include the following columns: - # precision — A metric for classification models. Precision identifies the frequency with + + # - precision — A metric for classification models. Precision identifies the frequency with # which a model was correct when predicting the positive class. - # recall — A metric for classification models that answers the following question: + + # - recall — A metric for classification models that answers the following question: # Out of all the possible positive labels, how many did the model correctly identify? - # accuracy — Accuracy is the fraction of predictions that a classification model got right. - # f1_score — A measure of the accuracy of the model. The f1 score is the harmonic average of + + # - accuracy — Accuracy is the fraction of predictions that a classification model got right. 
+ + # - f1_score — A measure of the accuracy of the model. The f1 score is the harmonic average of # the precision and recall. An f1 score's best value is 1. The worst value is 0. - # log_loss — The loss function used in a logistic regression. This is the measure of how far the + + # - log_loss — The loss function used in a logistic regression. This is the measure of how far the # model's predictions are from the correct labels. - # roc_auc — The area under the ROC curve. This is the probability that a classifier is more confident that + + # - roc_auc — The area under the ROC curve. This is the probability that a classifier is more confident that # a randomly chosen positive example # is actually positive than that a randomly chosen negative example is positive. For more information, # see ['Classification']('https://developers.google.com/machine-learning/crash-course/classification/video-lecture') From 6683e2c1f570430221cb978a6669f7d3ccb71700 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 5 Feb 2024 16:53:52 -0600 Subject: [PATCH 6/6] Update samples/snippets/bqml_getting_started_test.py --- samples/snippets/bqml_getting_started_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index ba0b6d1e24..bb282fa563 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -95,8 +95,8 @@ def test_bqml_getting_started(random_model_id): # [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate] import bigframes.pandas as bpd - # Select model you'll use for training. 'read_gbq' accepts either a SQL query - # or a table ID. + # Select the model you'll use for evaluation. `read_gbq_model` loads model data from + # BigQuery, but you could also use the `model` object from the previous steps. model = bpd.read_gbq_model( your_model_id, # For example: "bqml_tutorial.sample_model", )