From db8cf2d234869e7e3dcf1dd2a77afccdb0a28a39 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 26 Sep 2024 15:51:21 -0500 Subject: [PATCH 01/10] feat: add `bigframes.bigquery.create_vector_index` to assist in creating vector index on `ARRAY` columns --- bigframes/bigquery/__init__.py | 501 +----------------- bigframes/bigquery/_operations/__init__.py | 13 + bigframes/bigquery/_operations/approx_agg.py | 70 +++ bigframes/bigquery/_operations/array.py | 150 ++++++ bigframes/bigquery/_operations/json.py | 145 +++++ bigframes/bigquery/_operations/search.py | 244 +++++++++ bigframes/bigquery/_operations/struct.py | 70 +++ bigframes/core/sql.py | 39 +- tests/system/conftest.py | 5 + .../small/bigquery/test_vector_search.py | 76 +++ 10 files changed, 830 insertions(+), 483 deletions(-) create mode 100644 bigframes/bigquery/_operations/__init__.py create mode 100644 bigframes/bigquery/_operations/approx_agg.py create mode 100644 bigframes/bigquery/_operations/array.py create mode 100644 bigframes/bigquery/_operations/json.py create mode 100644 bigframes/bigquery/_operations/search.py create mode 100644 bigframes/bigquery/_operations/struct.py diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 28a818e709..9953a946d8 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -12,489 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. - """This module integrates BigQuery built-in functions for use with DataFrame objects, such as array functions: https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. 
""" - -from __future__ import annotations - -import typing -from typing import Literal, Optional, Union - -import bigframes_vendored.constants as constants - -import bigframes.core.groupby as groupby -import bigframes.core.sql -import bigframes.ml.utils as utils -import bigframes.operations as ops -import bigframes.operations.aggregations as agg_ops -import bigframes.series - -if typing.TYPE_CHECKING: - import bigframes.dataframe as dataframe - import bigframes.series as series - - -# Array functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions - - -def array_length(series: series.Series) -> series.Series: - """Compute the length of each array element in the Series. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) - >>> bbq.array_length(s) - 0 4 - 1 0 - 2 2 - dtype: Int64 - - You can also apply this function directly to Series. - - >>> s.apply(bbq.array_length, by_row=False) - 0 4 - 1 0 - 2 2 - dtype: Int64 - - Args: - series (bigframes.series.Series): A Series with array columns. - - Returns: - bigframes.series.Series: A Series of integer values indicating - the length of each element in the Series. - - """ - return series._apply_unary_op(ops.len_op) - - -def array_agg( - obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy, -) -> series.Series | dataframe.DataFrame: - """Group data and create arrays from selected columns, omitting NULLs to avoid - BigQuery errors (NULLs not allowed in arrays). - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - - For a SeriesGroupBy object: - - >>> lst = ['a', 'a', 'b', 'b', 'a'] - >>> s = bpd.Series([1, 2, 3, 4, np.nan], index=lst) - >>> bbq.array_agg(s.groupby(level=0)) - a [1. 2.] - b [3. 4.] 
- dtype: list[pyarrow] - - For a DataFrameGroupBy object: - - >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] - >>> df = bpd.DataFrame(l, columns=["a", "b", "c"]) - >>> bbq.array_agg(df.groupby(by=["b"])) - a c - b - 1.0 [2] [3] - 2.0 [1 1] [3 2] - - [2 rows x 2 columns] - - Args: - obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy): - A GroupBy object to be applied the function. - - Returns: - bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or - DataFrame containing aggregated array columns, and indexed by the - original group columns. - """ - if isinstance(obj, groupby.SeriesGroupBy): - return obj._aggregate(agg_ops.ArrayAggOp()) - elif isinstance(obj, groupby.DataFrameGroupBy): - return obj._aggregate_all(agg_ops.ArrayAggOp(), numeric_only=False) - else: - raise ValueError( - f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}" - ) - - -def array_to_string(series: series.Series, delimiter: str) -> series.Series: - """Converts array elements within a Series into delimited strings. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) - >>> bbq.array_to_string(s, delimiter=", ") - 0 H, i, ! - 1 Hello, World - 2 - 3 - 4 Hi - dtype: string - - Args: - series (bigframes.series.Series): A Series containing arrays. - delimiter (str): The string used to separate array elements. - - Returns: - bigframes.series.Series: A Series containing delimited strings. 
- - """ - return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) - - -# JSON functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions - - -def json_set( - series: series.Series, - json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]], -) -> series.Series: - """Produces a new JSON value within a Series by inserting or replacing values at - specified paths. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] - >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) - 0 {"a":100,"b":"hi"} - Name: data, dtype: string - - Args: - series (bigframes.series.Series): - The Series containing JSON data (as native JSON objects or JSON-formatted strings). - json_path_value_pairs (Sequence[Tuple[str, typing.Any]]): - Pairs of JSON path and the new value to insert/replace. - - Returns: - bigframes.series.Series: A new Series with the transformed JSON data. - - """ - # SQLGlot parser does not support the "create_if_missing => true" syntax, so - # create_if_missing is not currently implemented. - - for json_path_value_pair in json_path_value_pairs: - if len(json_path_value_pair) != 2: - raise ValueError( - "Incorrect format: Expected (, ), but found: " - + f"{json_path_value_pair}" - ) - - json_path, json_value = json_path_value_pair - series = series._apply_binary_op( - json_value, ops.JSONSet(json_path=json_path), alignment="left" - ) - return series - - -def json_extract( - series: series.Series, - json_path: str, -) -> series.Series: - """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON` - value. This function uses single quotes and brackets to escape invalid JSONPath - characters in JSON keys. 
- - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) - >>> bbq.json_extract(s, json_path="$.class") - 0 {"students":[{"id":5},{"id":12}]} - dtype: string - - Args: - series (bigframes.series.Series): - The Series containing JSON data (as native JSON objects or JSON-formatted strings). - json_path (str): - The JSON path identifying the data that you want to obtain from the input. - - Returns: - bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. - """ - return series._apply_unary_op(ops.JSONExtract(json_path=json_path)) - - -def json_extract_array( - series: series.Series, - json_path: str = "$", -) -> series.Series: - """Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON` - values. This function uses single quotes and brackets to escape invalid JSONPath - characters in JSON keys. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) - >>> bbq.json_extract_array(s) - 0 ['1' '2' '3'] - 1 ['4' '5'] - dtype: list[pyarrow] - - Args: - series (bigframes.series.Series): - The Series containing JSON data (as native JSON objects or JSON-formatted strings). - json_path (str): - The JSON path identifying the data that you want to obtain from the input. - - Returns: - bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. 
- """ - return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) - - -# Approximate aggrgate functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions - - -def approx_top_count( - series: series.Series, - number: int, -) -> series.Series: - """Returns the approximate top elements of `expression` as an array of STRUCTs. - The number parameter specifies the number of elements returned. - - Each `STRUCT` contains two fields. The first field (named `value`) contains an input - value. The second field (named `count`) contains an `INT64` specifying the number - of times the value was returned. - - Returns `NULL` if there are zero input rows. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) - >>> bbq.approx_top_count(s, number=2) - [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] - - Args: - series (bigframes.series.Series): - The Series with any data type that the `GROUP BY` clause supports. - number (int): - An integer specifying the number of times the value was returned. - - Returns: - bigframes.series.Series: A new Series with the result data. 
- """ - if number < 1: - raise ValueError("The number of approx_top_count must be at least 1") - return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number)) - - -def struct(value: dataframe.DataFrame) -> series.Series: - """Takes a DataFrame and converts it into a Series of structs with each - struct entry corresponding to a DataFrame row and each struct field - corresponding to a DataFrame column - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import bigframes.series as series - >>> bpd.options.display.progress_bar = None - - >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) - >>> df = srs.struct.explode() - >>> bbq.struct(df) - 0 {'project': 'pandas', 'version': 1} - 1 {'project': 'numpy', 'version': 2} - dtype: struct[pyarrow] - - Args: - value (bigframes.dataframe.DataFrame): - The DataFrame to be converted to a Series of structs - - Returns: - bigframes.series.Series: A new Series with struct entries representing rows of the original DataFrame - """ - block = value._block - block, result_id = block.apply_nary_op( - block.value_columns, ops.StructOp(column_names=tuple(block.column_labels)) - ) - block = block.select_column(result_id) - return bigframes.series.Series(block) - - -# Search functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions - - -def vector_search( - base_table: str, - column_to_search: str, - query: Union[dataframe.DataFrame, series.Series], - *, - query_column_to_search: Optional[str] = None, - top_k: Optional[int] = 10, - distance_type: Literal["euclidean", "cosine"] = "euclidean", - fraction_lists_to_search: Optional[float] = None, - use_brute_force: bool = False, -) -> dataframe.DataFrame: - """ - Conduct vector search which searches embeddings to find semantically similar entities. 
- - **Examples:** - - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - DataFrame embeddings for which to find nearest neighbors. The ``ARRAY`` column - is used as the search query: - - >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], - ... "embedding": [[1.0, 2.0], [3.0, 5.2]]}) - >>> bbq.vector_search( - ... base_table="bigframes-dev.bigframes_tests_sys.base_table", - ... column_to_search="my_embedding", - ... query=search_query, - ... top_k=2) - query_id embedding id my_embedding distance - 1 cat [3. 5.2] 5 [5. 5.4] 2.009975 - 0 dog [1. 2.] 1 [1. 2.] 0.0 - 0 dog [1. 2.] 4 [1. 3.2] 1.2 - 1 cat [3. 5.2] 2 [2. 4.] 1.56205 - - [4 rows x 5 columns] - - Series embeddings for which to find nearest neighbors: - - >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]], - ... index=["dog", "cat"], - ... name="embedding") - >>> bbq.vector_search( - ... base_table="bigframes-dev.bigframes_tests_sys.base_table", - ... column_to_search="my_embedding", - ... query=search_query, - ... top_k=2) - embedding id my_embedding distance - dog [1. 2.] 1 [1. 2.] 0.0 - cat [3. 5.2] 5 [5. 5.4] 2.009975 - dog [1. 2.] 4 [1. 3.2] 1.2 - cat [3. 5.2] 2 [2. 4.] 1.56205 - - [4 rows x 4 columns] - - You can specify the name of the column in the query DataFrame embeddings and distance type. - If you specify query_column_to_search_value, it will use the provided column which contains - the embeddings for which to find nearest neighbors. Otherwiese, it uses the column_to_search value. - - >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], - ... "embedding": [[1.0, 2.0], [3.0, 5.2]], - ... "another_embedding": [[0.7, 2.2], [3.3, 5.2]]}) - >>> bbq.vector_search( - ... base_table="bigframes-dev.bigframes_tests_sys.base_table", - ... column_to_search="my_embedding", - ... query=search_query, - ... distance_type="cosine", - ... query_column_to_search="another_embedding", - ... 
top_k=2) - query_id embedding another_embedding id my_embedding distance - 1 cat [3. 5.2] [3.3 5.2] 2 [2. 4.] 0.005181 - 0 dog [1. 2.] [0.7 2.2] 4 [1. 3.2] 0.000013 - 1 cat [3. 5.2] [3.3 5.2] 1 [1. 2.] 0.005181 - 0 dog [1. 2.] [0.7 2.2] 3 [1.5 7. ] 0.004697 - - [4 rows x 6 columns] - - Args: - base_table (str): - The table to search for nearest neighbor embeddings. - column_to_search (str): - The name of the base table column to search for nearest neighbor embeddings. - The column must have a type of ``ARRAY``. All elements in the array must be non-NULL. - query (bigframes.dataframe.DataFrame | bigframes.dataframe.Series): - A Series or DataFrame that provides the embeddings for which to find nearest neighbors. - query_column_to_search (str): - Specifies the name of the column in the query that contains the embeddings for which to - find nearest neighbors. The column must have a type of ``ARRAY``. All elements in - the array must be non-NULL and all values in the column must have the same array dimensions - as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame. - top_k (int, default 10): - Sepecifies the number of nearest neighbors to return. Default to 10. - distance_type (str, defalt "euclidean"): - Specifies the type of metric to use to compute the distance between two vectors. - Possible values are "euclidean" and "cosine". Default to "euclidean". - fraction_lists_to_search (float, range in [0.0, 1.0]): - Specifies the percentage of lists to search. Specifying a higher percentage leads to - higher recall and slower performance, and the converse is true when specifying a lower - percentage. It is only used when a vector index is also used. You can only specify - ``fraction_lists_to_search`` when ``use_brute_force`` is set to False. - use_brute_force (bool, default False): - Determines whether to use brute force search by skipping the vector index if one is available. - Default to False. 
- - Returns: - bigframes.dataframe.DataFrame: A DataFrame containing vector search result. - """ - if not fraction_lists_to_search and use_brute_force is True: - raise ValueError( - "You can't specify fraction_lists_to_search when use_brute_force is set to True." - ) - if ( - isinstance(query, bigframes.series.Series) - and query_column_to_search is not None - ): - raise ValueError( - "You can't specify query_column_to_search when query is a Series." - ) - # TODO(ashleyxu): Support options in vector search. b/344019989 - if fraction_lists_to_search is not None or use_brute_force is True: - raise NotImplementedError( - f"fraction_lists_to_search and use_brute_force is not supported. {constants.FEEDBACK_LINK}" - ) - options = { - "base_table": base_table, - "column_to_search": column_to_search, - "query_column_to_search": query_column_to_search, - "distance_type": distance_type, - "top_k": top_k, - "fraction_lists_to_search": fraction_lists_to_search, - "use_brute_force": use_brute_force, - } - - (query,) = utils.convert_to_dataframe(query) - sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True) - - sql = bigframes.core.sql.create_vector_search_sql( - sql_string=sql_string, options=options # type: ignore - ) - if index_col_ids is not None: - df = query._session.read_gbq(sql, index_col=index_col_ids) - else: - df = query._session.read_gbq(sql) - df.index.names = index_labels - - return df +from bigframes.bigquery._operations.array import array_length, array_agg, array_to_string +from bigframes.bigquery._operations.approx_agg import approx_top_count +from bigframes.bigquery._operations.json import json_set, json_extract, json_extract_array +from bigframes.bigquery._operations.search import create_vector_index, vector_search +from bigframes.bigquery._operations.struct import struct + + +__all__ = [ + "array_length", + "array_agg", + "array_to_string", + "json_set", + "json_extract", + "json_extract_array", + "approx_top_count", + "struct", + 
"create_vector_index", + "vector_search", +] diff --git a/bigframes/bigquery/_operations/__init__.py b/bigframes/bigquery/_operations/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/bigframes/bigquery/_operations/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py new file mode 100644 index 0000000000..7717136bf8 --- /dev/null +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -0,0 +1,70 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import typing + +import bigframes_vendored.constants as constants + +import bigframes.core.groupby as groupby +import bigframes.core.sql +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series + + +""" +Approximate functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions +""" + +def approx_top_count( + series: series.Series, + number: int, +) -> series.Series: + """Returns the approximate top elements of `expression` as an array of STRUCTs. + The number parameter specifies the number of elements returned. + + Each `STRUCT` contains two fields. The first field (named `value`) contains an input + value. The second field (named `count`) contains an `INT64` specifying the number + of times the value was returned. + + Returns `NULL` if there are zero input rows. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) + >>> bbq.approx_top_count(s, number=2) + [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] + + Args: + series (bigframes.series.Series): + The Series with any data type that the `GROUP BY` clause supports. + number (int): + An integer specifying the number of times the value was returned. + + Returns: + bigframes.series.Series: A new Series with the result data. 
+ """ + if number < 1: + raise ValueError("The number of approx_top_count must be at least 1") + return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number)) + diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py new file mode 100644 index 0000000000..e43063af7d --- /dev/null +++ b/bigframes/bigquery/_operations/array.py @@ -0,0 +1,150 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Array functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions +""" + + +from __future__ import annotations + +import typing + +import bigframes_vendored.constants as constants + +import bigframes.core.groupby as groupby +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series + +def array_length(series: series.Series) -> series.Series: + """Compute the length of each array element in the Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) + >>> bbq.array_length(s) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + You can also apply this function directly to Series. 
+ + >>> s.apply(bbq.array_length, by_row=False) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + Args: + series (bigframes.series.Series): A Series with array columns. + + Returns: + bigframes.series.Series: A Series of integer values indicating + the length of each element in the Series. + + """ + return series._apply_unary_op(ops.len_op) + + +def array_agg( + obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy, +) -> series.Series | dataframe.DataFrame: + """Group data and create arrays from selected columns, omitting NULLs to avoid + BigQuery errors (NULLs not allowed in arrays). + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + For a SeriesGroupBy object: + + >>> lst = ['a', 'a', 'b', 'b', 'a'] + >>> s = bpd.Series([1, 2, 3, 4, np.nan], index=lst) + >>> bbq.array_agg(s.groupby(level=0)) + a [1. 2.] + b [3. 4.] + dtype: list[pyarrow] + + For a DataFrameGroupBy object: + + >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + >>> df = bpd.DataFrame(l, columns=["a", "b", "c"]) + >>> bbq.array_agg(df.groupby(by=["b"])) + a c + b + 1.0 [2] [3] + 2.0 [1 1] [3 2] + + [2 rows x 2 columns] + + Args: + obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy): + A GroupBy object to be applied the function. + + Returns: + bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or + DataFrame containing aggregated array columns, and indexed by the + original group columns. + """ + if isinstance(obj, groupby.SeriesGroupBy): + return obj._aggregate(agg_ops.ArrayAggOp()) + elif isinstance(obj, groupby.DataFrameGroupBy): + return obj._aggregate_all(agg_ops.ArrayAggOp(), numeric_only=False) + else: + raise ValueError( + f"Unsupported type {type(obj)} to apply `array_agg` function. 
{constants.FEEDBACK_LINK}" + ) + + +def array_to_string(series: series.Series, delimiter: str) -> series.Series: + """Converts array elements within a Series into delimited strings. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) + >>> bbq.array_to_string(s, delimiter=", ") + 0 H, i, ! + 1 Hello, World + 2 + 3 + 4 Hi + dtype: string + + Args: + series (bigframes.series.Series): A Series containing arrays. + delimiter (str): The string used to separate array elements. + + Returns: + bigframes.series.Series: A Series containing delimited strings. + + """ + return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) \ No newline at end of file diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py new file mode 100644 index 0000000000..197406e07a --- /dev/null +++ b/bigframes/bigquery/_operations/json.py @@ -0,0 +1,145 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +""" +JSON functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions +""" + + +from __future__ import annotations + +import typing + +import bigframes_vendored.constants as constants + +import bigframes.core.groupby as groupby +import bigframes.core.sql +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series + + +def json_set( + series: series.Series, + json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]], +) -> series.Series: + """Produces a new JSON value within a Series by inserting or replacing values at + specified paths. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] + >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) + 0 {"a":100,"b":"hi"} + Name: data, dtype: string + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path_value_pairs (Sequence[Tuple[str, typing.Any]]): + Pairs of JSON path and the new value to insert/replace. + + Returns: + bigframes.series.Series: A new Series with the transformed JSON data. + + """ + # SQLGlot parser does not support the "create_if_missing => true" syntax, so + # create_if_missing is not currently implemented. 
+ + for json_path_value_pair in json_path_value_pairs: + if len(json_path_value_pair) != 2: + raise ValueError( + "Incorrect format: Expected (, ), but found: " + + f"{json_path_value_pair}" + ) + + json_path, json_value = json_path_value_pair + series = series._apply_binary_op( + json_value, ops.JSONSet(json_path=json_path), alignment="left" + ) + return series + + +def json_extract( + series: series.Series, + json_path: str, +) -> series.Series: + """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON` + value. This function uses single quotes and brackets to escape invalid JSONPath + characters in JSON keys. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) + >>> bbq.json_extract(s, json_path="$.class") + 0 {"students":[{"id":5},{"id":12}]} + dtype: string + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. + """ + return series._apply_unary_op(ops.JSONExtract(json_path=json_path)) + + +def json_extract_array( + series: series.Series, + json_path: str = "$", +) -> series.Series: + """Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON` + values. This function uses single quotes and brackets to escape invalid JSONPath + characters in JSON keys. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) + >>> bbq.json_extract_array(s) + 0 ['1' '2' '3'] + 1 ['4' '5'] + dtype: list[pyarrow] + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. + """ + return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) + diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py new file mode 100644 index 0000000000..640dc44791 --- /dev/null +++ b/bigframes/bigquery/_operations/search.py @@ -0,0 +1,244 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import json +import typing +from typing import Collection, Literal, Mapping, Optional, Union + +import bigframes_vendored.constants as constants + +import bigframes.core.sql +import bigframes.ml.utils as utils + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series + import bigframes.session + +""" +Search functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions +""" + + +def create_vector_index( + table_id: str, + column_name: str, + *, + replace: bool = False, + index_name: Optional[str]= None, + distance_type="cosine", + stored_column_names: Collection[str] = (), + index_type: str = "ivf", + ivf_options: Optional[Mapping] = None, + tree_ah_options: Optional[Mapping] = None, + session: Optional[bigframes.session.Session] = None, +) -> None: + """ + Creates a new vector index on a column of a table. + + This method calls the `CREATE VECTOR INDEX DDL statement + `_. + + """ + import bigframes.pandas + + if index_name is None: + index_name = table_id.split(".")[-1] + + options = { + "index_type": index_type.upper(), + "distance_type": distance_type.upper(), + } + + if ivf_options is not None: + options["ivf_options"] = json.dumps(ivf_options) + + if tree_ah_options is not None: + options["tree_ah_options"] = json.dumps(tree_ah_options) + + sql = bigframes.core.sql.create_vector_index_ddl( + replace=replace, + index_name=index_name, + table_name=table_id, + column_name=column_name, + stored_column_names=stored_column_names, + options=options, + ) + + # Use global read_gbq to execute this for better location autodetection. 
+ if session is None: + read_gbq_query = bigframes.pandas.read_gbq_query + else: + read_gbq_query = session.read_gbq_query + + read_gbq_query(sql) + + + +def vector_search( + base_table: str, + column_to_search: str, + query: Union[dataframe.DataFrame, series.Series], + *, + query_column_to_search: Optional[str] = None, + top_k: Optional[int] = 10, + distance_type: Literal["euclidean", "cosine"] = "euclidean", + fraction_lists_to_search: Optional[float] = None, + use_brute_force: bool = False, +) -> dataframe.DataFrame: + """ + Conduct vector search which searches embeddings to find semantically similar entities. + + This method calls the `VECTOR_SEARCH() SQL function + `_. + + **Examples:** + + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + DataFrame embeddings for which to find nearest neighbors. The ``ARRAY`` column + is used as the search query: + + >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], + ... "embedding": [[1.0, 2.0], [3.0, 5.2]]}) + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... top_k=2) + query_id embedding id my_embedding distance + 1 cat [3. 5.2] 5 [5. 5.4] 2.009975 + 0 dog [1. 2.] 1 [1. 2.] 0.0 + 0 dog [1. 2.] 4 [1. 3.2] 1.2 + 1 cat [3. 5.2] 2 [2. 4.] 1.56205 + + [4 rows x 5 columns] + + Series embeddings for which to find nearest neighbors: + + >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]], + ... index=["dog", "cat"], + ... name="embedding") + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... top_k=2) + embedding id my_embedding distance + dog [1. 2.] 1 [1. 2.] 0.0 + cat [3. 5.2] 5 [5. 5.4] 2.009975 + dog [1. 2.] 4 [1. 3.2] 1.2 + cat [3. 5.2] 2 [2. 4.] 
1.56205 + + [4 rows x 4 columns] + + You can specify the name of the column in the query DataFrame embeddings and distance type. + If you specify query_column_to_search_value, it will use the provided column which contains + the embeddings for which to find nearest neighbors. Otherwiese, it uses the column_to_search value. + + >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], + ... "embedding": [[1.0, 2.0], [3.0, 5.2]], + ... "another_embedding": [[0.7, 2.2], [3.3, 5.2]]}) + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... distance_type="cosine", + ... query_column_to_search="another_embedding", + ... top_k=2) + query_id embedding another_embedding id my_embedding distance + 1 cat [3. 5.2] [3.3 5.2] 2 [2. 4.] 0.005181 + 0 dog [1. 2.] [0.7 2.2] 4 [1. 3.2] 0.000013 + 1 cat [3. 5.2] [3.3 5.2] 1 [1. 2.] 0.005181 + 0 dog [1. 2.] [0.7 2.2] 3 [1.5 7. ] 0.004697 + + [4 rows x 6 columns] + + Args: + base_table (str): + The table to search for nearest neighbor embeddings. + column_to_search (str): + The name of the base table column to search for nearest neighbor embeddings. + The column must have a type of ``ARRAY``. All elements in the array must be non-NULL. + query (bigframes.dataframe.DataFrame | bigframes.dataframe.Series): + A Series or DataFrame that provides the embeddings for which to find nearest neighbors. + query_column_to_search (str): + Specifies the name of the column in the query that contains the embeddings for which to + find nearest neighbors. The column must have a type of ``ARRAY``. All elements in + the array must be non-NULL and all values in the column must have the same array dimensions + as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame. + top_k (int, default 10): + Sepecifies the number of nearest neighbors to return. Default to 10. 
+        distance_type (str, default "euclidean"):
+            Specifies the type of metric to use to compute the distance between two vectors.
+            Possible values are "euclidean" and "cosine". Default to "euclidean".
+        fraction_lists_to_search (float, range in [0.0, 1.0]):
+            Specifies the percentage of lists to search. Specifying a higher percentage leads to
+            higher recall and slower performance, and the converse is true when specifying a lower
+            percentage. It is only used when a vector index is also used. You can only specify
+            ``fraction_lists_to_search`` when ``use_brute_force`` is set to False.
+        use_brute_force (bool, default False):
+            Determines whether to use brute force search by skipping the vector index if one is available.
+            Default to False.
+
+    Returns:
+        bigframes.dataframe.DataFrame: A DataFrame containing vector search result.
+    """
+    import bigframes.series
+
+    if not fraction_lists_to_search and use_brute_force is True:
+        raise ValueError(
+            "You can't specify fraction_lists_to_search when use_brute_force is set to True."
+        )
+    if (
+        isinstance(query, bigframes.series.Series)
+        and query_column_to_search is not None
+    ):
+        raise ValueError(
+            "You can't specify query_column_to_search when query is a Series."
+        )
+    # TODO(ashleyxu): Support options in vector search. b/344019989
+    if fraction_lists_to_search is not None or use_brute_force is True:
+        raise NotImplementedError(
+            f"fraction_lists_to_search and use_brute_force is not supported. 
{constants.FEEDBACK_LINK}" + ) + options = { + "base_table": base_table, + "column_to_search": column_to_search, + "query_column_to_search": query_column_to_search, + "distance_type": distance_type, + "top_k": top_k, + "fraction_lists_to_search": fraction_lists_to_search, + "use_brute_force": use_brute_force, + } + + (query,) = utils.convert_to_dataframe(query) + sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True) + + sql = bigframes.core.sql.create_vector_search_sql( + sql_string=sql_string, options=options # type: ignore + ) + if index_col_ids is not None: + df = query._session.read_gbq(sql, index_col=index_col_ids) + else: + df = query._session.read_gbq(sql) + df.index.names = index_labels + + return df \ No newline at end of file diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py new file mode 100644 index 0000000000..1c77befc8e --- /dev/null +++ b/bigframes/bigquery/_operations/struct.py @@ -0,0 +1,70 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""This module integrates BigQuery built-in functions for use with DataFrame objects, +such as array functions: +https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. 
""" + + +from __future__ import annotations + +import typing + +import bigframes_vendored.constants as constants + +import bigframes.core.groupby as groupby +import bigframes.core.sql +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series + + +def struct(value: dataframe.DataFrame) -> series.Series: + """Takes a DataFrame and converts it into a Series of structs with each + struct entry corresponding to a DataFrame row and each struct field + corresponding to a DataFrame column + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import bigframes.series as series + >>> bpd.options.display.progress_bar = None + + >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) + >>> df = srs.struct.explode() + >>> bbq.struct(df) + 0 {'project': 'pandas', 'version': 1} + 1 {'project': 'numpy', 'version': 2} + dtype: struct[pyarrow] + + Args: + value (bigframes.dataframe.DataFrame): + The DataFrame to be converted to a Series of structs + + Returns: + bigframes.series.Series: A new Series with struct entries representing rows of the original DataFrame + """ + import bigframes.series + + block = value._block + block, result_id = block.apply_nary_op( + block.value_columns, ops.StructOp(column_names=tuple(block.column_labels)) + ) + block = block.select_column(result_id) + return bigframes.series.Series(block) \ No newline at end of file diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 528c9bcc74..a90890d2b4 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -19,7 +19,7 @@ import datetime import math -from typing import Iterable, Mapping, TYPE_CHECKING, Union +from typing import Collection, Iterable, Mapping, TYPE_CHECKING, Union import bigframes.core.compile.googlesql as googlesql @@ -118,6 +118,43 @@ def 
ordering_clause( return f"ORDER BY {' ,'.join(parts)}" +def create_vector_index_ddl( + *, + replace: bool, + index_name: str, + table_name: str, + column_name: str, + stored_column_names: Collection[str], + options: Mapping[str, Union[str | int | bool | float]] = {}, +) -> str: + """Encode the VECTOR INDEX statement for BigQuery Vector Search.""" + + if replace: + create = "CREATE OR REPLACE VECTOR INDEX " + else: + create = "CREATE VECTOR INDEX IF NOT EXISTS " + + if len(stored_column_names) > 0: + escaped_stored = [f"`{name}`" for name in stored_column_names] + storing = f"STORING({', '.join(escaped_stored)}) " + else: + storing = "" + + options = ", ".join([ + f"{option_name} = {simple_literal(option_value)}" + for option_name, option_value + in options.items() + ]) + + return f""" + {create} {index_name} + ON `{table_name}`(`{column_name}`) + {storing} + OPTIONS({options}); + """ + + + def create_vector_search_sql( sql_string: str, options: Mapping[str, Union[str | int | bool | float]] = {}, diff --git a/tests/system/conftest.py b/tests/system/conftest.py index d9246eecfb..840b9fcfa7 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -240,6 +240,11 @@ def dataset_id_permanent_tokyo( return dataset_id +@pytest.fixture(scope="session") +def table_id_not_created(dataset_id: str): + return f"{dataset_id}.{prefixer.create_prefix()}" + + @pytest.fixture(scope="session") def scalars_schema(bigquery_client: bigquery.Client): # TODO(swast): Add missing scalar data types such as BIGNUMERIC. diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 4280c0a888..bbe302f0de 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -12,13 +12,89 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import random + +import google.cloud.bigquery import numpy as np import pandas as pd +import pyarrow +import pytest import bigframes.bigquery as bbq import bigframes.pandas as bpd +# Need at least 5,000 rows to create a vector index. +VECTOR_DF = pd.DataFrame( + { + "rowid": np.arange(9_999), + # 3D values, clustered around the three unit vector axes. + "my_embedding": pd.Series( + [ + [ + 1 + (random.random() - 0.5) if (row % 3) == 0 else 0, + 1 + (random.random() - 0.5) if (row % 3) == 1 else 0, + 1 + (random.random() - 0.5) if (row % 3) == 2 else 0, + ] + for row in range(9_999) + ], + dtype=pd.ArrowDtype(pyarrow.list_(pyarrow.float64())), + ), + # Three groups of animal, vegetable, and mineral, corresponding to + # the embeddings above. + "mystery_word": [ + "aarvark", "broccoli", "calcium", + "dog", "eggplant", "ferrite", + "gopher", "huckleberry", "ice", + ] * 1_111, + }, +) + + +@pytest.fixture +def vector_table_id( + bigquery_client: google.cloud.bigquery.Client, + # Use non-US location to ensure location autodetection works. + table_id_not_created: str, +): + bigquery_client.load_table_from_dataframe(VECTOR_DF, table_id_not_created).result() + yield table_id_not_created + bigquery_client.delete_table(table_id_not_created, not_found_ok=True) + + +def test_create_vector_index_ivf(session, vector_table_id: str, bigquery_client: google.cloud.bigquery.Client): + bbq.create_vector_index( + vector_table_id, + "my_embedding", + distance_type="cosine", + stored_column_names=["mystery_word"], + index_type="ivf", + ivf_options={"num_lists": 3}, + session=session, + ) + + # Check that the index was created successfully. 
+ project_id, dataset_id, table_name = vector_table_id.split(".") + indexes = bigquery_client.query_and_wait( + f""" + SELECT index_catalog, index_schema, table_name, index_name, index_column_name + FROM `{project_id}`.`{dataset_id}`.INFORMATION_SCHEMA.VECTOR_INDEX_COLUMNS + WHERE table_name = '{table_name}'; + """ + ).to_dataframe() + + # There should only be one vector index. + assert len(indexes.index) == 1 + assert indexes["index_catalog"].iloc[0] == project_id + assert indexes["index_schema"].iloc[0] == dataset_id + assert indexes["table_name"].iloc[0] == table_name + assert indexes["index_column_name"].iloc[0] == "my_embedding" + + # If no name is specified, use the table name as the index name + assert indexes["index_name"].iloc[0] == table_name + + + def test_vector_search_basic_params_with_df(): search_query = bpd.DataFrame( { From 6e537d1f92082e6a50d15e72c66f2ca387042be0 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Thu, 26 Sep 2024 20:56:56 +0000 Subject: [PATCH 02/10] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/bigquery/__init__.py | 13 +++++++++--- bigframes/bigquery/_operations/approx_agg.py | 2 +- bigframes/bigquery/_operations/array.py | 3 ++- bigframes/bigquery/_operations/json.py | 1 - bigframes/bigquery/_operations/search.py | 13 ++++++------ bigframes/bigquery/_operations/struct.py | 4 ++-- bigframes/core/sql.py | 14 ++++++------- .../small/bigquery/test_vector_search.py | 21 ++++++++++++------- 8 files changed, 42 insertions(+), 29 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 9953a946d8..0b2d2d5aeb 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -16,13 +16,20 @@ such as array functions: 
https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. """ -from bigframes.bigquery._operations.array import array_length, array_agg, array_to_string from bigframes.bigquery._operations.approx_agg import approx_top_count -from bigframes.bigquery._operations.json import json_set, json_extract, json_extract_array +from bigframes.bigquery._operations.array import ( + array_agg, + array_length, + array_to_string, +) +from bigframes.bigquery._operations.json import ( + json_extract, + json_extract_array, + json_set, +) from bigframes.bigquery._operations.search import create_vector_index, vector_search from bigframes.bigquery._operations.struct import struct - __all__ = [ "array_length", "array_agg", diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py index 7717136bf8..744893ef64 100644 --- a/bigframes/bigquery/_operations/approx_agg.py +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -33,6 +33,7 @@ https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions """ + def approx_top_count( series: series.Series, number: int, @@ -67,4 +68,3 @@ def approx_top_count( if number < 1: raise ValueError("The number of approx_top_count must be at least 1") return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number)) - diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index e43063af7d..afb5c93bbc 100644 --- a/bigframes/bigquery/_operations/array.py +++ b/bigframes/bigquery/_operations/array.py @@ -32,6 +32,7 @@ import bigframes.dataframe as dataframe import bigframes.series as series + def array_length(series: series.Series) -> series.Series: """Compute the length of each array element in the Series. @@ -147,4 +148,4 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: bigframes.series.Series: A Series containing delimited strings. 
""" - return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) \ No newline at end of file + return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 197406e07a..dc3efc0200 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -142,4 +142,3 @@ def json_extract_array( bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. """ return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) - diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index 640dc44791..392538d9da 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -39,7 +39,7 @@ def create_vector_index( column_name: str, *, replace: bool = False, - index_name: Optional[str]= None, + index_name: Optional[str] = None, distance_type="cosine", stored_column_names: Collection[str] = (), index_type: str = "ivf", @@ -52,21 +52,21 @@ def create_vector_index( This method calls the `CREATE VECTOR INDEX DDL statement `_. 
- + """ import bigframes.pandas if index_name is None: index_name = table_id.split(".")[-1] - + options = { "index_type": index_type.upper(), "distance_type": distance_type.upper(), } - + if ivf_options is not None: options["ivf_options"] = json.dumps(ivf_options) - + if tree_ah_options is not None: options["tree_ah_options"] = json.dumps(tree_ah_options) @@ -86,7 +86,6 @@ def create_vector_index( read_gbq_query = session.read_gbq_query read_gbq_query(sql) - def vector_search( @@ -241,4 +240,4 @@ def vector_search( df = query._session.read_gbq(sql) df.index.names = index_labels - return df \ No newline at end of file + return df diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py index 1c77befc8e..bb8082458e 100644 --- a/bigframes/bigquery/_operations/struct.py +++ b/bigframes/bigquery/_operations/struct.py @@ -61,10 +61,10 @@ def struct(value: dataframe.DataFrame) -> series.Series: bigframes.series.Series: A new Series with struct entries representing rows of the original DataFrame """ import bigframes.series - + block = value._block block, result_id = block.apply_nary_op( block.value_columns, ops.StructOp(column_names=tuple(block.column_labels)) ) block = block.select_column(result_id) - return bigframes.series.Series(block) \ No newline at end of file + return bigframes.series.Series(block) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index a90890d2b4..0456474b41 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -139,12 +139,13 @@ def create_vector_index_ddl( storing = f"STORING({', '.join(escaped_stored)}) " else: storing = "" - - options = ", ".join([ - f"{option_name} = {simple_literal(option_value)}" - for option_name, option_value - in options.items() - ]) + + options = ", ".join( + [ + f"{option_name} = {simple_literal(option_value)}" + for option_name, option_value in options.items() + ] + ) return f""" {create} {index_name} @@ -154,7 +155,6 @@ def create_vector_index_ddl( """ - 
def create_vector_search_sql( sql_string: str, options: Mapping[str, Union[str | int | bool | float]] = {}, diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index bbe302f0de..6a259779d8 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -23,7 +23,6 @@ import bigframes.bigquery as bbq import bigframes.pandas as bpd - # Need at least 5,000 rows to create a vector index. VECTOR_DF = pd.DataFrame( { @@ -43,10 +42,17 @@ # Three groups of animal, vegetable, and mineral, corresponding to # the embeddings above. "mystery_word": [ - "aarvark", "broccoli", "calcium", - "dog", "eggplant", "ferrite", - "gopher", "huckleberry", "ice", - ] * 1_111, + "aarvark", + "broccoli", + "calcium", + "dog", + "eggplant", + "ferrite", + "gopher", + "huckleberry", + "ice", + ] + * 1_111, }, ) @@ -62,7 +68,9 @@ def vector_table_id( bigquery_client.delete_table(table_id_not_created, not_found_ok=True) -def test_create_vector_index_ivf(session, vector_table_id: str, bigquery_client: google.cloud.bigquery.Client): +def test_create_vector_index_ivf( + session, vector_table_id: str, bigquery_client: google.cloud.bigquery.Client +): bbq.create_vector_index( vector_table_id, "my_embedding", @@ -94,7 +102,6 @@ def test_create_vector_index_ivf(session, vector_table_id: str, bigquery_client: assert indexes["index_name"].iloc[0] == table_name - def test_vector_search_basic_params_with_df(): search_query = bpd.DataFrame( { From 0efaf7e0071aad404b7ab1ae73c696fcef0655f6 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Mon, 14 Oct 2024 11:08:49 -0500 Subject: [PATCH 03/10] fix lint errors --- bigframes/bigquery/__init__.py | 13 +++++++++--- bigframes/bigquery/_operations/approx_agg.py | 15 ++----------- bigframes/bigquery/_operations/array.py | 3 ++- bigframes/bigquery/_operations/json.py | 15 +++---------- bigframes/bigquery/_operations/search.py | 13 ++++++------ 
bigframes/bigquery/_operations/struct.py | 11 ++-------- bigframes/core/sql.py | 14 ++++++------- .../small/bigquery/test_vector_search.py | 21 ++++++++++++------- 8 files changed, 46 insertions(+), 59 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 9953a946d8..0b2d2d5aeb 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -16,13 +16,20 @@ such as array functions: https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. """ -from bigframes.bigquery._operations.array import array_length, array_agg, array_to_string from bigframes.bigquery._operations.approx_agg import approx_top_count -from bigframes.bigquery._operations.json import json_set, json_extract, json_extract_array +from bigframes.bigquery._operations.array import ( + array_agg, + array_length, + array_to_string, +) +from bigframes.bigquery._operations.json import ( + json_extract, + json_extract_array, + json_set, +) from bigframes.bigquery._operations.search import create_vector_index, vector_search from bigframes.bigquery._operations.struct import struct - __all__ = [ "array_length", "array_agg", diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py index 7717136bf8..696f8f5a66 100644 --- a/bigframes/bigquery/_operations/approx_agg.py +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -14,25 +14,15 @@ from __future__ import annotations -import typing - -import bigframes_vendored.constants as constants - -import bigframes.core.groupby as groupby -import bigframes.core.sql -import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops - -if typing.TYPE_CHECKING: - import bigframes.dataframe as dataframe - import bigframes.series as series - +import bigframes.series as series """ Approximate functions defined from https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions """ + def 
approx_top_count( series: series.Series, number: int, @@ -67,4 +57,3 @@ def approx_top_count( if number < 1: raise ValueError("The number of approx_top_count must be at least 1") return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number)) - diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index 31db56b832..4af1416127 100644 --- a/bigframes/bigquery/_operations/array.py +++ b/bigframes/bigquery/_operations/array.py @@ -32,6 +32,7 @@ if typing.TYPE_CHECKING: import bigframes.dataframe as dataframe + def array_length(series: series.Series) -> series.Series: """Compute the length of each array element in the Series. @@ -147,4 +148,4 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: bigframes.series.Series: A Series containing delimited strings. """ - return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) \ No newline at end of file + return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 6f676ed9a5..d3c3c97a9c 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -21,23 +21,15 @@ from __future__ import annotations -import typing +from typing import Any, Sequence, Tuple -import bigframes_vendored.constants as constants - -import bigframes.core.groupby as groupby -import bigframes.core.sql import bigframes.operations as ops -import bigframes.operations.aggregations as agg_ops import bigframes.series as series -if typing.TYPE_CHECKING: - import bigframes.dataframe as dataframe - def json_set( series: series.Series, - json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]], + json_path_value_pairs: Sequence[Tuple[str, Any]], ) -> series.Series: """Produces a new JSON value within a Series by inserting or replacing values at specified paths. 
@@ -57,7 +49,7 @@ def json_set( Args: series (bigframes.series.Series): The Series containing JSON data (as native JSON objects or JSON-formatted strings). - json_path_value_pairs (Sequence[Tuple[str, typing.Any]]): + json_path_value_pairs (Sequence[Tuple[str, Any]]): Pairs of JSON path and the new value to insert/replace. Returns: @@ -142,4 +134,3 @@ def json_extract_array( bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. """ return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) - diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index 47fce80492..ea5ff13630 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -39,7 +39,7 @@ def create_vector_index( column_name: str, *, replace: bool = False, - index_name: Optional[str]= None, + index_name: Optional[str] = None, distance_type="cosine", stored_column_names: Collection[str] = (), index_type: str = "ivf", @@ -52,21 +52,21 @@ def create_vector_index( This method calls the `CREATE VECTOR INDEX DDL statement `_. 
- + """ import bigframes.pandas if index_name is None: index_name = table_id.split(".")[-1] - + options = { "index_type": index_type.upper(), "distance_type": distance_type.upper(), } - + if ivf_options is not None: options["ivf_options"] = json.dumps(ivf_options) - + if tree_ah_options is not None: options["tree_ah_options"] = json.dumps(tree_ah_options) @@ -86,7 +86,6 @@ def create_vector_index( read_gbq_query = session.read_gbq_query read_gbq_query(sql) - def vector_search( @@ -241,4 +240,4 @@ def vector_search( else: df = query._session.read_gbq(sql) - return df \ No newline at end of file + return df diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py index 1c77befc8e..7cb826351c 100644 --- a/bigframes/bigquery/_operations/struct.py +++ b/bigframes/bigquery/_operations/struct.py @@ -22,16 +22,11 @@ import typing -import bigframes_vendored.constants as constants - -import bigframes.core.groupby as groupby -import bigframes.core.sql import bigframes.operations as ops -import bigframes.operations.aggregations as agg_ops +import bigframes.series as series if typing.TYPE_CHECKING: import bigframes.dataframe as dataframe - import bigframes.series as series def struct(value: dataframe.DataFrame) -> series.Series: @@ -60,11 +55,9 @@ def struct(value: dataframe.DataFrame) -> series.Series: Returns: bigframes.series.Series: A new Series with struct entries representing rows of the original DataFrame """ - import bigframes.series - block = value._block block, result_id = block.apply_nary_op( block.value_columns, ops.StructOp(column_names=tuple(block.column_labels)) ) block = block.select_column(result_id) - return bigframes.series.Series(block) \ No newline at end of file + return series.Series(block) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 268f52b23a..fac74ec06c 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -137,12 +137,13 @@ def create_vector_index_ddl( storing = f"STORING({', 
'.join(escaped_stored)}) " else: storing = "" - - options = ", ".join([ - f"{option_name} = {simple_literal(option_value)}" - for option_name, option_value - in options.items() - ]) + + options = ", ".join( + [ + f"{option_name} = {simple_literal(option_value)}" + for option_name, option_value in options.items() + ] + ) return f""" {create} {index_name} @@ -152,7 +153,6 @@ def create_vector_index_ddl( """ - def create_vector_search_sql( sql_string: str, options: Mapping[str, Union[str | int | bool | float]] = {}, diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index bbe302f0de..6a259779d8 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -23,7 +23,6 @@ import bigframes.bigquery as bbq import bigframes.pandas as bpd - # Need at least 5,000 rows to create a vector index. VECTOR_DF = pd.DataFrame( { @@ -43,10 +42,17 @@ # Three groups of animal, vegetable, and mineral, corresponding to # the embeddings above. 
"mystery_word": [ - "aarvark", "broccoli", "calcium", - "dog", "eggplant", "ferrite", - "gopher", "huckleberry", "ice", - ] * 1_111, + "aarvark", + "broccoli", + "calcium", + "dog", + "eggplant", + "ferrite", + "gopher", + "huckleberry", + "ice", + ] + * 1_111, }, ) @@ -62,7 +68,9 @@ def vector_table_id( bigquery_client.delete_table(table_id_not_created, not_found_ok=True) -def test_create_vector_index_ivf(session, vector_table_id: str, bigquery_client: google.cloud.bigquery.Client): +def test_create_vector_index_ivf( + session, vector_table_id: str, bigquery_client: google.cloud.bigquery.Client +): bbq.create_vector_index( vector_table_id, "my_embedding", @@ -94,7 +102,6 @@ def test_create_vector_index_ivf(session, vector_table_id: str, bigquery_client: assert indexes["index_name"].iloc[0] == table_name - def test_vector_search_basic_params_with_df(): search_query = bpd.DataFrame( { From 7c4167700753a63cfa50088df3046a0cd8d9f5f7 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Mon, 14 Oct 2024 11:10:32 -0500 Subject: [PATCH 04/10] format --- bigframes/bigquery/_operations/array.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index 3c1d7239e4..4af1416127 100644 --- a/bigframes/bigquery/_operations/array.py +++ b/bigframes/bigquery/_operations/array.py @@ -33,7 +33,6 @@ import bigframes.dataframe as dataframe - def array_length(series: series.Series) -> series.Series: """Compute the length of each array element in the Series. 
From 7ac22fa7fc15fa443cf6c18ef23c503d002d832e Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 15 Oct 2024 10:21:33 -0500 Subject: [PATCH 05/10] fix mypy --- bigframes/core/sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index fac74ec06c..25d5c1dd41 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -138,7 +138,7 @@ def create_vector_index_ddl( else: storing = "" - options = ", ".join( + rendered_options = ", ".join( [ f"{option_name} = {simple_literal(option_value)}" for option_name, option_value in options.items() @@ -149,7 +149,7 @@ def create_vector_index_ddl( {create} {index_name} ON `{table_name}`(`{column_name}`) {storing} - OPTIONS({options}); + OPTIONS({rendered_options}); """ From e704cdae9f6d7c4cd9abe893d1a45bf45971953f Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 15 Oct 2024 12:27:43 -0500 Subject: [PATCH 06/10] fix test for older google-cloud-bigquery --- tests/system/small/bigquery/test_vector_search.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 6a259779d8..3c3526319c 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -63,7 +63,19 @@ def vector_table_id( # Use non-US location to ensure location autodetection works. 
table_id_not_created: str, ): - bigquery_client.load_table_from_dataframe(VECTOR_DF, table_id_not_created).result() + table = google.cloud.bigquery.Table( + table_id_not_created, + [ + {"name": "rowid", "type": "INT64"}, + {"name": "my_embedding", "type": "FLOAT64", "mode": "REPEATED"}, + {"name": "mystery_word", "type": "STRING"}, + ], + ) + bigquery_client.create_table(table) + bigquery_client.load_table_from_json( + VECTOR_DF.to_dict(orient="records"), + table_id_not_created, + ) yield table_id_not_created bigquery_client.delete_table(table_id_not_created, not_found_ok=True) From 53e23438ab28569d610b9b66b08ce37022e01b62 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 15 Oct 2024 12:35:43 -0500 Subject: [PATCH 07/10] fix type error --- tests/system/small/bigquery/test_vector_search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 3c3526319c..1154f777fd 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -13,6 +13,7 @@ # limitations under the License. 
import random +from typing import Any, cast, Iterable, Mapping import google.cloud.bigquery import numpy as np @@ -73,7 +74,7 @@ def vector_table_id( ) bigquery_client.create_table(table) bigquery_client.load_table_from_json( - VECTOR_DF.to_dict(orient="records"), + cast(Iterable[Mapping[str, Any]], VECTOR_DF.to_dict(orient="records")), table_id_not_created, ) yield table_id_not_created From 77ad1a384b15865fa6e5922379b3aa79eacde9ef Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 15 Oct 2024 12:52:27 -0500 Subject: [PATCH 08/10] fix typing --- tests/system/small/bigquery/test_vector_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 1154f777fd..06eb8fd680 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -13,7 +13,7 @@ # limitations under the License. import random -from typing import Any, cast, Iterable, Mapping +from typing import Any, cast, Dict, Iterable import google.cloud.bigquery import numpy as np @@ -74,7 +74,7 @@ def vector_table_id( ) bigquery_client.create_table(table) bigquery_client.load_table_from_json( - cast(Iterable[Mapping[str, Any]], VECTOR_DF.to_dict(orient="records")), + cast(Iterable[Dict[str, Any]], VECTOR_DF.to_dict(orient="records")), table_id_not_created, ) yield table_id_not_created From 5cc3d468db58a3c7d610006f726cadd36ae1cc08 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 16 Oct 2024 08:35:54 -0500 Subject: [PATCH 09/10] use googlesql.identifier to escape column and table ids --- bigframes/bigquery/_operations/search.py | 4 +++- bigframes/core/sql.py | 14 ++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index ea5ff13630..496e259944 100644 --- a/bigframes/bigquery/_operations/search.py +++ 
b/bigframes/bigquery/_operations/search.py @@ -19,6 +19,7 @@ from typing import Collection, Literal, Mapping, Optional, Union import bigframes_vendored.constants as constants +import google.cloud.bigquery as bigquery import bigframes.core.sql import bigframes.ml.utils as utils @@ -57,7 +58,8 @@ def create_vector_index( import bigframes.pandas if index_name is None: - index_name = table_id.split(".")[-1] + table_ref = bigquery.TableReference.from_string(table_id) + index_name = table_ref.table_id options = { "index_type": index_type.upper(), diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 25d5c1dd41..d5dfc64ddd 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -19,7 +19,7 @@ import datetime import math -from typing import Collection, Iterable, Mapping, TYPE_CHECKING, Union +from typing import cast, Collection, Iterable, Mapping, TYPE_CHECKING, Union import bigframes.core.compile.googlesql as googlesql @@ -133,7 +133,9 @@ def create_vector_index_ddl( create = "CREATE VECTOR INDEX IF NOT EXISTS " if len(stored_column_names) > 0: - escaped_stored = [f"`{name}`" for name in stored_column_names] + escaped_stored = [ + f"{googlesql.identifier(name)}" for name in stored_column_names + ] storing = f"STORING({', '.join(escaped_stored)}) " else: storing = "" @@ -146,8 +148,8 @@ def create_vector_index_ddl( ) return f""" - {create} {index_name} - ON `{table_name}`(`{column_name}`) + {create} {googlesql.identifier(index_name)} + ON {googlesql.identifier(table_name)}({googlesql.identifier(column_name)}) {storing} OPTIONS({rendered_options}); """ @@ -172,7 +174,7 @@ def create_vector_search_sql( base.*, distance, FROM VECTOR_SEARCH( - TABLE `{base_table}`, + TABLE {googlesql.identifier(cast(str, base_table))}, {simple_literal(column_to_search)}, ({sql_string}), {simple_literal(query_column_to_search)}, @@ -187,7 +189,7 @@ def create_vector_search_sql( base.*, distance, FROM VECTOR_SEARCH( - TABLE `{base_table}`, + TABLE 
{googlesql.identifier(cast(str, base_table))}, {simple_literal(column_to_search)}, ({sql_string}), distance_type => {simple_literal(distance_type)}, From 13badf861ac32162ffbbfcf2850f5957aa8f1817 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 17 Oct 2024 16:34:14 -0500 Subject: [PATCH 10/10] wait for job to finish --- tests/system/small/bigquery/test_vector_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 06eb8fd680..b6a6d59c4c 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -76,7 +76,7 @@ def vector_table_id( bigquery_client.load_table_from_json( cast(Iterable[Dict[str, Any]], VECTOR_DF.to_dict(orient="records")), table_id_not_created, - ) + ).result() yield table_id_not_created bigquery_client.delete_table(table_id_not_created, not_found_ok=True)