From db8cf2d234869e7e3dcf1dd2a77afccdb0a28a39 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 26 Sep 2024 15:51:21 -0500 Subject: [PATCH 01/10] feat: add `bigframes.bigquery.create_vector_index` to assist in creating vector index on `ARRAY` columns --- bigframes/bigquery/__init__.py | 501 +----------------- bigframes/bigquery/_operations/__init__.py | 13 + bigframes/bigquery/_operations/approx_agg.py | 70 +++ bigframes/bigquery/_operations/array.py | 150 ++++++ bigframes/bigquery/_operations/json.py | 145 +++++ bigframes/bigquery/_operations/search.py | 244 +++++++++ bigframes/bigquery/_operations/struct.py | 70 +++ bigframes/core/sql.py | 39 +- tests/system/conftest.py | 5 + .../small/bigquery/test_vector_search.py | 76 +++ 10 files changed, 830 insertions(+), 483 deletions(-) create mode 100644 bigframes/bigquery/_operations/__init__.py create mode 100644 bigframes/bigquery/_operations/approx_agg.py create mode 100644 bigframes/bigquery/_operations/array.py create mode 100644 bigframes/bigquery/_operations/json.py create mode 100644 bigframes/bigquery/_operations/search.py create mode 100644 bigframes/bigquery/_operations/struct.py diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 28a818e709..9953a946d8 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -12,489 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. - """This module integrates BigQuery built-in functions for use with DataFrame objects, such as array functions: https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. 
""" - -from __future__ import annotations - -import typing -from typing import Literal, Optional, Union - -import bigframes_vendored.constants as constants - -import bigframes.core.groupby as groupby -import bigframes.core.sql -import bigframes.ml.utils as utils -import bigframes.operations as ops -import bigframes.operations.aggregations as agg_ops -import bigframes.series - -if typing.TYPE_CHECKING: - import bigframes.dataframe as dataframe - import bigframes.series as series - - -# Array functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions - - -def array_length(series: series.Series) -> series.Series: - """Compute the length of each array element in the Series. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) - >>> bbq.array_length(s) - 0 4 - 1 0 - 2 2 - dtype: Int64 - - You can also apply this function directly to Series. - - >>> s.apply(bbq.array_length, by_row=False) - 0 4 - 1 0 - 2 2 - dtype: Int64 - - Args: - series (bigframes.series.Series): A Series with array columns. - - Returns: - bigframes.series.Series: A Series of integer values indicating - the length of each element in the Series. - - """ - return series._apply_unary_op(ops.len_op) - - -def array_agg( - obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy, -) -> series.Series | dataframe.DataFrame: - """Group data and create arrays from selected columns, omitting NULLs to avoid - BigQuery errors (NULLs not allowed in arrays). - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - - For a SeriesGroupBy object: - - >>> lst = ['a', 'a', 'b', 'b', 'a'] - >>> s = bpd.Series([1, 2, 3, 4, np.nan], index=lst) - >>> bbq.array_agg(s.groupby(level=0)) - a [1. 2.] - b [3. 4.] 
- dtype: list[pyarrow] - - For a DataFrameGroupBy object: - - >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] - >>> df = bpd.DataFrame(l, columns=["a", "b", "c"]) - >>> bbq.array_agg(df.groupby(by=["b"])) - a c - b - 1.0 [2] [3] - 2.0 [1 1] [3 2] - - [2 rows x 2 columns] - - Args: - obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy): - A GroupBy object to be applied the function. - - Returns: - bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or - DataFrame containing aggregated array columns, and indexed by the - original group columns. - """ - if isinstance(obj, groupby.SeriesGroupBy): - return obj._aggregate(agg_ops.ArrayAggOp()) - elif isinstance(obj, groupby.DataFrameGroupBy): - return obj._aggregate_all(agg_ops.ArrayAggOp(), numeric_only=False) - else: - raise ValueError( - f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}" - ) - - -def array_to_string(series: series.Series, delimiter: str) -> series.Series: - """Converts array elements within a Series into delimited strings. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) - >>> bbq.array_to_string(s, delimiter=", ") - 0 H, i, ! - 1 Hello, World - 2 - 3 - 4 Hi - dtype: string - - Args: - series (bigframes.series.Series): A Series containing arrays. - delimiter (str): The string used to separate array elements. - - Returns: - bigframes.series.Series: A Series containing delimited strings. 
- - """ - return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) - - -# JSON functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions - - -def json_set( - series: series.Series, - json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]], -) -> series.Series: - """Produces a new JSON value within a Series by inserting or replacing values at - specified paths. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] - >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) - 0 {"a":100,"b":"hi"} - Name: data, dtype: string - - Args: - series (bigframes.series.Series): - The Series containing JSON data (as native JSON objects or JSON-formatted strings). - json_path_value_pairs (Sequence[Tuple[str, typing.Any]]): - Pairs of JSON path and the new value to insert/replace. - - Returns: - bigframes.series.Series: A new Series with the transformed JSON data. - - """ - # SQLGlot parser does not support the "create_if_missing => true" syntax, so - # create_if_missing is not currently implemented. - - for json_path_value_pair in json_path_value_pairs: - if len(json_path_value_pair) != 2: - raise ValueError( - "Incorrect format: Expected (, ), but found: " - + f"{json_path_value_pair}" - ) - - json_path, json_value = json_path_value_pair - series = series._apply_binary_op( - json_value, ops.JSONSet(json_path=json_path), alignment="left" - ) - return series - - -def json_extract( - series: series.Series, - json_path: str, -) -> series.Series: - """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON` - value. This function uses single quotes and brackets to escape invalid JSONPath - characters in JSON keys. 
- - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) - >>> bbq.json_extract(s, json_path="$.class") - 0 {"students":[{"id":5},{"id":12}]} - dtype: string - - Args: - series (bigframes.series.Series): - The Series containing JSON data (as native JSON objects or JSON-formatted strings). - json_path (str): - The JSON path identifying the data that you want to obtain from the input. - - Returns: - bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. - """ - return series._apply_unary_op(ops.JSONExtract(json_path=json_path)) - - -def json_extract_array( - series: series.Series, - json_path: str = "$", -) -> series.Series: - """Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON` - values. This function uses single quotes and brackets to escape invalid JSONPath - characters in JSON keys. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) - >>> bbq.json_extract_array(s) - 0 ['1' '2' '3'] - 1 ['4' '5'] - dtype: list[pyarrow] - - Args: - series (bigframes.series.Series): - The Series containing JSON data (as native JSON objects or JSON-formatted strings). - json_path (str): - The JSON path identifying the data that you want to obtain from the input. - - Returns: - bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. 
- """ - return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) - - -# Approximate aggrgate functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions - - -def approx_top_count( - series: series.Series, - number: int, -) -> series.Series: - """Returns the approximate top elements of `expression` as an array of STRUCTs. - The number parameter specifies the number of elements returned. - - Each `STRUCT` contains two fields. The first field (named `value`) contains an input - value. The second field (named `count`) contains an `INT64` specifying the number - of times the value was returned. - - Returns `NULL` if there are zero input rows. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) - >>> bbq.approx_top_count(s, number=2) - [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] - - Args: - series (bigframes.series.Series): - The Series with any data type that the `GROUP BY` clause supports. - number (int): - An integer specifying the number of times the value was returned. - - Returns: - bigframes.series.Series: A new Series with the result data. 
- """ - if number < 1: - raise ValueError("The number of approx_top_count must be at least 1") - return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number)) - - -def struct(value: dataframe.DataFrame) -> series.Series: - """Takes a DataFrame and converts it into a Series of structs with each - struct entry corresponding to a DataFrame row and each struct field - corresponding to a DataFrame column - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import bigframes.series as series - >>> bpd.options.display.progress_bar = None - - >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) - >>> df = srs.struct.explode() - >>> bbq.struct(df) - 0 {'project': 'pandas', 'version': 1} - 1 {'project': 'numpy', 'version': 2} - dtype: struct[pyarrow] - - Args: - value (bigframes.dataframe.DataFrame): - The DataFrame to be converted to a Series of structs - - Returns: - bigframes.series.Series: A new Series with struct entries representing rows of the original DataFrame - """ - block = value._block - block, result_id = block.apply_nary_op( - block.value_columns, ops.StructOp(column_names=tuple(block.column_labels)) - ) - block = block.select_column(result_id) - return bigframes.series.Series(block) - - -# Search functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions - - -def vector_search( - base_table: str, - column_to_search: str, - query: Union[dataframe.DataFrame, series.Series], - *, - query_column_to_search: Optional[str] = None, - top_k: Optional[int] = 10, - distance_type: Literal["euclidean", "cosine"] = "euclidean", - fraction_lists_to_search: Optional[float] = None, - use_brute_force: bool = False, -) -> dataframe.DataFrame: - """ - Conduct vector search which searches embeddings to find semantically similar entities. 
- - **Examples:** - - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - DataFrame embeddings for which to find nearest neighbors. The ``ARRAY`` column - is used as the search query: - - >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], - ... "embedding": [[1.0, 2.0], [3.0, 5.2]]}) - >>> bbq.vector_search( - ... base_table="bigframes-dev.bigframes_tests_sys.base_table", - ... column_to_search="my_embedding", - ... query=search_query, - ... top_k=2) - query_id embedding id my_embedding distance - 1 cat [3. 5.2] 5 [5. 5.4] 2.009975 - 0 dog [1. 2.] 1 [1. 2.] 0.0 - 0 dog [1. 2.] 4 [1. 3.2] 1.2 - 1 cat [3. 5.2] 2 [2. 4.] 1.56205 - - [4 rows x 5 columns] - - Series embeddings for which to find nearest neighbors: - - >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]], - ... index=["dog", "cat"], - ... name="embedding") - >>> bbq.vector_search( - ... base_table="bigframes-dev.bigframes_tests_sys.base_table", - ... column_to_search="my_embedding", - ... query=search_query, - ... top_k=2) - embedding id my_embedding distance - dog [1. 2.] 1 [1. 2.] 0.0 - cat [3. 5.2] 5 [5. 5.4] 2.009975 - dog [1. 2.] 4 [1. 3.2] 1.2 - cat [3. 5.2] 2 [2. 4.] 1.56205 - - [4 rows x 4 columns] - - You can specify the name of the column in the query DataFrame embeddings and distance type. - If you specify query_column_to_search_value, it will use the provided column which contains - the embeddings for which to find nearest neighbors. Otherwiese, it uses the column_to_search value. - - >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], - ... "embedding": [[1.0, 2.0], [3.0, 5.2]], - ... "another_embedding": [[0.7, 2.2], [3.3, 5.2]]}) - >>> bbq.vector_search( - ... base_table="bigframes-dev.bigframes_tests_sys.base_table", - ... column_to_search="my_embedding", - ... query=search_query, - ... distance_type="cosine", - ... query_column_to_search="another_embedding", - ... 
top_k=2) - query_id embedding another_embedding id my_embedding distance - 1 cat [3. 5.2] [3.3 5.2] 2 [2. 4.] 0.005181 - 0 dog [1. 2.] [0.7 2.2] 4 [1. 3.2] 0.000013 - 1 cat [3. 5.2] [3.3 5.2] 1 [1. 2.] 0.005181 - 0 dog [1. 2.] [0.7 2.2] 3 [1.5 7. ] 0.004697 - - [4 rows x 6 columns] - - Args: - base_table (str): - The table to search for nearest neighbor embeddings. - column_to_search (str): - The name of the base table column to search for nearest neighbor embeddings. - The column must have a type of ``ARRAY``. All elements in the array must be non-NULL. - query (bigframes.dataframe.DataFrame | bigframes.dataframe.Series): - A Series or DataFrame that provides the embeddings for which to find nearest neighbors. - query_column_to_search (str): - Specifies the name of the column in the query that contains the embeddings for which to - find nearest neighbors. The column must have a type of ``ARRAY``. All elements in - the array must be non-NULL and all values in the column must have the same array dimensions - as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame. - top_k (int, default 10): - Sepecifies the number of nearest neighbors to return. Default to 10. - distance_type (str, defalt "euclidean"): - Specifies the type of metric to use to compute the distance between two vectors. - Possible values are "euclidean" and "cosine". Default to "euclidean". - fraction_lists_to_search (float, range in [0.0, 1.0]): - Specifies the percentage of lists to search. Specifying a higher percentage leads to - higher recall and slower performance, and the converse is true when specifying a lower - percentage. It is only used when a vector index is also used. You can only specify - ``fraction_lists_to_search`` when ``use_brute_force`` is set to False. - use_brute_force (bool, default False): - Determines whether to use brute force search by skipping the vector index if one is available. - Default to False. 
- - Returns: - bigframes.dataframe.DataFrame: A DataFrame containing vector search result. - """ - if not fraction_lists_to_search and use_brute_force is True: - raise ValueError( - "You can't specify fraction_lists_to_search when use_brute_force is set to True." - ) - if ( - isinstance(query, bigframes.series.Series) - and query_column_to_search is not None - ): - raise ValueError( - "You can't specify query_column_to_search when query is a Series." - ) - # TODO(ashleyxu): Support options in vector search. b/344019989 - if fraction_lists_to_search is not None or use_brute_force is True: - raise NotImplementedError( - f"fraction_lists_to_search and use_brute_force is not supported. {constants.FEEDBACK_LINK}" - ) - options = { - "base_table": base_table, - "column_to_search": column_to_search, - "query_column_to_search": query_column_to_search, - "distance_type": distance_type, - "top_k": top_k, - "fraction_lists_to_search": fraction_lists_to_search, - "use_brute_force": use_brute_force, - } - - (query,) = utils.convert_to_dataframe(query) - sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True) - - sql = bigframes.core.sql.create_vector_search_sql( - sql_string=sql_string, options=options # type: ignore - ) - if index_col_ids is not None: - df = query._session.read_gbq(sql, index_col=index_col_ids) - else: - df = query._session.read_gbq(sql) - df.index.names = index_labels - - return df +from bigframes.bigquery._operations.array import array_length, array_agg, array_to_string +from bigframes.bigquery._operations.approx_agg import approx_top_count +from bigframes.bigquery._operations.json import json_set, json_extract, json_extract_array +from bigframes.bigquery._operations.search import create_vector_index, vector_search +from bigframes.bigquery._operations.struct import struct + + +__all__ = [ + "array_length", + "array_agg", + "array_to_string", + "json_set", + "json_extract", + "json_extract_array", + "approx_top_count", + "struct", + 
"create_vector_index", + "vector_search", +] diff --git a/bigframes/bigquery/_operations/__init__.py b/bigframes/bigquery/_operations/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/bigframes/bigquery/_operations/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py new file mode 100644 index 0000000000..7717136bf8 --- /dev/null +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -0,0 +1,70 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import typing + +import bigframes_vendored.constants as constants + +import bigframes.core.groupby as groupby +import bigframes.core.sql +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series + + +""" +Approximate functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions +""" + +def approx_top_count( + series: series.Series, + number: int, +) -> series.Series: + """Returns the approximate top elements of `expression` as an array of STRUCTs. + The number parameter specifies the number of elements returned. + + Each `STRUCT` contains two fields. The first field (named `value`) contains an input + value. The second field (named `count`) contains an `INT64` specifying the number + of times the value was returned. + + Returns `NULL` if there are zero input rows. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) + >>> bbq.approx_top_count(s, number=2) + [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] + + Args: + series (bigframes.series.Series): + The Series with any data type that the `GROUP BY` clause supports. + number (int): + An integer specifying the number of times the value was returned. + + Returns: + bigframes.series.Series: A new Series with the result data. 
+ """ + if number < 1: + raise ValueError("The number of approx_top_count must be at least 1") + return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number)) + diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py new file mode 100644 index 0000000000..e43063af7d --- /dev/null +++ b/bigframes/bigquery/_operations/array.py @@ -0,0 +1,150 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Array functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions +""" + + +from __future__ import annotations + +import typing + +import bigframes_vendored.constants as constants + +import bigframes.core.groupby as groupby +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series + +def array_length(series: series.Series) -> series.Series: + """Compute the length of each array element in the Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) + >>> bbq.array_length(s) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + You can also apply this function directly to Series. 
+ + >>> s.apply(bbq.array_length, by_row=False) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + Args: + series (bigframes.series.Series): A Series with array columns. + + Returns: + bigframes.series.Series: A Series of integer values indicating + the length of each element in the Series. + + """ + return series._apply_unary_op(ops.len_op) + + +def array_agg( + obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy, +) -> series.Series | dataframe.DataFrame: + """Group data and create arrays from selected columns, omitting NULLs to avoid + BigQuery errors (NULLs not allowed in arrays). + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + For a SeriesGroupBy object: + + >>> lst = ['a', 'a', 'b', 'b', 'a'] + >>> s = bpd.Series([1, 2, 3, 4, np.nan], index=lst) + >>> bbq.array_agg(s.groupby(level=0)) + a [1. 2.] + b [3. 4.] + dtype: list[pyarrow] + + For a DataFrameGroupBy object: + + >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + >>> df = bpd.DataFrame(l, columns=["a", "b", "c"]) + >>> bbq.array_agg(df.groupby(by=["b"])) + a c + b + 1.0 [2] [3] + 2.0 [1 1] [3 2] + + [2 rows x 2 columns] + + Args: + obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy): + A GroupBy object to be applied the function. + + Returns: + bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or + DataFrame containing aggregated array columns, and indexed by the + original group columns. + """ + if isinstance(obj, groupby.SeriesGroupBy): + return obj._aggregate(agg_ops.ArrayAggOp()) + elif isinstance(obj, groupby.DataFrameGroupBy): + return obj._aggregate_all(agg_ops.ArrayAggOp(), numeric_only=False) + else: + raise ValueError( + f"Unsupported type {type(obj)} to apply `array_agg` function. 
{constants.FEEDBACK_LINK}" + ) + + +def array_to_string(series: series.Series, delimiter: str) -> series.Series: + """Converts array elements within a Series into delimited strings. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) + >>> bbq.array_to_string(s, delimiter=", ") + 0 H, i, ! + 1 Hello, World + 2 + 3 + 4 Hi + dtype: string + + Args: + series (bigframes.series.Series): A Series containing arrays. + delimiter (str): The string used to separate array elements. + + Returns: + bigframes.series.Series: A Series containing delimited strings. + + """ + return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) \ No newline at end of file diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py new file mode 100644 index 0000000000..197406e07a --- /dev/null +++ b/bigframes/bigquery/_operations/json.py @@ -0,0 +1,145 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +""" +JSON functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions +""" + + +from __future__ import annotations + +import typing + +import bigframes_vendored.constants as constants + +import bigframes.core.groupby as groupby +import bigframes.core.sql +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series + + +def json_set( + series: series.Series, + json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]], +) -> series.Series: + """Produces a new JSON value within a Series by inserting or replacing values at + specified paths. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] + >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) + 0 {"a":100,"b":"hi"} + Name: data, dtype: string + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path_value_pairs (Sequence[Tuple[str, typing.Any]]): + Pairs of JSON path and the new value to insert/replace. + + Returns: + bigframes.series.Series: A new Series with the transformed JSON data. + + """ + # SQLGlot parser does not support the "create_if_missing => true" syntax, so + # create_if_missing is not currently implemented. 
+ + for json_path_value_pair in json_path_value_pairs: + if len(json_path_value_pair) != 2: + raise ValueError( + "Incorrect format: Expected (, ), but found: " + + f"{json_path_value_pair}" + ) + + json_path, json_value = json_path_value_pair + series = series._apply_binary_op( + json_value, ops.JSONSet(json_path=json_path), alignment="left" + ) + return series + + +def json_extract( + series: series.Series, + json_path: str, +) -> series.Series: + """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON` + value. This function uses single quotes and brackets to escape invalid JSONPath + characters in JSON keys. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) + >>> bbq.json_extract(s, json_path="$.class") + 0 {"students":[{"id":5},{"id":12}]} + dtype: string + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. + """ + return series._apply_unary_op(ops.JSONExtract(json_path=json_path)) + + +def json_extract_array( + series: series.Series, + json_path: str = "$", +) -> series.Series: + """Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON` + values. This function uses single quotes and brackets to escape invalid JSONPath + characters in JSON keys. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) + >>> bbq.json_extract_array(s) + 0 ['1' '2' '3'] + 1 ['4' '5'] + dtype: list[pyarrow] + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. + """ + return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) + diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py new file mode 100644 index 0000000000..640dc44791 --- /dev/null +++ b/bigframes/bigquery/_operations/search.py @@ -0,0 +1,244 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import json +import typing +from typing import Collection, Literal, Mapping, Optional, Union + +import bigframes_vendored.constants as constants + +import bigframes.core.sql +import bigframes.ml.utils as utils + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series + import bigframes.session + +""" +Search functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions +""" + + +def create_vector_index( + table_id: str, + column_name: str, + *, + replace: bool = False, + index_name: Optional[str]= None, + distance_type="cosine", + stored_column_names: Collection[str] = (), + index_type: str = "ivf", + ivf_options: Optional[Mapping] = None, + tree_ah_options: Optional[Mapping] = None, + session: Optional[bigframes.session.Session] = None, +) -> None: + """ + Creates a new vector index on a column of a table. + + This method calls the `CREATE VECTOR INDEX DDL statement + `_. + + """ + import bigframes.pandas + + if index_name is None: + index_name = table_id.split(".")[-1] + + options = { + "index_type": index_type.upper(), + "distance_type": distance_type.upper(), + } + + if ivf_options is not None: + options["ivf_options"] = json.dumps(ivf_options) + + if tree_ah_options is not None: + options["tree_ah_options"] = json.dumps(tree_ah_options) + + sql = bigframes.core.sql.create_vector_index_ddl( + replace=replace, + index_name=index_name, + table_name=table_id, + column_name=column_name, + stored_column_names=stored_column_names, + options=options, + ) + + # Use global read_gbq to execute this for better location autodetection. 
+ if session is None: + read_gbq_query = bigframes.pandas.read_gbq_query + else: + read_gbq_query = session.read_gbq_query + + read_gbq_query(sql) + + + +def vector_search( + base_table: str, + column_to_search: str, + query: Union[dataframe.DataFrame, series.Series], + *, + query_column_to_search: Optional[str] = None, + top_k: Optional[int] = 10, + distance_type: Literal["euclidean", "cosine"] = "euclidean", + fraction_lists_to_search: Optional[float] = None, + use_brute_force: bool = False, +) -> dataframe.DataFrame: + """ + Conduct vector search which searches embeddings to find semantically similar entities. + + This method calls the `VECTOR_SEARCH() SQL function + `_. + + **Examples:** + + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + DataFrame embeddings for which to find nearest neighbors. The ``ARRAY`` column + is used as the search query: + + >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], + ... "embedding": [[1.0, 2.0], [3.0, 5.2]]}) + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... top_k=2) + query_id embedding id my_embedding distance + 1 cat [3. 5.2] 5 [5. 5.4] 2.009975 + 0 dog [1. 2.] 1 [1. 2.] 0.0 + 0 dog [1. 2.] 4 [1. 3.2] 1.2 + 1 cat [3. 5.2] 2 [2. 4.] 1.56205 + + [4 rows x 5 columns] + + Series embeddings for which to find nearest neighbors: + + >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]], + ... index=["dog", "cat"], + ... name="embedding") + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... top_k=2) + embedding id my_embedding distance + dog [1. 2.] 1 [1. 2.] 0.0 + cat [3. 5.2] 5 [5. 5.4] 2.009975 + dog [1. 2.] 4 [1. 3.2] 1.2 + cat [3. 5.2] 2 [2. 4.] 
1.56205 + + [4 rows x 4 columns] + + You can specify the name of the column in the query DataFrame embeddings and distance type. + If you specify query_column_to_search_value, it will use the provided column which contains + the embeddings for which to find nearest neighbors. Otherwiese, it uses the column_to_search value. + + >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], + ... "embedding": [[1.0, 2.0], [3.0, 5.2]], + ... "another_embedding": [[0.7, 2.2], [3.3, 5.2]]}) + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... distance_type="cosine", + ... query_column_to_search="another_embedding", + ... top_k=2) + query_id embedding another_embedding id my_embedding distance + 1 cat [3. 5.2] [3.3 5.2] 2 [2. 4.] 0.005181 + 0 dog [1. 2.] [0.7 2.2] 4 [1. 3.2] 0.000013 + 1 cat [3. 5.2] [3.3 5.2] 1 [1. 2.] 0.005181 + 0 dog [1. 2.] [0.7 2.2] 3 [1.5 7. ] 0.004697 + + [4 rows x 6 columns] + + Args: + base_table (str): + The table to search for nearest neighbor embeddings. + column_to_search (str): + The name of the base table column to search for nearest neighbor embeddings. + The column must have a type of ``ARRAY``. All elements in the array must be non-NULL. + query (bigframes.dataframe.DataFrame | bigframes.dataframe.Series): + A Series or DataFrame that provides the embeddings for which to find nearest neighbors. + query_column_to_search (str): + Specifies the name of the column in the query that contains the embeddings for which to + find nearest neighbors. The column must have a type of ``ARRAY``. All elements in + the array must be non-NULL and all values in the column must have the same array dimensions + as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame. + top_k (int, default 10): + Sepecifies the number of nearest neighbors to return. Default to 10. 
+        distance_type (str, default "euclidean"):
+            Specifies the type of metric to use to compute the distance between two vectors.
+            Possible values are "euclidean" and "cosine". Default to "euclidean".
+        fraction_lists_to_search (float, range in [0.0, 1.0]):
+            Specifies the percentage of lists to search. Specifying a higher percentage leads to
+            higher recall and slower performance, and the converse is true when specifying a lower
+            percentage. It is only used when a vector index is also used. You can only specify
+            ``fraction_lists_to_search`` when ``use_brute_force`` is set to False.
+        use_brute_force (bool, default False):
+            Determines whether to use brute force search by skipping the vector index if one is available.
+            Default to False.
+
+    Returns:
+        bigframes.dataframe.DataFrame: A DataFrame containing vector search result.
+    """
+    import bigframes.series
+
+    if not fraction_lists_to_search and use_brute_force is True:
+        raise ValueError(
+            "You can't specify fraction_lists_to_search when use_brute_force is set to True."
+        )
+    if (
+        isinstance(query, bigframes.series.Series)
+        and query_column_to_search is not None
+    ):
+        raise ValueError(
+            "You can't specify query_column_to_search when query is a Series."
+        )
+    # TODO(ashleyxu): Support options in vector search. b/344019989
+    if fraction_lists_to_search is not None or use_brute_force is True:
+        raise NotImplementedError(
+            f"fraction_lists_to_search and use_brute_force is not supported. 
{constants.FEEDBACK_LINK}" + ) + options = { + "base_table": base_table, + "column_to_search": column_to_search, + "query_column_to_search": query_column_to_search, + "distance_type": distance_type, + "top_k": top_k, + "fraction_lists_to_search": fraction_lists_to_search, + "use_brute_force": use_brute_force, + } + + (query,) = utils.convert_to_dataframe(query) + sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True) + + sql = bigframes.core.sql.create_vector_search_sql( + sql_string=sql_string, options=options # type: ignore + ) + if index_col_ids is not None: + df = query._session.read_gbq(sql, index_col=index_col_ids) + else: + df = query._session.read_gbq(sql) + df.index.names = index_labels + + return df \ No newline at end of file diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py new file mode 100644 index 0000000000..1c77befc8e --- /dev/null +++ b/bigframes/bigquery/_operations/struct.py @@ -0,0 +1,70 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""This module integrates BigQuery built-in functions for use with DataFrame objects, +such as array functions: +https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. 
""" + + +from __future__ import annotations + +import typing + +import bigframes_vendored.constants as constants + +import bigframes.core.groupby as groupby +import bigframes.core.sql +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series + + +def struct(value: dataframe.DataFrame) -> series.Series: + """Takes a DataFrame and converts it into a Series of structs with each + struct entry corresponding to a DataFrame row and each struct field + corresponding to a DataFrame column + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import bigframes.series as series + >>> bpd.options.display.progress_bar = None + + >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) + >>> df = srs.struct.explode() + >>> bbq.struct(df) + 0 {'project': 'pandas', 'version': 1} + 1 {'project': 'numpy', 'version': 2} + dtype: struct[pyarrow] + + Args: + value (bigframes.dataframe.DataFrame): + The DataFrame to be converted to a Series of structs + + Returns: + bigframes.series.Series: A new Series with struct entries representing rows of the original DataFrame + """ + import bigframes.series + + block = value._block + block, result_id = block.apply_nary_op( + block.value_columns, ops.StructOp(column_names=tuple(block.column_labels)) + ) + block = block.select_column(result_id) + return bigframes.series.Series(block) \ No newline at end of file diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 528c9bcc74..a90890d2b4 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -19,7 +19,7 @@ import datetime import math -from typing import Iterable, Mapping, TYPE_CHECKING, Union +from typing import Collection, Iterable, Mapping, TYPE_CHECKING, Union import bigframes.core.compile.googlesql as googlesql @@ -118,6 +118,43 @@ def 
ordering_clause( return f"ORDER BY {' ,'.join(parts)}" +def create_vector_index_ddl( + *, + replace: bool, + index_name: str, + table_name: str, + column_name: str, + stored_column_names: Collection[str], + options: Mapping[str, Union[str | int | bool | float]] = {}, +) -> str: + """Encode the VECTOR INDEX statement for BigQuery Vector Search.""" + + if replace: + create = "CREATE OR REPLACE VECTOR INDEX " + else: + create = "CREATE VECTOR INDEX IF NOT EXISTS " + + if len(stored_column_names) > 0: + escaped_stored = [f"`{name}`" for name in stored_column_names] + storing = f"STORING({', '.join(escaped_stored)}) " + else: + storing = "" + + options = ", ".join([ + f"{option_name} = {simple_literal(option_value)}" + for option_name, option_value + in options.items() + ]) + + return f""" + {create} {index_name} + ON `{table_name}`(`{column_name}`) + {storing} + OPTIONS({options}); + """ + + + def create_vector_search_sql( sql_string: str, options: Mapping[str, Union[str | int | bool | float]] = {}, diff --git a/tests/system/conftest.py b/tests/system/conftest.py index d9246eecfb..840b9fcfa7 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -240,6 +240,11 @@ def dataset_id_permanent_tokyo( return dataset_id +@pytest.fixture(scope="session") +def table_id_not_created(dataset_id: str): + return f"{dataset_id}.{prefixer.create_prefix()}" + + @pytest.fixture(scope="session") def scalars_schema(bigquery_client: bigquery.Client): # TODO(swast): Add missing scalar data types such as BIGNUMERIC. diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 4280c0a888..bbe302f0de 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -12,13 +12,89 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import random + +import google.cloud.bigquery import numpy as np import pandas as pd +import pyarrow +import pytest import bigframes.bigquery as bbq import bigframes.pandas as bpd +# Need at least 5,000 rows to create a vector index. +VECTOR_DF = pd.DataFrame( + { + "rowid": np.arange(9_999), + # 3D values, clustered around the three unit vector axes. + "my_embedding": pd.Series( + [ + [ + 1 + (random.random() - 0.5) if (row % 3) == 0 else 0, + 1 + (random.random() - 0.5) if (row % 3) == 1 else 0, + 1 + (random.random() - 0.5) if (row % 3) == 2 else 0, + ] + for row in range(9_999) + ], + dtype=pd.ArrowDtype(pyarrow.list_(pyarrow.float64())), + ), + # Three groups of animal, vegetable, and mineral, corresponding to + # the embeddings above. + "mystery_word": [ + "aarvark", "broccoli", "calcium", + "dog", "eggplant", "ferrite", + "gopher", "huckleberry", "ice", + ] * 1_111, + }, +) + + +@pytest.fixture +def vector_table_id( + bigquery_client: google.cloud.bigquery.Client, + # Use non-US location to ensure location autodetection works. + table_id_not_created: str, +): + bigquery_client.load_table_from_dataframe(VECTOR_DF, table_id_not_created).result() + yield table_id_not_created + bigquery_client.delete_table(table_id_not_created, not_found_ok=True) + + +def test_create_vector_index_ivf(session, vector_table_id: str, bigquery_client: google.cloud.bigquery.Client): + bbq.create_vector_index( + vector_table_id, + "my_embedding", + distance_type="cosine", + stored_column_names=["mystery_word"], + index_type="ivf", + ivf_options={"num_lists": 3}, + session=session, + ) + + # Check that the index was created successfully. 
+ project_id, dataset_id, table_name = vector_table_id.split(".") + indexes = bigquery_client.query_and_wait( + f""" + SELECT index_catalog, index_schema, table_name, index_name, index_column_name + FROM `{project_id}`.`{dataset_id}`.INFORMATION_SCHEMA.VECTOR_INDEX_COLUMNS + WHERE table_name = '{table_name}'; + """ + ).to_dataframe() + + # There should only be one vector index. + assert len(indexes.index) == 1 + assert indexes["index_catalog"].iloc[0] == project_id + assert indexes["index_schema"].iloc[0] == dataset_id + assert indexes["table_name"].iloc[0] == table_name + assert indexes["index_column_name"].iloc[0] == "my_embedding" + + # If no name is specified, use the table name as the index name + assert indexes["index_name"].iloc[0] == table_name + + + def test_vector_search_basic_params_with_df(): search_query = bpd.DataFrame( { From 6e537d1f92082e6a50d15e72c66f2ca387042be0 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Thu, 26 Sep 2024 20:56:56 +0000 Subject: [PATCH 02/10] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/bigquery/__init__.py | 13 +++++++++--- bigframes/bigquery/_operations/approx_agg.py | 2 +- bigframes/bigquery/_operations/array.py | 3 ++- bigframes/bigquery/_operations/json.py | 1 - bigframes/bigquery/_operations/search.py | 13 ++++++------ bigframes/bigquery/_operations/struct.py | 4 ++-- bigframes/core/sql.py | 14 ++++++------- .../small/bigquery/test_vector_search.py | 21 ++++++++++++------- 8 files changed, 42 insertions(+), 29 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 9953a946d8..0b2d2d5aeb 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -16,13 +16,20 @@ such as array functions: 
https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. """ -from bigframes.bigquery._operations.array import array_length, array_agg, array_to_string from bigframes.bigquery._operations.approx_agg import approx_top_count -from bigframes.bigquery._operations.json import json_set, json_extract, json_extract_array +from bigframes.bigquery._operations.array import ( + array_agg, + array_length, + array_to_string, +) +from bigframes.bigquery._operations.json import ( + json_extract, + json_extract_array, + json_set, +) from bigframes.bigquery._operations.search import create_vector_index, vector_search from bigframes.bigquery._operations.struct import struct - __all__ = [ "array_length", "array_agg", diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py index 7717136bf8..744893ef64 100644 --- a/bigframes/bigquery/_operations/approx_agg.py +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -33,6 +33,7 @@ https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions """ + def approx_top_count( series: series.Series, number: int, @@ -67,4 +68,3 @@ def approx_top_count( if number < 1: raise ValueError("The number of approx_top_count must be at least 1") return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number)) - diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index e43063af7d..afb5c93bbc 100644 --- a/bigframes/bigquery/_operations/array.py +++ b/bigframes/bigquery/_operations/array.py @@ -32,6 +32,7 @@ import bigframes.dataframe as dataframe import bigframes.series as series + def array_length(series: series.Series) -> series.Series: """Compute the length of each array element in the Series. @@ -147,4 +148,4 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: bigframes.series.Series: A Series containing delimited strings. 
""" - return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) \ No newline at end of file + return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 197406e07a..dc3efc0200 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -142,4 +142,3 @@ def json_extract_array( bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. """ return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) - diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index 640dc44791..392538d9da 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -39,7 +39,7 @@ def create_vector_index( column_name: str, *, replace: bool = False, - index_name: Optional[str]= None, + index_name: Optional[str] = None, distance_type="cosine", stored_column_names: Collection[str] = (), index_type: str = "ivf", @@ -52,21 +52,21 @@ def create_vector_index( This method calls the `CREATE VECTOR INDEX DDL statement `_. 
- + """ import bigframes.pandas if index_name is None: index_name = table_id.split(".")[-1] - + options = { "index_type": index_type.upper(), "distance_type": distance_type.upper(), } - + if ivf_options is not None: options["ivf_options"] = json.dumps(ivf_options) - + if tree_ah_options is not None: options["tree_ah_options"] = json.dumps(tree_ah_options) @@ -86,7 +86,6 @@ def create_vector_index( read_gbq_query = session.read_gbq_query read_gbq_query(sql) - def vector_search( @@ -241,4 +240,4 @@ def vector_search( df = query._session.read_gbq(sql) df.index.names = index_labels - return df \ No newline at end of file + return df diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py index 1c77befc8e..bb8082458e 100644 --- a/bigframes/bigquery/_operations/struct.py +++ b/bigframes/bigquery/_operations/struct.py @@ -61,10 +61,10 @@ def struct(value: dataframe.DataFrame) -> series.Series: bigframes.series.Series: A new Series with struct entries representing rows of the original DataFrame """ import bigframes.series - + block = value._block block, result_id = block.apply_nary_op( block.value_columns, ops.StructOp(column_names=tuple(block.column_labels)) ) block = block.select_column(result_id) - return bigframes.series.Series(block) \ No newline at end of file + return bigframes.series.Series(block) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index a90890d2b4..0456474b41 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -139,12 +139,13 @@ def create_vector_index_ddl( storing = f"STORING({', '.join(escaped_stored)}) " else: storing = "" - - options = ", ".join([ - f"{option_name} = {simple_literal(option_value)}" - for option_name, option_value - in options.items() - ]) + + options = ", ".join( + [ + f"{option_name} = {simple_literal(option_value)}" + for option_name, option_value in options.items() + ] + ) return f""" {create} {index_name} @@ -154,7 +155,6 @@ def create_vector_index_ddl( """ - 
def create_vector_search_sql( sql_string: str, options: Mapping[str, Union[str | int | bool | float]] = {}, diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index bbe302f0de..6a259779d8 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -23,7 +23,6 @@ import bigframes.bigquery as bbq import bigframes.pandas as bpd - # Need at least 5,000 rows to create a vector index. VECTOR_DF = pd.DataFrame( { @@ -43,10 +42,17 @@ # Three groups of animal, vegetable, and mineral, corresponding to # the embeddings above. "mystery_word": [ - "aarvark", "broccoli", "calcium", - "dog", "eggplant", "ferrite", - "gopher", "huckleberry", "ice", - ] * 1_111, + "aarvark", + "broccoli", + "calcium", + "dog", + "eggplant", + "ferrite", + "gopher", + "huckleberry", + "ice", + ] + * 1_111, }, ) @@ -62,7 +68,9 @@ def vector_table_id( bigquery_client.delete_table(table_id_not_created, not_found_ok=True) -def test_create_vector_index_ivf(session, vector_table_id: str, bigquery_client: google.cloud.bigquery.Client): +def test_create_vector_index_ivf( + session, vector_table_id: str, bigquery_client: google.cloud.bigquery.Client +): bbq.create_vector_index( vector_table_id, "my_embedding", @@ -94,7 +102,6 @@ def test_create_vector_index_ivf(session, vector_table_id: str, bigquery_client: assert indexes["index_name"].iloc[0] == table_name - def test_vector_search_basic_params_with_df(): search_query = bpd.DataFrame( { From 0efaf7e0071aad404b7ab1ae73c696fcef0655f6 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Mon, 14 Oct 2024 11:08:49 -0500 Subject: [PATCH 03/10] fix lint errors --- bigframes/bigquery/__init__.py | 13 +++++++++--- bigframes/bigquery/_operations/approx_agg.py | 15 ++----------- bigframes/bigquery/_operations/array.py | 3 ++- bigframes/bigquery/_operations/json.py | 15 +++---------- bigframes/bigquery/_operations/search.py | 13 ++++++------ 
bigframes/bigquery/_operations/struct.py | 11 ++-------- bigframes/core/sql.py | 14 ++++++------- .../small/bigquery/test_vector_search.py | 21 ++++++++++++------- 8 files changed, 46 insertions(+), 59 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 9953a946d8..0b2d2d5aeb 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -16,13 +16,20 @@ such as array functions: https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. """ -from bigframes.bigquery._operations.array import array_length, array_agg, array_to_string from bigframes.bigquery._operations.approx_agg import approx_top_count -from bigframes.bigquery._operations.json import json_set, json_extract, json_extract_array +from bigframes.bigquery._operations.array import ( + array_agg, + array_length, + array_to_string, +) +from bigframes.bigquery._operations.json import ( + json_extract, + json_extract_array, + json_set, +) from bigframes.bigquery._operations.search import create_vector_index, vector_search from bigframes.bigquery._operations.struct import struct - __all__ = [ "array_length", "array_agg", diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py index 7717136bf8..696f8f5a66 100644 --- a/bigframes/bigquery/_operations/approx_agg.py +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -14,25 +14,15 @@ from __future__ import annotations -import typing - -import bigframes_vendored.constants as constants - -import bigframes.core.groupby as groupby -import bigframes.core.sql -import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops - -if typing.TYPE_CHECKING: - import bigframes.dataframe as dataframe - import bigframes.series as series - +import bigframes.series as series """ Approximate functions defined from https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions """ + def 
approx_top_count( series: series.Series, number: int, @@ -67,4 +57,3 @@ def approx_top_count( if number < 1: raise ValueError("The number of approx_top_count must be at least 1") return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number)) - diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index 31db56b832..4af1416127 100644 --- a/bigframes/bigquery/_operations/array.py +++ b/bigframes/bigquery/_operations/array.py @@ -32,6 +32,7 @@ if typing.TYPE_CHECKING: import bigframes.dataframe as dataframe + def array_length(series: series.Series) -> series.Series: """Compute the length of each array element in the Series. @@ -147,4 +148,4 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: bigframes.series.Series: A Series containing delimited strings. """ - return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) \ No newline at end of file + return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 6f676ed9a5..d3c3c97a9c 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -21,23 +21,15 @@ from __future__ import annotations -import typing +from typing import Any, Sequence, Tuple -import bigframes_vendored.constants as constants - -import bigframes.core.groupby as groupby -import bigframes.core.sql import bigframes.operations as ops -import bigframes.operations.aggregations as agg_ops import bigframes.series as series -if typing.TYPE_CHECKING: - import bigframes.dataframe as dataframe - def json_set( series: series.Series, - json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]], + json_path_value_pairs: Sequence[Tuple[str, Any]], ) -> series.Series: """Produces a new JSON value within a Series by inserting or replacing values at specified paths. 
@@ -57,7 +49,7 @@ def json_set( Args: series (bigframes.series.Series): The Series containing JSON data (as native JSON objects or JSON-formatted strings). - json_path_value_pairs (Sequence[Tuple[str, typing.Any]]): + json_path_value_pairs (Sequence[Tuple[str, Any]]): Pairs of JSON path and the new value to insert/replace. Returns: @@ -142,4 +134,3 @@ def json_extract_array( bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. """ return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) - diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index 47fce80492..ea5ff13630 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -39,7 +39,7 @@ def create_vector_index( column_name: str, *, replace: bool = False, - index_name: Optional[str]= None, + index_name: Optional[str] = None, distance_type="cosine", stored_column_names: Collection[str] = (), index_type: str = "ivf", @@ -52,21 +52,21 @@ def create_vector_index( This method calls the `CREATE VECTOR INDEX DDL statement `_. 
- + """ import bigframes.pandas if index_name is None: index_name = table_id.split(".")[-1] - + options = { "index_type": index_type.upper(), "distance_type": distance_type.upper(), } - + if ivf_options is not None: options["ivf_options"] = json.dumps(ivf_options) - + if tree_ah_options is not None: options["tree_ah_options"] = json.dumps(tree_ah_options) @@ -86,7 +86,6 @@ def create_vector_index( read_gbq_query = session.read_gbq_query read_gbq_query(sql) - def vector_search( @@ -241,4 +240,4 @@ def vector_search( else: df = query._session.read_gbq(sql) - return df \ No newline at end of file + return df diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py index 1c77befc8e..7cb826351c 100644 --- a/bigframes/bigquery/_operations/struct.py +++ b/bigframes/bigquery/_operations/struct.py @@ -22,16 +22,11 @@ import typing -import bigframes_vendored.constants as constants - -import bigframes.core.groupby as groupby -import bigframes.core.sql import bigframes.operations as ops -import bigframes.operations.aggregations as agg_ops +import bigframes.series as series if typing.TYPE_CHECKING: import bigframes.dataframe as dataframe - import bigframes.series as series def struct(value: dataframe.DataFrame) -> series.Series: @@ -60,11 +55,9 @@ def struct(value: dataframe.DataFrame) -> series.Series: Returns: bigframes.series.Series: A new Series with struct entries representing rows of the original DataFrame """ - import bigframes.series - block = value._block block, result_id = block.apply_nary_op( block.value_columns, ops.StructOp(column_names=tuple(block.column_labels)) ) block = block.select_column(result_id) - return bigframes.series.Series(block) \ No newline at end of file + return series.Series(block) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 268f52b23a..fac74ec06c 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -137,12 +137,13 @@ def create_vector_index_ddl( storing = f"STORING({', 
'.join(escaped_stored)}) " else: storing = "" - - options = ", ".join([ - f"{option_name} = {simple_literal(option_value)}" - for option_name, option_value - in options.items() - ]) + + options = ", ".join( + [ + f"{option_name} = {simple_literal(option_value)}" + for option_name, option_value in options.items() + ] + ) return f""" {create} {index_name} @@ -152,7 +153,6 @@ def create_vector_index_ddl( """ - def create_vector_search_sql( sql_string: str, options: Mapping[str, Union[str | int | bool | float]] = {}, diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index bbe302f0de..6a259779d8 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -23,7 +23,6 @@ import bigframes.bigquery as bbq import bigframes.pandas as bpd - # Need at least 5,000 rows to create a vector index. VECTOR_DF = pd.DataFrame( { @@ -43,10 +42,17 @@ # Three groups of animal, vegetable, and mineral, corresponding to # the embeddings above. 
"mystery_word": [ - "aarvark", "broccoli", "calcium", - "dog", "eggplant", "ferrite", - "gopher", "huckleberry", "ice", - ] * 1_111, + "aarvark", + "broccoli", + "calcium", + "dog", + "eggplant", + "ferrite", + "gopher", + "huckleberry", + "ice", + ] + * 1_111, }, ) @@ -62,7 +68,9 @@ def vector_table_id( bigquery_client.delete_table(table_id_not_created, not_found_ok=True) -def test_create_vector_index_ivf(session, vector_table_id: str, bigquery_client: google.cloud.bigquery.Client): +def test_create_vector_index_ivf( + session, vector_table_id: str, bigquery_client: google.cloud.bigquery.Client +): bbq.create_vector_index( vector_table_id, "my_embedding", @@ -94,7 +102,6 @@ def test_create_vector_index_ivf(session, vector_table_id: str, bigquery_client: assert indexes["index_name"].iloc[0] == table_name - def test_vector_search_basic_params_with_df(): search_query = bpd.DataFrame( { From 7c4167700753a63cfa50088df3046a0cd8d9f5f7 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Mon, 14 Oct 2024 11:10:32 -0500 Subject: [PATCH 04/10] format --- bigframes/bigquery/_operations/array.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py index 3c1d7239e4..4af1416127 100644 --- a/bigframes/bigquery/_operations/array.py +++ b/bigframes/bigquery/_operations/array.py @@ -33,7 +33,6 @@ import bigframes.dataframe as dataframe - def array_length(series: series.Series) -> series.Series: """Compute the length of each array element in the Series. 
From 7ac22fa7fc15fa443cf6c18ef23c503d002d832e Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 15 Oct 2024 10:21:33 -0500 Subject: [PATCH 05/10] fix mypy --- bigframes/core/sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index fac74ec06c..25d5c1dd41 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -138,7 +138,7 @@ def create_vector_index_ddl( else: storing = "" - options = ", ".join( + rendered_options = ", ".join( [ f"{option_name} = {simple_literal(option_value)}" for option_name, option_value in options.items() @@ -149,7 +149,7 @@ def create_vector_index_ddl( {create} {index_name} ON `{table_name}`(`{column_name}`) {storing} - OPTIONS({options}); + OPTIONS({rendered_options}); """ From e704cdae9f6d7c4cd9abe893d1a45bf45971953f Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 15 Oct 2024 12:27:43 -0500 Subject: [PATCH 06/10] fix test for older google-cloud-bigquery --- tests/system/small/bigquery/test_vector_search.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 6a259779d8..3c3526319c 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -63,7 +63,19 @@ def vector_table_id( # Use non-US location to ensure location autodetection works. 
table_id_not_created: str, ): - bigquery_client.load_table_from_dataframe(VECTOR_DF, table_id_not_created).result() + table = google.cloud.bigquery.Table( + table_id_not_created, + [ + {"name": "rowid", "type": "INT64"}, + {"name": "my_embedding", "type": "FLOAT64", "mode": "REPEATED"}, + {"name": "mystery_word", "type": "STRING"}, + ], + ) + bigquery_client.create_table(table) + bigquery_client.load_table_from_json( + VECTOR_DF.to_dict(orient="records"), + table_id_not_created, + ) yield table_id_not_created bigquery_client.delete_table(table_id_not_created, not_found_ok=True) From 53e23438ab28569d610b9b66b08ce37022e01b62 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 15 Oct 2024 12:35:43 -0500 Subject: [PATCH 07/10] fix type error --- tests/system/small/bigquery/test_vector_search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 3c3526319c..1154f777fd 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -13,6 +13,7 @@ # limitations under the License. 
import random +from typing import Any, cast, Iterable, Mapping import google.cloud.bigquery import numpy as np @@ -73,7 +74,7 @@ def vector_table_id( ) bigquery_client.create_table(table) bigquery_client.load_table_from_json( - VECTOR_DF.to_dict(orient="records"), + cast(Iterable[Mapping[str, Any]], VECTOR_DF.to_dict(orient="records")), table_id_not_created, ) yield table_id_not_created From 77ad1a384b15865fa6e5922379b3aa79eacde9ef Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 15 Oct 2024 12:52:27 -0500 Subject: [PATCH 08/10] fix typing --- tests/system/small/bigquery/test_vector_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 1154f777fd..06eb8fd680 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -13,7 +13,7 @@ # limitations under the License. import random -from typing import Any, cast, Iterable, Mapping +from typing import Any, cast, Dict, Iterable import google.cloud.bigquery import numpy as np @@ -74,7 +74,7 @@ def vector_table_id( ) bigquery_client.create_table(table) bigquery_client.load_table_from_json( - cast(Iterable[Mapping[str, Any]], VECTOR_DF.to_dict(orient="records")), + cast(Iterable[Dict[str, Any]], VECTOR_DF.to_dict(orient="records")), table_id_not_created, ) yield table_id_not_created From 5cc3d468db58a3c7d610006f726cadd36ae1cc08 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 16 Oct 2024 08:35:54 -0500 Subject: [PATCH 09/10] use googlesql.identifier to escape column and table ids --- bigframes/bigquery/_operations/search.py | 4 +++- bigframes/core/sql.py | 14 ++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index ea5ff13630..496e259944 100644 --- a/bigframes/bigquery/_operations/search.py +++ 
b/bigframes/bigquery/_operations/search.py @@ -19,6 +19,7 @@ from typing import Collection, Literal, Mapping, Optional, Union import bigframes_vendored.constants as constants +import google.cloud.bigquery as bigquery import bigframes.core.sql import bigframes.ml.utils as utils @@ -57,7 +58,8 @@ def create_vector_index( import bigframes.pandas if index_name is None: - index_name = table_id.split(".")[-1] + table_ref = bigquery.TableReference.from_string(table_id) + index_name = table_ref.table_id options = { "index_type": index_type.upper(), diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 25d5c1dd41..d5dfc64ddd 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -19,7 +19,7 @@ import datetime import math -from typing import Collection, Iterable, Mapping, TYPE_CHECKING, Union +from typing import cast, Collection, Iterable, Mapping, TYPE_CHECKING, Union import bigframes.core.compile.googlesql as googlesql @@ -133,7 +133,9 @@ def create_vector_index_ddl( create = "CREATE VECTOR INDEX IF NOT EXISTS " if len(stored_column_names) > 0: - escaped_stored = [f"`{name}`" for name in stored_column_names] + escaped_stored = [ + f"{googlesql.identifier(name)}" for name in stored_column_names + ] storing = f"STORING({', '.join(escaped_stored)}) " else: storing = "" @@ -146,8 +148,8 @@ def create_vector_index_ddl( ) return f""" - {create} {index_name} - ON `{table_name}`(`{column_name}`) + {create} {googlesql.identifier(index_name)} + ON {googlesql.identifier(table_name)}({googlesql.identifier(column_name)}) {storing} OPTIONS({rendered_options}); """ @@ -172,7 +174,7 @@ def create_vector_search_sql( base.*, distance, FROM VECTOR_SEARCH( - TABLE `{base_table}`, + TABLE {googlesql.identifier(cast(str, base_table))}, {simple_literal(column_to_search)}, ({sql_string}), {simple_literal(query_column_to_search)}, @@ -187,7 +189,7 @@ def create_vector_search_sql( base.*, distance, FROM VECTOR_SEARCH( - TABLE `{base_table}`, + TABLE 
{googlesql.identifier(cast(str, base_table))}, {simple_literal(column_to_search)}, ({sql_string}), distance_type => {simple_literal(distance_type)}, From 13badf861ac32162ffbbfcf2850f5957aa8f1817 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 17 Oct 2024 16:34:14 -0500 Subject: [PATCH 10/10] wait for job to finish --- tests/system/small/bigquery/test_vector_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 06eb8fd680..b6a6d59c4c 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -76,7 +76,7 @@ def vector_table_id( bigquery_client.load_table_from_json( cast(Iterable[Dict[str, Any]], VECTOR_DF.to_dict(orient="records")), table_id_not_created, - ) + ).result() yield table_id_not_created bigquery_client.delete_table(table_id_not_created, not_found_ok=True)