From 0f763ed8b7ec4cfe30829542697e332f79596a48 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Mon, 28 Apr 2025 11:42:17 -0500 Subject: [PATCH] fix: support large lists of lists in bpd.Series() constructor --- bigframes/session/__init__.py | 7 +++ bigframes/session/loader.py | 7 +++ tests/system/small/bigquery/test_array.py | 56 ++++++++++++++++++++--- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 6379a6f2e8..3dcae489b0 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1055,6 +1055,13 @@ def read_parquet( if engine == "bigquery": job_config = bigquery.LoadJobConfig() job_config.source_format = bigquery.SourceFormat.PARQUET + + # Ensure we can load pyarrow.list_ / BQ ARRAY type. + # See internal issue 414374215. + parquet_options = bigquery.ParquetOptions() + parquet_options.enable_list_inference = True + job_config.parquet_options = parquet_options + job_config.labels = {"bigframes-api": "read_parquet"} return self._loader.read_bigquery_load_job(path, job_config=job_config) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 76f12ae438..dc701e919a 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -217,6 +217,13 @@ def load_data( job_config = bigquery.LoadJobConfig() job_config.source_format = bigquery.SourceFormat.PARQUET + + # Ensure we can load pyarrow.list_ / BQ ARRAY type. + # See internal issue 414374215. + parquet_options = bigquery.ParquetOptions() + parquet_options.enable_list_inference = True + job_config.parquet_options = parquet_options + job_config.schema = bq_schema if api_name: job_config.labels = {"bigframes-api": api_name} diff --git a/tests/system/small/bigquery/test_array.py b/tests/system/small/bigquery/test_array.py index d6823a3a54..2ceb90e22c 100644 --- a/tests/system/small/bigquery/test_array.py +++ b/tests/system/small/bigquery/test_array.py @@ -17,17 +17,61 @@ import pytest import bigframes.bigquery as bbq +import bigframes.dtypes import bigframes.pandas as bpd -def test_array_length(): - series = bpd.Series([["A", "AA", "AAA"], ["BB", "B"], np.nan, [], ["C"]]) - # TODO(b/336880368): Allow for NULL values to be input for ARRAY columns. - # Once we actually store NULL values, this will be NULL where the input is NULL. - expected = bpd.Series([3, 2, 0, 0, 1]) +@pytest.mark.parametrize( + ["input_data", "expected"], + [ + pytest.param( + [["A", "AA", "AAA"], ["BB", "B"], np.nan, [], ["C"]], + [ + 3, + 2, + # TODO(b/336880368): Allow for NULL values to be input for ARRAY + # columns. Once we actually store NULL values, this will be + # NULL where the input is NULL. + 0, + 0, + 1, + ], + id="small-string", + ), + pytest.param( + [[1, 2, 3], [4, 5], [], [], [6]], [3, 2, 0, 0, 1], id="small-int64" + ), + pytest.param( + [ + # Regression test for b/414374215 where the Series constructor + # returns empty lists when the lists are too big to embed in + # SQL. + list(np.random.randint(-1_000_000, 1_000_000, size=1000)), + list(np.random.randint(-1_000_000, 1_000_000, size=967)), + list(np.random.randint(-1_000_000, 1_000_000, size=423)), + list(np.random.randint(-1_000_000, 1_000_000, size=5000)), + list(np.random.randint(-1_000_000, 1_000_000, size=1003)), + list(np.random.randint(-1_000_000, 1_000_000, size=9999)), + ], + [ + 1000, + 967, + 423, + 5000, + 1003, + 9999, + ], + id="larger-int64", + ), + ], +) +def test_array_length(input_data, expected): + series = bpd.Series(input_data) + expected = pd.Series(expected, dtype=bigframes.dtypes.INT_DTYPE) pd.testing.assert_series_equal( bbq.array_length(series).to_pandas(), - expected.to_pandas(), + expected, + check_index_type=False, )