diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py
index 1330d04589..baa29ba72b 100644
--- a/bigframes/core/local_data.py
+++ b/bigframes/core/local_data.py
@@ -212,7 +212,7 @@ def _(
     value_generator = iter_array(
         array.flatten(), bigframes.dtypes.get_array_inner_type(dtype)
     )
-    for (start, end) in itertools.pairwise(array.offsets):
+    for (start, end) in _pairwise(array.offsets):
         arr_size = end.as_py() - start.as_py()
         yield list(itertools.islice(value_generator, arr_size))
 
@@ -389,3 +389,18 @@ def _physical_type_replacements(dtype: pa.DataType) -> pa.DataType:
     if dtype in _ARROW_MANAGED_STORAGE_OVERRIDES:
         return _ARROW_MANAGED_STORAGE_OVERRIDES[dtype]
     return dtype
+
+
+def _pairwise(iterable):
+    """Yield each pair of adjacent items from ``iterable``.
+
+    Behaves like ``itertools.pairwise`` (added in Python 3.10) without
+    requiring it, e.g. ``[1, 2, 3]`` produces ``(1, 2)`` then ``(2, 3)``.
+    Inputs with fewer than two items produce no pairs.
+    """
+    iterator = iter(iterable)
+    # The default keeps an empty input from raising StopIteration here.
+    previous = next(iterator, None)
+    for current in iterator:
+        yield (previous, current)
+        previous = current
diff --git a/tests/unit/test_local_data.py b/tests/unit/test_local_data.py
new file mode 100644
index 0000000000..9cd08787c9
--- /dev/null
+++ b/tests/unit/test_local_data.py
@@ -0,0 +1,47 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pandas as pd
+import pandas.testing
+import pyarrow as pa
+
+from bigframes import dtypes
+from bigframes.core import local_data
+
+pd_data = pd.DataFrame(
+    {
+        "ints": [10, 20, 30, 40],
+        "nested_ints": [[1, 2], [3, 4, 5], [], [20, 30]],
+        "structs": [{"a": 100}, {}, {"b": 200}, {"b": 300}],
+    }
+)
+
+pd_data_normalized = pd.DataFrame(
+    {
+        "ints": pd.Series([10, 20, 30, 40], dtype=dtypes.INT_DTYPE),
+        "nested_ints": pd.Series(
+            [[1, 2], [3, 4, 5], [], [20, 30]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))
+        ),
+        "structs": pd.Series(
+            [{"a": 100}, {}, {"b": 200}, {"b": 300}],
+            dtype=pd.ArrowDtype(pa.struct({"a": pa.int64(), "b": pa.int64()})),
+        ),
+    }
+)
+
+
+def test_local_data_well_formed_round_trip():
+    """Values survive a pandas -> ManagedArrowTable -> itertuples round trip."""
+    local_entry = local_data.ManagedArrowTable.from_pandas(pd_data)
+    result = pd.DataFrame(local_entry.itertuples(), columns=pd_data.columns)
+    pandas.testing.assert_frame_equal(pd_data_normalized, result, check_dtype=False)