From e156f81278d4a1037966ffa20eaca937337cc0dd Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Thu, 13 Jun 2024 19:54:38 +0000 Subject: [PATCH 1/4] fix: df.loc with the 2nd input as bigframes boolean Series --- bigframes/core/indexers.py | 7 ++++++- tests/system/small/test_dataframe.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 582141d539..f21228146a 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -159,7 +159,12 @@ def __getitem__(self, key): ) columns = key[1] - if isinstance(columns, pd.Series) and columns.dtype == "bool": + if isinstance(columns, bigframes.series.Series): + columns = columns.to_pandas() + if isinstance(columns, pd.Series) and columns.dtype in ( + "bool", + "boolean", + ): # TODO(b/340892590): fix type error columns = df.columns[columns] # type: ignore diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 93db0c4196..3c003b1a52 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3020,6 +3020,21 @@ def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_in ) +def test_loc_select_with_column_condition_bf_series( + scalars_df_index, scalars_pandas_df_index +): + bf_result = scalars_df_index.loc[ + :, series.Series(scalars_df_index.dtypes == "Int64") + ].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + :, scalars_pandas_df_index.dtypes == "Int64" + ] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): scalars_df_index = scalars_df_index.set_index("string_col", drop=False) scalars_pandas_df_index = scalars_pandas_df_index.set_index( From cbcaa866d00701048d1258522e431907c4d72d8c Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Thu, 13 Jun 2024 20:17:24 +0000 Subject: [PATCH 2/4] merge main --- 
bigframes/core/indexers.py | 13 +++---- bigframes/core/indexes/base.py | 5 +++ bigframes/dataframe.py | 19 +++++++-- bigframes/series.py | 5 +++ tests/system/load/test_large_tables.py | 5 +-- .../system/small/operations/test_plotting.py | 6 ++- tests/unit/core/test_indexes.py | 39 +++++++++++++++++++ tests/unit/test_dataframe.py | 14 +++++++ tests/unit/test_series.py | 27 +++++++++++++ 9 files changed, 118 insertions(+), 15 deletions(-) create mode 100644 tests/unit/core/test_indexes.py create mode 100644 tests/unit/test_series.py diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index f21228146a..dae5eada70 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -162,11 +162,10 @@ def __getitem__(self, key): if isinstance(columns, bigframes.series.Series): columns = columns.to_pandas() if isinstance(columns, pd.Series) and columns.dtype in ( - "bool", - "boolean", + bool, + pd.BooleanDtype(), ): - # TODO(b/340892590): fix type error - columns = df.columns[columns] # type: ignore + columns = df.columns[typing.cast(pd.Series, columns)] return df[columns] @@ -257,7 +256,7 @@ def __getitem__(self, key: tuple) -> bigframes.core.scalar.Scalar: raise ValueError(error_message) if len(key) != 2: raise TypeError(error_message) - block: bigframes.core.blocks.Block = self._dataframe._block # type: ignore + block: bigframes.core.blocks.Block = self._dataframe._block column_block = block.select_columns([block.value_columns[key[1]]]) column = bigframes.series.Series(column_block) return column.iloc[key[0]] @@ -381,14 +380,14 @@ def _perform_loc_list_join( ) result = result.rename(original_name) else: - result = series_or_dataframe._perform_join_by_index(keys_index, how="right") # type: ignore + result = series_or_dataframe._perform_join_by_index(keys_index, how="right") if drop_levels and series_or_dataframe.index.nlevels > keys_index.nlevels: # drop common levels levels_to_drop = [ name for name in series_or_dataframe.index.names if name 
in keys_index.names ] - result = result.droplevel(levels_to_drop) # type: ignore + result = result.droplevel(levels_to_drop) return result diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 0e5082447a..8df6155591 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -243,6 +243,11 @@ def query_job(self) -> Optional[bigquery.QueryJob]: return self._query_job def __repr__(self) -> str: + # Protect against errors with uninitialized Series. See: + # https://github.com/googleapis/python-bigquery-dataframes/issues/728 + if not hasattr(self, "_block"): + return object.__repr__(self) + # TODO(swast): Add a timeout here? If the query is taking a long time, # maybe we just print the job metadata that we have so far? # TODO(swast): Avoid downloading the whole series by using job diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f78dee1642..9763b68fef 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -574,9 +574,18 @@ def _getitem_bool_series(self, key: bigframes.series.Series) -> DataFrame: return DataFrame(block) def __getattr__(self, key: str): + # Protect against recursion errors with uninitialized DataFrame + # objects. See: + # https://github.com/googleapis/python-bigquery-dataframes/issues/728 + # and + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html + if key == "_block": + raise AttributeError("_block") + if key in self._block.column_labels: return self.__getitem__(key) - elif hasattr(pandas.DataFrame, key): + + if hasattr(pandas.DataFrame, key): raise AttributeError( textwrap.dedent( f""" @@ -585,8 +594,7 @@ def __getattr__(self, key: str): """ ) ) - else: - raise AttributeError(key) + raise AttributeError(key) def __setattr__(self, key: str, value): if key in ["_block", "_query_job"]: @@ -616,6 +624,11 @@ def __repr__(self) -> str: Only represents the first `bigframes.options.display.max_rows`. 
""" + # Protect against errors with uninitialized DataFrame. See: + # https://github.com/googleapis/python-bigquery-dataframes/issues/728 + if not hasattr(self, "_block"): + return object.__repr__(self) + opts = bigframes.options.display max_results = opts.max_rows if opts.repr_mode == "deferred": diff --git a/bigframes/series.py b/bigframes/series.py index d858060aec..cb56319471 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -281,6 +281,11 @@ def reset_index( return bigframes.dataframe.DataFrame(block) def __repr__(self) -> str: + # Protect against errors with uninitialized Series. See: + # https://github.com/googleapis/python-bigquery-dataframes/issues/728 + if not hasattr(self, "_block"): + return object.__repr__(self) + # TODO(swast): Add a timeout here? If the query is taking a long time, # maybe we just print the job metadata that we have so far? # TODO(swast): Avoid downloading the whole series by using job diff --git a/tests/system/load/test_large_tables.py b/tests/system/load/test_large_tables.py index f92207b191..472be3d2ad 100644 --- a/tests/system/load/test_large_tables.py +++ b/tests/system/load/test_large_tables.py @@ -94,8 +94,7 @@ def test_to_pandas_large_table(): # df will be downloaded locally expected_row_count, expected_column_count = df.shape - # TODO(b/340893653): fix type error - df = df.to_pandas() # type: ignore - row_count, column_count = df.shape + df_converted = df.to_pandas() + row_count, column_count = df_converted.shape assert column_count == expected_column_count assert row_count == expected_row_count diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py index e0ef84641c..7be44e0a0f 100644 --- a/tests/system/small/operations/test_plotting.py +++ b/tests/system/small/operations/test_plotting.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from matplotlib.collections import PathCollection import numpy as np import pandas as pd import pandas._testing as tm @@ -258,9 +259,10 @@ def test_scatter_args_s(s): ax = df.plot.scatter(x="a", y="b", s="s") pd_ax = pd_df.plot.scatter(x="a", y="b", s="s") - # TODO(b/340891723): fix type error + + assert isinstance(pd_ax.collections[0], PathCollection) tm.assert_numpy_array_equal( - ax.collections[0].get_sizes(), pd_ax.collections[0].get_sizes() # type: ignore + ax.collections[0].get_sizes(), pd_ax.collections[0].get_sizes() ) diff --git a/tests/unit/core/test_indexes.py b/tests/unit/core/test_indexes.py new file mode 100644 index 0000000000..6e739c9dc9 --- /dev/null +++ b/tests/unit/core/test_indexes.py @@ -0,0 +1,39 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes.core.indexes + + +def test_index_repr_with_uninitialized_object(): + """Ensures Index.__init__ can be paused in a visual debugger without crashing. + + Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/728 + """ + # Avoid calling __init__ to simulate pausing __init__ in a debugger. + # https://stackoverflow.com/a/6384982/101923 + index = object.__new__(bigframes.core.indexes.Index) + got = repr(index) + assert "Index" in got + + +def test_multiindex_repr_with_uninitialized_object(): + """Ensures MultiIndex.__init__ can be paused in a visual debugger without crashing. 
+ + Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/728 + """ + # Avoid calling __init__ to simulate pausing __init__ in a debugger. + # https://stackoverflow.com/a/6384982/101923 + index = object.__new__(bigframes.core.indexes.MultiIndex) + got = repr(index) + assert "MultiIndex" in got diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 17a8290889..6370d1b987 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -15,9 +15,23 @@ import google.cloud.bigquery import pytest +import bigframes.dataframe + from . import resources +def test_dataframe_repr_with_uninitialized_object(): + """Ensures DataFrame.__init__ can be paused in a visual debugger without crashing. + + Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/728 + """ + # Avoid calling __init__ to simulate pausing __init__ in a debugger. + # https://stackoverflow.com/a/6384982/101923 + dataframe = bigframes.dataframe.DataFrame.__new__(bigframes.dataframe.DataFrame) + got = repr(dataframe) + assert "DataFrame" in got + + def test_dataframe_to_gbq_invalid_destination(monkeypatch: pytest.MonkeyPatch): dataframe = resources.create_dataframe(monkeypatch) diff --git a/tests/unit/test_series.py b/tests/unit/test_series.py new file mode 100644 index 0000000000..1409209c6c --- /dev/null +++ b/tests/unit/test_series.py @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes.series + + +def test_series_repr_with_uninitialized_object(): + """Ensures Series.__init__ can be paused in a visual debugger without crashing. + + Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/728 + """ + # Avoid calling __init__ to simulate pausing __init__ in a debugger. + # https://stackoverflow.com/a/6384982/101923 + series = bigframes.series.Series.__new__(bigframes.series.Series) + got = repr(series) + assert "Series" in got From ff377e28f7f4800874104a9c2dd50d4f6f28ef44 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Thu, 13 Jun 2024 21:20:14 +0000 Subject: [PATCH 3/4] modify test case --- tests/system/small/test_dataframe.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 3c003b1a52..39817492ec 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3023,11 +3023,19 @@ def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_in def test_loc_select_with_column_condition_bf_series( scalars_df_index, scalars_pandas_df_index ): + # (b/347072677) GEOGRAPH type doesn't support UNIQUE op + columns = [ + item for item in scalars_pandas_df_index.columns if item != "geography_col" + ] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + size_half = len(scalars_pandas_df_index) / 2 bf_result = scalars_df_index.loc[ - :, series.Series(scalars_df_index.dtypes == "Int64") + :, scalars_df_index.nunique() > size_half ].to_pandas() pd_result = scalars_pandas_df_index.loc[ - :, scalars_pandas_df_index.dtypes == "Int64" + :, scalars_pandas_df_index.nunique() > size_half ] pd.testing.assert_frame_equal( bf_result, From 569c3c1f07cebbc704b1f9c752e8d6816db21add Mon Sep 17 00:00:00 2001 
From: Garrett Wu Date: Thu, 13 Jun 2024 21:22:32 +0000 Subject: [PATCH 4/4] fix comment --- tests/system/small/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 39817492ec..782ef2d5ea 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3023,7 +3023,7 @@ def test_loc_select_with_column_condition_bf_series( scalars_df_index, scalars_pandas_df_index ): - # (b/347072677) GEOGRAPH type doesn't support UNIQUE op + # (b/347072677) GEOGRAPHY type doesn't support DISTINCT op columns = [ item for item in scalars_pandas_df_index.columns if item != "geography_col" ]