From 032f7004f4dde9ac9fa6403578d107c2a759c057 Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Fri, 3 May 2024 23:41:33 +0000 Subject: [PATCH 1/5] fix: fix bug with na in the column labels in stack --- bigframes/core/__init__.py | 6 +++++- tests/system/small/test_multiindex.py | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 185ce7cd4f..fcb08de8ec 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -429,7 +429,11 @@ def _create_unpivot_labels_array( for row_offset in range(len(former_column_labels)): row_label = former_column_labels[row_offset] row_label = (row_label,) if not isinstance(row_label, tuple) else row_label - row = {col_ids[i]: row_label[i] for i in range(len(col_ids))} + row = { + col_ids[i]: row_label[i] + for i in range(len(col_ids)) + if pandas.notnull(row_label[i]) + } rows.append(row) return ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=self.session) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index bb0af52976..a6fd4e3345 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -1191,3 +1191,20 @@ def test_explode_w_multi_index(): check_dtype=False, check_index_type=False, ) + + +def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "rowindex_2"] + level1 = pandas.Index(["b", "c", "d"]) + # Need resulting column to be pyarrow string rather than object dtype + level2 = pandas.Index([None, "b", "b"], dtype="string[pyarrow]") + multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + pd_result = pd_df.stack() + bf_result = bf_df.stack().to_pandas() + # Pandas produces NaN, where bq dataframes produces pd.NA + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) From 87cd61880d43bc36911f17a85ef257510c319cdd Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Mon, 6 May 2024 16:36:02 +0000 Subject: [PATCH 2/5] use python None instead --- bigframes/core/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index fcb08de8ec..89049713df 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -430,9 +430,8 @@ def _create_unpivot_labels_array( row_label = former_column_labels[row_offset] row_label = (row_label,) if not isinstance(row_label, tuple) else row_label row = { - col_ids[i]: row_label[i] + col_ids[i]: row_label[i] if pandas.notnull(row_label[i]) else None for i in range(len(col_ids)) - if pandas.notnull(row_label[i]) } rows.append(row) From be8c8ff6748ec9f9ebf86e543d445615cb12d289 Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Mon, 6 May 2024 16:47:08 +0000 Subject: [PATCH 3/5] add parens for clarity --- bigframes/core/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 89049713df..eef0efcf83 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -430,7 +430,7 @@ def _create_unpivot_labels_array( row_label = former_column_labels[row_offset] row_label = (row_label,) if not isinstance(row_label, tuple) else row_label row = { - col_ids[i]: row_label[i] if pandas.notnull(row_label[i]) else None + col_ids[i]: (row_label[i] if pandas.notnull(row_label[i]) else None) for i in range(len(col_ids)) } rows.append(row) From 70c5d2246f22e94edea2ae4361d8a45d289a1ff4 Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Mon, 6 May 2024 18:16:00 +0000 Subject: [PATCH 4/5] allow NA to match nan --- tests/system/small/test_multiindex.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index a6fd4e3345..b0c4109758 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -1206,5 +1206,7 @@ def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index pd_result = pd_df.stack() bf_result = bf_df.stack().to_pandas() + # Pandas produces NaN, where bq dataframes produces pd.NA + pd_result["c"] = pd_result["c"].replace(pandas.NA, np.nan) pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) From c06f49222138c6a6c550ecbb9c634f7b3116b7a0 Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Mon, 6 May 2024 18:17:39 +0000 Subject: [PATCH 5/5] fix comment --- tests/system/small/test_multiindex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index b0c4109758..613ad945c1 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -1207,6 +1207,6 @@ def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index pd_result = pd_df.stack() bf_result = bf_df.stack().to_pandas() - # Pandas produces NaN, where bq dataframes produces pd.NA + # Pandas produces pd.NA, where bq dataframes produces NaN pd_result["c"] = pd_result["c"].replace(pandas.NA, np.nan) pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)