diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 64a0df9d86..afa13375b1 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1110,13 +1110,22 @@ def summarize( index_columns=[label_col_id], ) - def corr(self): - """Returns a block object to compute the self-correlation on this block.""" + def calculate_pairwise_metric(self, op=agg_ops.CorrOp()): + """ + Returns a block object to compute pairwise metrics among all value columns in this block. + + The metric to be computed is specified by the `op` parameter, which can be either a + correlation operation (default) or a covariance operation. + """ + if len(self.value_columns) > 30: + raise NotImplementedError( + "This function supports dataframes with 30 columns or fewer. " + f"Provided dataframe has {len(self.value_columns)} columns. {constants.FEEDBACK_LINK}" + ) + aggregations = [ ( - ex.BinaryAggregation( - agg_ops.CorrOp(), ex.free_var(left_col), ex.free_var(right_col) - ), + ex.BinaryAggregation(op, ex.free_var(left_col), ex.free_var(right_col)), f"{left_col}-{right_col}", ) for left_col in self.value_columns diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 051796535b..07dae2c53b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1019,17 +1019,21 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr raise NotImplementedError( f"min_periods not yet supported. {constants.FEEDBACK_LINK}" ) - if len(self.columns) > 30: - raise NotImplementedError( - f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}" - ) if not numeric_only: frame = self._raise_on_non_numeric("corr") else: frame = self._drop_non_numeric() - return DataFrame(frame._block.corr()) + return DataFrame(frame._block.calculate_pairwise_metric(op=agg_ops.CorrOp())) + + def cov(self, *, numeric_only: bool = False) -> DataFrame: + if not numeric_only: + frame = self._raise_on_non_numeric("corr") + else: + frame = self._drop_non_numeric() + + return DataFrame(frame._block.calculate_pairwise_metric(agg_ops.CovOp())) def to_pandas( self, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 93bfd8d35c..99ee6680fa 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1916,6 +1916,34 @@ def test_corr_w_invalid_parameters(scalars_dfs): scalars_df[columns].corr(min_periods=1) +@pytest.mark.parametrize( + ("columns", "numeric_only"), + [ + (["bool_col", "int64_col", "float64_col"], True), + (["bool_col", "int64_col", "float64_col"], False), + (["bool_col", "int64_col", "float64_col", "string_col"], True), + pytest.param( + ["bool_col", "int64_col", "float64_col", "string_col"], + False, + marks=pytest.mark.xfail( + raises=NotImplementedError, + ), + ), + ], +) +def test_cov_w_numeric_only(scalars_dfs, columns, numeric_only): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas() + pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("op"), [ diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index f348758c93..d585d4f73e 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -921,6 +921,27 @@ def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index): ) +def test_cov_w_multi_index(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "float64_col", "int64_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "b"], [1, 2, 2])) + + bf = scalars_df_index[columns].copy() + bf.columns = multi_columns + + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf.cov(numeric_only=True).to_pandas() + pd_result = pd_df.cov(numeric_only=True) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses string, Pandas uses object. + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("index_names",), [ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 876a85f1a5..d70d3827e7 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2834,10 +2834,38 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame: Include only float, int, boolean, decimal data. Returns: - DataFrame: Correlation matrix. + DataFrame: Correlation matrix. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def cov(self, *, numeric_only) -> DataFrame: + """ + Compute pairwise covariance of columns, excluding NA/null values. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600], + ... 'C': [0.8, 0.4, 0.9]}) + >>> df.cov(numeric_only=True) + A B C + A 1.0 100.0 0.05 + B 100.0 10000.0 5.0 + C 0.05 5.0 0.07 + + [3 rows x 3 columns] + + Args: + numeric_only(bool, default False): + Include only float, int, boolean, decimal data. + + Returns: + DataFrame: The covariance matrix of the series of the DataFrame. + """ + def update( self, other, join: str = "left", overwrite: bool = True, filter_func=None ) -> DataFrame: