From 6261df8c4da9fe2b180194666eeecc823831f104 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 20 Feb 2025 22:55:37 +0000 Subject: [PATCH 01/15] [WIP] Implement date_diff, date_add, and date_sub. Some tests are broken. --- bigframes/core/compile/scalar_op_compiler.py | 15 ++++ bigframes/core/rewrite/timedeltas.py | 14 ++++ bigframes/operations/__init__.py | 6 ++ bigframes/operations/date_ops.py | 23 ++++++ bigframes/operations/numeric_ops.py | 14 +++- bigframes/operations/timedelta_ops.py | 47 +++++++++++ tests/system/small/operations/test_dates.py | 61 ++++++++++++++ .../small/operations/test_timedeltas.py | 81 ++++++++++++++++++- 8 files changed, 258 insertions(+), 3 deletions(-) create mode 100644 tests/system/small/operations/test_dates.py diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 923ec8c81d..38fbcfdef7 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -752,6 +752,21 @@ def timestamp_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerVal return x - y.to_interval("us") +@scalar_op_compiler.register_binary_op(ops.date_diff_op) +def date_diff_op_impl(x: ibis_types.DateValue, y: ibis_types.DateValue): + return (x.delta(y, "day") * UNIT_TO_US_CONVERSION_FACTORS["d"]).floor() + + +@scalar_op_compiler.register_binary_op(ops.date_add_op) +def date_add_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue): + return x + (y // UNIT_TO_US_CONVERSION_FACTORS["d"]).to_interval("day") + + +@scalar_op_compiler.register_binary_op(ops.date_sub_op) +def date_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue): + return x - (y // UNIT_TO_US_CONVERSION_FACTORS["d"]).to_interval("day") + + @scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp): supported_freqs = ["Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us", "ns"] diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index dad474e5a1..2113bbdf1c 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -135,6 +135,12 @@ def _rewrite_sub_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: if dtypes.is_datetime_like(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: return _TypedExpr.create_op_expr(ops.timestamp_sub_op, left, right) + if left.dtype == dtypes.DATE_DTYPE and right.dtype == dtypes.DATE_DTYPE: + return _TypedExpr.create_op_expr(ops.date_diff_op, left, right) + + if left.dtype == dtypes.DATE_DTYPE and right.dtype is dtypes.TIMEDELTA_DTYPE: + return _TypedExpr.create_op_expr(ops.date_sub_op, left, right) + return _TypedExpr.create_op_expr(ops.sub_op, left, right) @@ -147,6 +153,14 @@ def _rewrite_add_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: # always on the right. return _TypedExpr.create_op_expr(ops.timestamp_add_op, right, left) + if left.dtype == dtypes.DATE_DTYPE and right.dtype is dtypes.TIMEDELTA_DTYPE: + return _TypedExpr.create_op_expr(ops.date_add_op, left, right) + + if left.dtype is dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.DATE_DTYPE: + # Re-arrange operands such that date is always on the left and timedelta is + # always on the right. + return _TypedExpr.create_op_expr(ops.date_add_op, right, left) + return _TypedExpr.create_op_expr(ops.add_op, left, right) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index e4e4bf7ef3..34c4e551f8 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -39,6 +39,7 @@ ne_op, ) from bigframes.operations.date_ops import ( + date_diff_op, day_op, dayofweek_op, month_op, @@ -184,6 +185,8 @@ from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op from bigframes.operations.timedelta_ops import ( + date_add_op, + date_sub_op, timestamp_add_op, timestamp_sub_op, ToTimedeltaOp, @@ -248,6 +251,7 @@ "upper_op", "ZfillOp", # Date ops + "date_diff_op", "day_op", "month_op", "year_op", @@ -259,6 +263,8 @@ "second_op", "normalize_op", # Timedelta ops + "date_add_op", + "date_sub_op", "timestamp_add_op", "timestamp_sub_op", "ToTimedeltaOp", diff --git a/bigframes/operations/date_ops.py b/bigframes/operations/date_ops.py index 2b68a24caf..32d8eec118 100644 --- a/bigframes/operations/date_ops.py +++ b/bigframes/operations/date_ops.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import dataclasses +import typing + +from bigframes import dtypes from bigframes.operations import base_ops import bigframes.operations.type as op_typing @@ -39,3 +43,22 @@ name="quarter", type_signature=op_typing.DATELIKE_ACCESSOR, ) + + +@dataclasses.dataclass(frozen=True) +class DateDiffOp(base_ops.BinaryOp): + name: typing.ClassVar[str] = "date_diff" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is not input_types[1]: + raise TypeError( + f"two inputs have different types. left: {input_types[0]}, right: {input_types[1]}" + ) + + if input_types[0] != dtypes.DATE_DTYPE: + raise TypeError("expected date input") + + return dtypes.TIMEDELTA_DTYPE + + +date_diff_op = DateDiffOp() diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index f5a290bde5..453dad4335 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -123,12 +123,18 @@ def output_type(self, *input_types): # String addition return input_types[0] - # Timestamp addition. + # Temporal addition. if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: return left_type if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right_type): return right_type + if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: + return left_type + + if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.DATE_DTYPE: + return right_type + if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE @@ -155,9 +161,15 @@ def output_type(self, *input_types): if dtypes.is_datetime_like(left_type) and dtypes.is_datetime_like(right_type): return dtypes.TIMEDELTA_DTYPE + if left_type == dtypes.DATE_DTYPE and right_type == dtypes.DATE_DTYPE: + return dtypes.TIMEDELTA_DTYPE + if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: return left_type + if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: + return left_type + if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 689966e21b..8fb52bce48 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -60,6 +60,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT timestamp_add_op = TimestampAdd() +@dataclasses.dataclass(frozen=True) class TimestampSub(base_ops.BinaryOp): name: typing.ClassVar[str] = "timestamp_sub" @@ -77,3 +78,49 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT timestamp_sub_op = TimestampSub() + + +@dataclasses.dataclass(frozen=True) +class DateAddOp(base_ops.BinaryOp): + name: typing.ClassVar[str] = "date_add" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + # date + timedelta => date + if ( + input_types[0] == dtypes.DATE_DTYPE + and input_types[1] == dtypes.TIMEDELTA_DTYPE + ): + return dtypes.DATE_DTYPE + # timedelta + date => date + if ( + input_types[0] == dtypes.TIMEDELTA_DTYPE + and input_types[1] == dtypes.DATE_DTYPE + ): + return dtypes.DATE_DTYPE + + raise TypeError( + f"unsupported types for date_add. left: {input_types[0]} right: {input_types[1]}" + ) + + +date_add_op = DateAddOp() + + +@dataclasses.dataclass(frozen=True) +class DateSubOp(base_ops.BinaryOp): + name: typing.ClassVar[str] = "date_sub" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + # date - timedelta => date + if ( + input_types[0] == dtypes.DATE_DTYPE + and input_types[1] == dtypes.TIMEDELTA_DTYPE + ): + return dtypes.DATE_DTYPE + + raise TypeError( + f"unsupported types for date_sub. left: {input_types[0]} right: {input_types[1]}" + ) + + +date_sub_op = DateSubOp() diff --git a/tests/system/small/operations/test_dates.py b/tests/system/small/operations/test_dates.py new file mode 100644 index 0000000000..0084aa69e7 --- /dev/null +++ b/tests/system/small/operations/test_dates.py @@ -0,0 +1,61 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime + +import pandas as pd +import pandas.testing + +from bigframes import dtypes + + +def test_date_diff_between_series(session): + pd_df = pd.DataFrame( + { + "col_1": [datetime.date(2025, 1, 2), datetime.date(2025, 2, 1)], + "col_2": [datetime.date(2024, 1, 2), datetime.date(2026, 1, 30)], + } + ) + bf_df = session.read_pandas(pd_df) + + actual_result = (bf_df["col_1"] - bf_df["col_2"]).to_pandas() + + expected_result = (pd_df["col_1"] - pd_df["col_2"]).astype(dtypes.TIMEDELTA_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +def test_date_diff_literal_sub_series(scalars_dfs): + bf_df, pd_df = scalars_dfs + literal = datetime.date(2030, 5, 20) + + actual_result = (literal - bf_df["date_col"]).to_pandas() + + expected_result = (literal - pd_df["date_col"]).astype(dtypes.TIMEDELTA_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +def test_date_diff_series_sub_literal(scalars_dfs): + bf_df, pd_df = scalars_dfs + literal = datetime.date(1980, 5, 20) + + actual_result = (bf_df["date_col"] - literal).to_pandas() + + expected_result = (pd_df["date_col"] - literal).astype(dtypes.TIMEDELTA_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 356000b3f6..cf8ffe7bc9 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -19,6 +19,7 @@ import numpy as np import pandas as pd import pandas.testing +import pyarrow as pa import pytest from bigframes import dtypes @@ -38,14 +39,19 @@ def temporal_dfs(session): pd.Timestamp("2024-01-02 02:00:00", tz="UTC"), pd.Timestamp("2005-03-05 02:00:00", tz="UTC"), ], + "date_col": [ + datetime.date(2000, 1, 1), + datetime.date(2001, 2, 3), + datetime.date(2020, 9, 30), + ], "timedelta_col_1": [ pd.Timedelta(5, "s"), - pd.Timedelta(-4, "d"), + pd.Timedelta(-4, "m"), pd.Timedelta(5, "h"), ], "timedelta_col_2": [ pd.Timedelta(3, "s"), - pd.Timedelta(-4, "d"), + pd.Timedelta(-4, "m"), pd.Timedelta(6, "h"), ], "numeric_col": [1.5, 2, -3], @@ -365,6 +371,77 @@ def test_timestamp_sub_dataframes(temporal_dfs): ) +@pytest.mark.parametrize( + ("left_col", "right_col"), + [ + ("date_col", "timedelta_col_1"), + ("timedelta_col_1", "date_col"), + ], +) +def test_date_add__series_add_series(temporal_dfs, left_col, right_col): + bf_df, pd_df = temporal_dfs + + actual_result = (bf_df[left_col] + bf_df[right_col]).to_pandas() + + expected_result = (pd_df[left_col] + pd_df[right_col]).astype(dtypes.DATE_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +# Pandas does not support datetime.date literal + timedelta series +# so we don't test it here either. +def test_date_add__literal_add_series(temporal_dfs): + bf_df, pd_df = temporal_dfs + literal = pd.Timedelta(1, "d") + + actual_result = (literal + bf_df["date_col"]).to_pandas() + + expected_result = (literal + pd_df["date_col"]).astype(dtypes.DATE_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +# Pandas does not support timedelta series + datetime.date literal +# so we don't test it here either. +def test_date_add__series_add_literal(temporal_dfs): + bf_df, pd_df = temporal_dfs + literal = pd.Timedelta(1, "d") + + actual_result = (bf_df["date_col"] + literal).to_pandas() + + expected_result = (pd_df["date_col"] + literal).astype(dtypes.DATE_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +def test_date_sub__series_sub_series(temporal_dfs): + bf_df, pd_df = temporal_dfs + + actual_result = (bf_df["date_col"] - bf_df["timedelta_col_1"]).to_pandas() + + expected_result = (pd_df["date_col"] - pd_df["timedelta_col_1"]).astype( + dtypes.DATE_DTYPE + ) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +def test_date_sub__series_sub_literal(temporal_dfs): + bf_df, pd_df = temporal_dfs + literal = pd.Timedelta(1, "d") + + actual_result = (bf_df["date_col"] - literal).to_pandas() + + expected_result = (pd_df["date_col"] - literal).astype(dtypes.DATE_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + @pytest.mark.parametrize( "compare_func", [ From b50e998744269e435b3adab145ea04ee20659155 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 20 Feb 2025 23:02:00 +0000 Subject: [PATCH 02/15] fix format --- bigframes/operations/timedelta_ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 05c7ee1e3b..8e386a7dac 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -95,6 +95,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT f"unsupported types for timestamp_sub. left: {input_types[0]} right: {input_types[1]}" ) + timestamp_sub_op = TimestampSubOp() From 2c6c5e23b6defb5567b159e01277c59a47b55df6 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 21 Feb 2025 18:54:21 +0000 Subject: [PATCH 03/15] change implementation to emulate PyArrow --- bigframes/core/compile/scalar_op_compiler.py | 8 ++--- bigframes/operations/numeric_ops.py | 6 ++-- bigframes/operations/timedelta_ops.py | 12 ++++---- .../small/operations/test_timedeltas.py | 29 ++++++++++--------- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index b7d1c26d35..175461d53e 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -758,13 +758,13 @@ def date_diff_op_impl(x: ibis_types.DateValue, y: ibis_types.DateValue): @scalar_op_compiler.register_binary_op(ops.date_add_op) -def date_add_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue): - return x + (y // UNIT_TO_US_CONVERSION_FACTORS["d"]).to_interval("day") +def date_add_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): + return x.cast("timestamp") + y.to_interval("us") @scalar_op_compiler.register_binary_op(ops.date_sub_op) -def date_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue): - return x - (y // UNIT_TO_US_CONVERSION_FACTORS["d"]).to_interval("day") +def date_sub_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): + return x.cast("timestamp") - y.to_interval("us") @scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index 453dad4335..ae23aff707 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -130,10 +130,10 @@ def output_type(self, *input_types): return right_type if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: - return left_type + return dtypes.DATETIME_DTYPE if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.DATE_DTYPE: - return right_type + return dtypes.DATETIME_DTYPE if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE @@ -168,7 +168,7 @@ def output_type(self, *input_types): return left_type if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: - return left_type + return dtypes.DATETIME_DTYPE if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 8e386a7dac..b831e3f864 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -104,18 +104,18 @@ class DateAddOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "date_add" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - # date + timedelta => date + # date + timedelta => timestamp without timezone if ( input_types[0] == dtypes.DATE_DTYPE and input_types[1] == dtypes.TIMEDELTA_DTYPE ): - return dtypes.DATE_DTYPE - # timedelta + date => date + return dtypes.DATETIME_DTYPE + # timedelta + date => timestamp without timezone if ( input_types[0] == dtypes.TIMEDELTA_DTYPE and input_types[1] == dtypes.DATE_DTYPE ): - return dtypes.DATE_DTYPE + return dtypes.DATETIME_DTYPE raise TypeError( f"unsupported types for date_add. left: {input_types[0]} right: {input_types[1]}" @@ -130,12 +130,12 @@ class DateSubOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "date_sub" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - # date - timedelta => date + # date - timedelta => timestamp without timezone if ( input_types[0] == dtypes.DATE_DTYPE and input_types[1] == dtypes.TIMEDELTA_DTYPE ): - return dtypes.DATE_DTYPE + return dtypes.DATETIME_DTYPE raise TypeError( f"unsupported types for date_sub. left: {input_types[0]} right: {input_types[1]}" diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index cf8ffe7bc9..5f098f02af 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -39,11 +39,14 @@ def temporal_dfs(session): pd.Timestamp("2024-01-02 02:00:00", tz="UTC"), pd.Timestamp("2005-03-05 02:00:00", tz="UTC"), ], - "date_col": [ - datetime.date(2000, 1, 1), - datetime.date(2001, 2, 3), - datetime.date(2020, 9, 30), - ], + "date_col": pd.Series( + [ + datetime.date(2000, 1, 1), + datetime.date(2001, 2, 3), + datetime.date(2020, 9, 30), + ], + dtype=pd.ArrowDtype(pa.date32()), + ), "timedelta_col_1": [ pd.Timedelta(5, "s"), pd.Timedelta(-4, "m"), @@ -383,35 +386,33 @@ def test_date_add__series_add_series(temporal_dfs, left_col, right_col): actual_result = (bf_df[left_col] + bf_df[right_col]).to_pandas() - expected_result = (pd_df[left_col] + pd_df[right_col]).astype(dtypes.DATE_DTYPE) + expected_result = (pd_df[left_col] + pd_df[right_col]).astype(dtypes.DATETIME_DTYPE) pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) -# Pandas does not support datetime.date literal + timedelta series -# so we don't test it here either. +# Pandas does not support date literal + timedelta series so we don't test it here. def test_date_add__literal_add_series(temporal_dfs): bf_df, pd_df = temporal_dfs literal = pd.Timedelta(1, "d") actual_result = (literal + bf_df["date_col"]).to_pandas() - expected_result = (literal + pd_df["date_col"]).astype(dtypes.DATE_DTYPE) + expected_result = (literal + pd_df["date_col"]).astype(dtypes.DATETIME_DTYPE) pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) -# Pandas does not support timedelta series + datetime.date literal -# so we don't test it here either. +# Pandas does not support timedelta series + date literal so we don't test it here. def test_date_add__series_add_literal(temporal_dfs): bf_df, pd_df = temporal_dfs literal = pd.Timedelta(1, "d") actual_result = (bf_df["date_col"] + literal).to_pandas() - expected_result = (pd_df["date_col"] + literal).astype(dtypes.DATE_DTYPE) + expected_result = (pd_df["date_col"] + literal).astype(dtypes.DATETIME_DTYPE) pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -423,7 +424,7 @@ def test_date_sub__series_sub_series(temporal_dfs): actual_result = (bf_df["date_col"] - bf_df["timedelta_col_1"]).to_pandas() expected_result = (pd_df["date_col"] - pd_df["timedelta_col_1"]).astype( - dtypes.DATE_DTYPE + dtypes.DATETIME_DTYPE ) pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False @@ -436,7 +437,7 @@ def test_date_sub__series_sub_literal(temporal_dfs): actual_result = (bf_df["date_col"] - literal).to_pandas() - expected_result = (pd_df["date_col"] - literal).astype(dtypes.DATE_DTYPE) + expected_result = (pd_df["date_col"] - literal).astype(dtypes.DATETIME_DTYPE) pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) From 34f344b597e1596c57399d59af4fba4c563ed8c9 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 21 Feb 2025 19:05:59 +0000 Subject: [PATCH 04/15] fix mypy --- bigframes/core/compile/scalar_op_compiler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 175461d53e..3fa2246ccf 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -754,17 +754,17 @@ def timestamp_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerVal @scalar_op_compiler.register_binary_op(ops.date_diff_op) def date_diff_op_impl(x: ibis_types.DateValue, y: ibis_types.DateValue): - return (x.delta(y, "day") * UNIT_TO_US_CONVERSION_FACTORS["d"]).floor() + return (x.delta(y, "day") * UNIT_TO_US_CONVERSION_FACTORS["d"]).floor() # type: ignore @scalar_op_compiler.register_binary_op(ops.date_add_op) def date_add_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): - return x.cast("timestamp") + y.to_interval("us") + return x.cast("timestamp") + y.to_interval("us") # type: ignore @scalar_op_compiler.register_binary_op(ops.date_sub_op) def date_sub_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): - return x.cast("timestamp") - y.to_interval("us") + return x.cast("timestamp") - y.to_interval("us") # type: ignore @scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) From 72fa2e8f2d5cf2c56a13eaadc538f87ef73b60c9 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 21 Feb 2025 19:23:45 +0000 Subject: [PATCH 05/15] fix format --- bigframes/core/compile/scalar_op_compiler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 3fa2246ccf..267a5385d4 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -754,17 +754,17 @@ def timestamp_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerVal @scalar_op_compiler.register_binary_op(ops.date_diff_op) def date_diff_op_impl(x: ibis_types.DateValue, y: ibis_types.DateValue): - return (x.delta(y, "day") * UNIT_TO_US_CONVERSION_FACTORS["d"]).floor() # type: ignore + return (x.delta(y, "day") * UNIT_TO_US_CONVERSION_FACTORS["d"]).floor() # type: ignore @scalar_op_compiler.register_binary_op(ops.date_add_op) def date_add_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): - return x.cast("timestamp") + y.to_interval("us") # type: ignore + return x.cast("timestamp") + y.to_interval("us") # type: ignore @scalar_op_compiler.register_binary_op(ops.date_sub_op) def date_sub_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): - return x.cast("timestamp") - y.to_interval("us") # type: ignore + return x.cast("timestamp") - y.to_interval("us") # type: ignore @scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) From 59cb5ec2268f8acb7af9112dd9cbe25c1b72db59 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 21 Feb 2025 21:06:25 +0000 Subject: [PATCH 06/15] fix tests --- tests/system/small/operations/test_dates.py | 2 +- .../small/operations/test_timedeltas.py | 26 ++++++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tests/system/small/operations/test_dates.py b/tests/system/small/operations/test_dates.py index 0084aa69e7..01cbe69f64 100644 --- a/tests/system/small/operations/test_dates.py +++ b/tests/system/small/operations/test_dates.py @@ -26,7 +26,7 @@ def test_date_diff_between_series(session): "col_1": [datetime.date(2025, 1, 2), datetime.date(2025, 2, 1)], "col_2": [datetime.date(2024, 1, 2), datetime.date(2026, 1, 30)], } - ) + ).astype(dtypes.DATE_DTYPE) bf_df = session.read_pandas(pd_df) actual_result = (bf_df["col_1"] - bf_df["col_2"]).to_pandas() diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 5f098f02af..535fed5a24 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -47,16 +47,22 @@ def temporal_dfs(session): ], dtype=pd.ArrowDtype(pa.date32()), ), - "timedelta_col_1": [ - pd.Timedelta(5, "s"), - pd.Timedelta(-4, "m"), - pd.Timedelta(5, "h"), - ], - "timedelta_col_2": [ - pd.Timedelta(3, "s"), - pd.Timedelta(-4, "m"), - pd.Timedelta(6, "h"), - ], + "timedelta_col_1": pd.Series( + [ + pd.Timedelta(5, "s"), + pd.Timedelta(-4, "m"), + pd.Timedelta(5, "h"), + ], + dtype=dtypes.TIMEDELTA_DTYPE, + ), + "timedelta_col_2": pd.Series( + [ + pd.Timedelta(3, "s"), + pd.Timedelta(-4, "m"), + pd.Timedelta(6, "h"), + ], + dtype=dtypes.TIMEDELTA_DTYPE, + ), "numeric_col": [1.5, 2, -3], } ) From 558b7dd701079bc21d41ee9f24736ba12d6073ad Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 21 Feb 2025 23:09:49 +0000 Subject: [PATCH 07/15] fix more tests --- .../small/operations/test_timedeltas.py | 62 ++++++++++--------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 535fed5a24..6a1ffa1638 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -47,22 +47,16 @@ def temporal_dfs(session): ], dtype=pd.ArrowDtype(pa.date32()), ), - "timedelta_col_1": pd.Series( - [ - pd.Timedelta(5, "s"), - pd.Timedelta(-4, "m"), - pd.Timedelta(5, "h"), - ], - dtype=dtypes.TIMEDELTA_DTYPE, - ), - "timedelta_col_2": pd.Series( - [ - pd.Timedelta(3, "s"), - pd.Timedelta(-4, "m"), - pd.Timedelta(6, "h"), - ], - dtype=dtypes.TIMEDELTA_DTYPE, - ), + "timedelta_col_1": [ + pd.Timedelta(5, "s"), + pd.Timedelta(-4, "m"), + pd.Timedelta(5, "h"), + ], + "timedelta_col_2": [ + pd.Timedelta(3, "s"), + pd.Timedelta(-4, "m"), + pd.Timedelta(6, "h"), + ], "numeric_col": [1.5, 2, -3], } ) @@ -390,9 +384,13 @@ def test_timestamp_sub_dataframes(temporal_dfs): def test_date_add__series_add_series(temporal_dfs, left_col, right_col): bf_df, pd_df = temporal_dfs - actual_result = (bf_df[left_col] + bf_df[right_col]).to_pandas() + actual_result = ( + (bf_df[left_col] + bf_df[right_col]) + .to_pandas() + .astype("timestamp[ns][pyarrow]") + ) - expected_result = (pd_df[left_col] + pd_df[right_col]).astype(dtypes.DATETIME_DTYPE) + expected_result = pd_df[left_col] + pd_df[right_col] pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -403,9 +401,11 @@ def test_date_add__literal_add_series(temporal_dfs): bf_df, pd_df = temporal_dfs literal = pd.Timedelta(1, "d") - actual_result = (literal + bf_df["date_col"]).to_pandas() + actual_result = ( + (literal + bf_df["date_col"]).to_pandas().astype("timestamp[ns][pyarrow]") + ) - expected_result = (literal + pd_df["date_col"]).astype(dtypes.DATETIME_DTYPE) + expected_result = literal + pd_df["date_col"] pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -416,9 +416,11 @@ def test_date_add__series_add_literal(temporal_dfs): bf_df, pd_df = temporal_dfs literal = pd.Timedelta(1, "d") - actual_result = (bf_df["date_col"] + literal).to_pandas() + actual_result = ( + (bf_df["date_col"] + literal).to_pandas().astype("timestamp[ns][pyarrow]") + ) - expected_result = (pd_df["date_col"] + literal).astype(dtypes.DATETIME_DTYPE) + expected_result = pd_df["date_col"] + literal pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -427,11 +429,13 @@ def test_date_add__series_add_literal(temporal_dfs): def test_date_sub__series_sub_series(temporal_dfs): bf_df, pd_df = temporal_dfs - actual_result = (bf_df["date_col"] - bf_df["timedelta_col_1"]).to_pandas() - - expected_result = (pd_df["date_col"] - pd_df["timedelta_col_1"]).astype( - dtypes.DATETIME_DTYPE + actual_result = ( + (bf_df["date_col"] - bf_df["timedelta_col_1"]) + .to_pandas() + .astype("timestamp[ns][pyarrow]") ) + + expected_result = pd_df["date_col"] - pd_df["timedelta_col_1"] pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -441,9 +445,11 @@ def test_date_sub__series_sub_literal(temporal_dfs): bf_df, pd_df = temporal_dfs literal = pd.Timedelta(1, "d") - actual_result = (bf_df["date_col"] - literal).to_pandas() + actual_result = ( + (bf_df["date_col"] - literal).to_pandas().astype("timestamp[ns][pyarrow]") + ) - expected_result = (pd_df["date_col"] - literal).astype(dtypes.DATETIME_DTYPE) + expected_result = pd_df["date_col"] - literal pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) From 0113cd1baa73b6d3b0acf8b250c7e47c9cc7a741 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 24 Feb 2025 19:13:52 +0000 Subject: [PATCH 08/15] bump pyarrow dependency to 15.0.2 --- setup.py | 2 +- testing/constraints-3.9.txt | 2 +- .../small/operations/test_timedeltas.py | 36 +++++++------------ 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/setup.py b/setup.py index 1f6114b634..9ea563b3cb 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ "numpy >=1.24.0", "pandas >=1.5.3", "pandas-gbq >=0.26.0", - "pyarrow >=10.0.1", + "pyarrow >=15.0.2", "pydata-google-auth >=1.8.2", "requests >=2.27.1", "sqlglot >=23.6.3", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 30d5c1c3a7..b355e0915b 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -16,7 +16,7 @@ jellyfish==0.8.9 numpy==1.24.0 pandas==1.5.3 pandas-gbq==0.26.0 -pyarrow==10.0.1 +pyarrow==15.0.2 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 6a1ffa1638..5f098f02af 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -384,13 +384,9 @@ def test_timestamp_sub_dataframes(temporal_dfs): def test_date_add__series_add_series(temporal_dfs, left_col, right_col): bf_df, pd_df = temporal_dfs - actual_result = ( - (bf_df[left_col] + bf_df[right_col]) - .to_pandas() - .astype("timestamp[ns][pyarrow]") - ) + actual_result = (bf_df[left_col] + bf_df[right_col]).to_pandas() - expected_result = pd_df[left_col] + pd_df[right_col] + expected_result = (pd_df[left_col] + pd_df[right_col]).astype(dtypes.DATETIME_DTYPE) pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -401,11 +397,9 @@ def test_date_add__literal_add_series(temporal_dfs): bf_df, pd_df = temporal_dfs literal = pd.Timedelta(1, "d") - actual_result = ( - (literal + bf_df["date_col"]).to_pandas().astype("timestamp[ns][pyarrow]") - ) + actual_result = (literal + bf_df["date_col"]).to_pandas() - expected_result = literal + pd_df["date_col"] + expected_result = (literal + pd_df["date_col"]).astype(dtypes.DATETIME_DTYPE) pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -416,11 +410,9 @@ def test_date_add__series_add_literal(temporal_dfs): bf_df, pd_df = temporal_dfs literal = pd.Timedelta(1, "d") - actual_result = ( - (bf_df["date_col"] + literal).to_pandas().astype("timestamp[ns][pyarrow]") - ) + actual_result = (bf_df["date_col"] + literal).to_pandas() - expected_result = pd_df["date_col"] + literal + expected_result = (pd_df["date_col"] + literal).astype(dtypes.DATETIME_DTYPE) pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -429,13 +421,11 @@ def test_date_add__series_add_literal(temporal_dfs): def test_date_sub__series_sub_series(temporal_dfs): bf_df, pd_df = temporal_dfs - actual_result = ( - (bf_df["date_col"] - bf_df["timedelta_col_1"]) - .to_pandas() - .astype("timestamp[ns][pyarrow]") - ) + actual_result = (bf_df["date_col"] - bf_df["timedelta_col_1"]).to_pandas() - expected_result = pd_df["date_col"] - pd_df["timedelta_col_1"] + expected_result = (pd_df["date_col"] - pd_df["timedelta_col_1"]).astype( + dtypes.DATETIME_DTYPE + ) pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -445,11 +435,9 @@ def test_date_sub__series_sub_literal(temporal_dfs): bf_df, pd_df = temporal_dfs literal = pd.Timedelta(1, "d") - actual_result = ( - (bf_df["date_col"] - literal).to_pandas().astype("timestamp[ns][pyarrow]") - ) + actual_result = (bf_df["date_col"] - literal).to_pandas() - expected_result = pd_df["date_col"] - literal + expected_result = (pd_df["date_col"] - literal).astype(dtypes.DATETIME_DTYPE) pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) From 6bce096e0b4351b2f5e4ee6837a523cdc60dfd77 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 24 Feb 2025 22:36:19 +0000 Subject: [PATCH 09/15] raise pandas version in system-3.12 tests --- testing/constraints-3.12.txt | 1 + tests/system/small/operations/test_timedeltas.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/testing/constraints-3.12.txt b/testing/constraints-3.12.txt index e69de29bb2..521fb5dc6a 100644 --- a/testing/constraints-3.12.txt +++ b/testing/constraints-3.12.txt @@ -0,0 +1 @@ +pandas==2.1.0 \ No newline at end of file diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 5f098f02af..238e4a003f 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -23,6 +23,7 @@ import pytest from bigframes import dtypes +from packaging import version @pytest.fixture(scope="module") @@ -382,6 +383,9 @@ def test_timestamp_sub_dataframes(temporal_dfs): ], ) def test_date_add__series_add_series(temporal_dfs, left_col, right_col): + if version.Version(pd.__version__) < version.Version("2.1.0"): + pytest.skip("not supported by Pandas < 2.1.0") + bf_df, pd_df = temporal_dfs actual_result = (bf_df[left_col] + bf_df[right_col]).to_pandas() @@ -419,6 +423,9 @@ def test_date_add__series_add_literal(temporal_dfs): def test_date_sub__series_sub_series(temporal_dfs): + if version.Version(pd.__version__) < version.Version("2.1.0"): + pytest.skip("not supported by Pandas < 2.1.0") + bf_df, pd_df = temporal_dfs actual_result = (bf_df["date_col"] - bf_df["timedelta_col_1"]).to_pandas() From 281f4636365bd075e7cf914cee8bd4c94c98a70d Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 24 Feb 2025 22:39:21 +0000 Subject: [PATCH 10/15] fix format --- tests/system/small/operations/test_timedeltas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 238e4a003f..c1e373a4ab 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -17,13 +17,13 @@ import operator import numpy as np +from packaging import version import pandas as pd import pandas.testing import pyarrow as pa import pytest from bigframes import dtypes -from packaging import version @pytest.fixture(scope="module") @@ -425,7 +425,7 @@ def test_date_add__series_add_literal(temporal_dfs): def test_date_sub__series_sub_series(temporal_dfs): if version.Version(pd.__version__) < version.Version("2.1.0"): pytest.skip("not supported by Pandas < 2.1.0") - + bf_df, pd_df = temporal_dfs actual_result = (bf_df["date_col"] - bf_df["timedelta_col_1"]).to_pandas() From 45a4706a1923884ec5aa6609ac4f2aa2524ec242 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 24 Feb 2025 22:44:30 +0000 Subject: [PATCH 11/15] restore constraints-3.12 --- testing/constraints-3.12.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/testing/constraints-3.12.txt b/testing/constraints-3.12.txt index 521fb5dc6a..e69de29bb2 100644 --- a/testing/constraints-3.12.txt +++ b/testing/constraints-3.12.txt @@ -1 +0,0 @@ -pandas==2.1.0 \ No newline at end of file From 3868eb7a9955099555ba716de660eea8e77acf5b Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 24 Feb 2025 23:45:13 +0000 Subject: [PATCH 12/15] raise pandas version to 2.2.0 in 3.12 testing env --- testing/constraints-3.12.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/testing/constraints-3.12.txt b/testing/constraints-3.12.txt index e69de29bb2..3105b202df 100644 --- a/testing/constraints-3.12.txt +++ b/testing/constraints-3.12.txt @@ -0,0 +1 @@ +pandas==2.2.0 \ No newline at end of file From 90c7d92fc429160fcd4b57b214706caa2bdda0ca Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 25 Feb 2025 18:13:38 +0000 Subject: [PATCH 13/15] remove pandas constraints for 3.12 --- testing/constraints-3.12.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/testing/constraints-3.12.txt b/testing/constraints-3.12.txt index 3105b202df..e69de29bb2 100644 --- a/testing/constraints-3.12.txt +++ b/testing/constraints-3.12.txt @@ -1 +0,0 @@ -pandas==2.2.0 \ No newline at end of file From d7f2dbd3e6453dee10e7d62e10ea68f9e0ceaa59 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 25 Feb 2025 23:11:56 +0000 Subject: [PATCH 14/15] fix merge error --- tests/system/small/operations/test_dates.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/system/small/operations/test_dates.py b/tests/system/small/operations/test_dates.py index 799886edbc..e183bbfe43 100644 --- a/tests/system/small/operations/test_dates.py +++ b/tests/system/small/operations/test_dates.py @@ -57,6 +57,9 @@ def test_date_diff_series_sub_literal(scalars_dfs): actual_result = (bf_df["date_col"] - literal).to_pandas() expected_result = (pd_df["date_col"] - literal).astype(dtypes.TIMEDELTA_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) def test_date_series_diff_agg(scalars_dfs): From f86b5f2ff67df492107a60181e25ef828120cb9f Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 26 Feb 2025 20:40:00 +0000 Subject: [PATCH 15/15] cast factor to int instead of calling floor() --- bigframes/core/compile/scalar_op_compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index cf556909d1..6e9b961971 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -742,7 +742,7 @@ def timestamp_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerVal @scalar_op_compiler.register_binary_op(ops.date_diff_op) def date_diff_op_impl(x: ibis_types.DateValue, y: ibis_types.DateValue): - return (x.delta(y, "day") * UNIT_TO_US_CONVERSION_FACTORS["d"]).floor() # type: ignore + return x.delta(y, "day") * int(UNIT_TO_US_CONVERSION_FACTORS["d"]) # type: ignore @scalar_op_compiler.register_binary_op(ops.date_add_op)