diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index 9af6a5c0b9..6e9b961971 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -740,6 +740,21 @@ def timestamp_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerVal
     return x - y.to_interval("us")
 
 
+@scalar_op_compiler.register_binary_op(ops.date_diff_op)
+def date_diff_op_impl(x: ibis_types.DateValue, y: ibis_types.DateValue):  # day delta scaled by the "d" factor (presumably microseconds per day)
+    return x.delta(y, "day") * int(UNIT_TO_US_CONVERSION_FACTORS["d"])  # type: ignore
+
+
+@scalar_op_compiler.register_binary_op(ops.date_add_op)
+def date_add_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue):  # date is cast to timestamp; y is added as a "us" interval
+    return x.cast("timestamp") + y.to_interval("us")  # type: ignore
+
+
+@scalar_op_compiler.register_binary_op(ops.date_sub_op)
+def date_sub_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue):  # date is cast to timestamp; y is subtracted as a "us" interval
+    return x.cast("timestamp") - y.to_interval("us")  # type: ignore
+
+
 @scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True)
 def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp):
     supported_freqs = ["Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us", "ns"]
diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py
index 345a57ab89..bf3c0ee639 100644
--- a/bigframes/core/rewrite/timedeltas.py
+++ b/bigframes/core/rewrite/timedeltas.py
@@ -151,6 +151,12 @@ def _rewrite_sub_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr:
     if dtypes.is_datetime_like(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE:
         return _TypedExpr.create_op_expr(ops.timestamp_sub_op, left, right)
 
+    if left.dtype == dtypes.DATE_DTYPE and right.dtype == dtypes.DATE_DTYPE:  # date - date
+        return _TypedExpr.create_op_expr(ops.date_diff_op, left, right)
+
+    if left.dtype == dtypes.DATE_DTYPE and right.dtype is dtypes.TIMEDELTA_DTYPE:  # date - timedelta
+        return _TypedExpr.create_op_expr(ops.date_sub_op, left, right)
+
     return _TypedExpr.create_op_expr(ops.sub_op, left, right)
 
 
@@ -163,6 +169,14 @@ def _rewrite_add_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr:
         # always on the right.
         return _TypedExpr.create_op_expr(ops.timestamp_add_op, right, left)
 
+    if left.dtype == dtypes.DATE_DTYPE and right.dtype is dtypes.TIMEDELTA_DTYPE:  # date + timedelta
+        return _TypedExpr.create_op_expr(ops.date_add_op, left, right)
+
+    if left.dtype is dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.DATE_DTYPE:
+        # Re-arrange operands such that date is always on the left and timedelta is
+        # always on the right.
+        return _TypedExpr.create_op_expr(ops.date_add_op, right, left)
+
     return _TypedExpr.create_op_expr(ops.add_op, left, right)
 
 
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index 7e6f1f793c..7128c10bb2 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -39,6 +39,7 @@
     ne_op,
 )
 from bigframes.operations.date_ops import (
+    date_diff_op,
     day_op,
     dayofweek_op,
     month_op,
@@ -184,6 +185,8 @@
 from bigframes.operations.struct_ops import StructFieldOp, StructOp
 from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op
 from bigframes.operations.timedelta_ops import (
+    date_add_op,
+    date_sub_op,
     timedelta_floor_op,
     timestamp_add_op,
     timestamp_sub_op,
@@ -249,6 +252,7 @@
     "upper_op",
     "ZfillOp",
     # Date ops
+    "date_diff_op",
     "day_op",
     "month_op",
     "year_op",
@@ -260,6 +264,8 @@
     "second_op",
     "normalize_op",
     # Timedelta ops
+    "date_add_op",
+    "date_sub_op",
     "timedelta_floor_op",
     "timestamp_add_op",
     "timestamp_sub_op",
diff --git a/bigframes/operations/date_ops.py b/bigframes/operations/date_ops.py
index 2b68a24caf..32d8eec118 100644
--- a/bigframes/operations/date_ops.py
+++ b/bigframes/operations/date_ops.py
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import dataclasses
+import typing
+
+from bigframes import dtypes
 from bigframes.operations import base_ops
 import bigframes.operations.type as op_typing
 
@@ -39,3 +43,32 @@
     name="quarter",
     type_signature=op_typing.DATELIKE_ACCESSOR,
 )
+
+
+@dataclasses.dataclass(frozen=True)
+class DateDiffOp(base_ops.BinaryOp):
+    """Binary op for ``date - date``; the result type is timedelta."""
+
+    name: typing.ClassVar[str] = "date_diff"
+
+    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
+        """Return ``TIMEDELTA_DTYPE`` when both inputs are ``DATE_DTYPE``.
+
+        Raises:
+            TypeError: If the two input types differ, or are not dates.
+        """
+        # Compare by equality, not identity: the other date checks in this
+        # change use ``==``/``!=`` against ``DATE_DTYPE``, and two equal dtype
+        # objects are not guaranteed to be the same instance.
+        if input_types[0] != input_types[1]:
+            raise TypeError(
+                f"two inputs have different types. left: {input_types[0]}, right: {input_types[1]}"
+            )
+
+        if input_types[0] != dtypes.DATE_DTYPE:
+            raise TypeError("expected date input")
+
+        return dtypes.TIMEDELTA_DTYPE
+
+
+date_diff_op = DateDiffOp()
diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py
index f5a290bde5..ae23aff707 100644
--- a/bigframes/operations/numeric_ops.py
+++ b/bigframes/operations/numeric_ops.py
@@ -123,12 +123,18 @@ def output_type(self, *input_types):
             # String addition
             return input_types[0]
 
-        # Timestamp addition.
+        # Temporal addition.
         if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE:
             return left_type
         if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right_type):
             return right_type
 
+        if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE:
+            return dtypes.DATETIME_DTYPE
+
+        if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.DATE_DTYPE:
+            return dtypes.DATETIME_DTYPE
+
         if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE:
             return dtypes.TIMEDELTA_DTYPE
 
@@ -155,9 +161,15 @@ def output_type(self, *input_types):
         if dtypes.is_datetime_like(left_type) and dtypes.is_datetime_like(right_type):
             return dtypes.TIMEDELTA_DTYPE
 
+        if left_type == dtypes.DATE_DTYPE and right_type == dtypes.DATE_DTYPE:
+            return dtypes.TIMEDELTA_DTYPE
+
         if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE:
             return left_type
 
+        if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE:
+            return dtypes.DATETIME_DTYPE
+
         if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE:
             return dtypes.TIMEDELTA_DTYPE
 
diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py
index 364154f728..b831e3f864 100644
--- a/bigframes/operations/timedelta_ops.py
+++ b/bigframes/operations/timedelta_ops.py
@@ -79,6 +79,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
 timestamp_add_op = TimestampAddOp()
 
 
+@dataclasses.dataclass(frozen=True)
 class TimestampSubOp(base_ops.BinaryOp):
     name: typing.ClassVar[str] = "timestamp_sub"
 
@@ -96,3 +97,53 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
 
 
 timestamp_sub_op = TimestampSubOp()
+
+
+@dataclasses.dataclass(frozen=True)
+class DateAddOp(base_ops.BinaryOp):
+    """Binary op for date + timedelta in either operand order; yields ``DATETIME_DTYPE``."""
+
+    name: typing.ClassVar[str] = "date_add"
+
+    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
+        # date + timedelta => timestamp without timezone
+        if (
+            input_types[0] == dtypes.DATE_DTYPE
+            and input_types[1] == dtypes.TIMEDELTA_DTYPE
+        ):
+            return dtypes.DATETIME_DTYPE
+        # timedelta + date => timestamp without timezone
+        if (
+            input_types[0] == dtypes.TIMEDELTA_DTYPE
+            and input_types[1] == dtypes.DATE_DTYPE
+        ):
+            return dtypes.DATETIME_DTYPE
+
+        raise TypeError(
+            f"unsupported types for date_add. left: {input_types[0]} right: {input_types[1]}"
+        )
+
+
+date_add_op = DateAddOp()
+
+
+@dataclasses.dataclass(frozen=True)
+class DateSubOp(base_ops.BinaryOp):
+    """Binary op for date - timedelta; yields ``DATETIME_DTYPE``."""
+
+    name: typing.ClassVar[str] = "date_sub"
+
+    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
+        # date - timedelta => timestamp without timezone
+        if (
+            input_types[0] == dtypes.DATE_DTYPE
+            and input_types[1] == dtypes.TIMEDELTA_DTYPE
+        ):
+            return dtypes.DATETIME_DTYPE
+
+        raise TypeError(
+            f"unsupported types for date_sub. left: {input_types[0]} right: {input_types[1]}"
+        )
+
+
+date_sub_op = DateSubOp()
diff --git a/setup.py b/setup.py
index 1f6114b634..9ea563b3cb 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,7 @@
     "numpy >=1.24.0",
     "pandas >=1.5.3",
     "pandas-gbq >=0.26.0",
-    "pyarrow >=10.0.1",
+    "pyarrow >=15.0.2",
     "pydata-google-auth >=1.8.2",
     "requests >=2.27.1",
     "sqlglot >=23.6.3",
diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt
index 30d5c1c3a7..b355e0915b 100644
--- a/testing/constraints-3.9.txt
+++ b/testing/constraints-3.9.txt
@@ -16,7 +16,7 @@ jellyfish==0.8.9
 numpy==1.24.0
 pandas==1.5.3
 pandas-gbq==0.26.0
-pyarrow==10.0.1
+pyarrow==15.0.2
 pydata-google-auth==1.8.2
 requests==2.27.1
 scikit-learn==1.2.2
diff --git a/tests/system/small/operations/test_dates.py b/tests/system/small/operations/test_dates.py
index f957879d8b..e183bbfe43 100644
--- a/tests/system/small/operations/test_dates.py
+++ b/tests/system/small/operations/test_dates.py
@@ -12,11 +12,56 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
+import datetime
+
+import pandas as pd
 import pandas.testing
 
 from bigframes import dtypes
 
 
+def test_date_diff_between_series(session):  # date series - date series, compared with pandas
+    pd_df = pd.DataFrame(
+        {
+            "col_1": [datetime.date(2025, 1, 2), datetime.date(2025, 2, 1)],
+            "col_2": [datetime.date(2024, 1, 2), datetime.date(2026, 1, 30)],
+        }
+    ).astype(dtypes.DATE_DTYPE)
+    bf_df = session.read_pandas(pd_df)
+
+    actual_result = (bf_df["col_1"] - bf_df["col_2"]).to_pandas()
+
+    expected_result = (pd_df["col_1"] - pd_df["col_2"]).astype(dtypes.TIMEDELTA_DTYPE)
+    pandas.testing.assert_series_equal(
+        actual_result, expected_result, check_index_type=False
+    )
+
+
+def test_date_diff_literal_sub_series(scalars_dfs):  # date literal - date series
+    bf_df, pd_df = scalars_dfs
+    literal = datetime.date(2030, 5, 20)
+
+    actual_result = (literal - bf_df["date_col"]).to_pandas()
+
+    expected_result = (literal - pd_df["date_col"]).astype(dtypes.TIMEDELTA_DTYPE)
+    pandas.testing.assert_series_equal(
+        actual_result, expected_result, check_index_type=False
+    )
+
+
+def test_date_diff_series_sub_literal(scalars_dfs):  # date series - date literal
+    bf_df, pd_df = scalars_dfs
+    literal = datetime.date(1980, 5, 20)
+
+    actual_result = (bf_df["date_col"] - literal).to_pandas()
+
+    expected_result = (pd_df["date_col"] - literal).astype(dtypes.TIMEDELTA_DTYPE)
+    pandas.testing.assert_series_equal(
+        actual_result, expected_result, check_index_type=False
+    )
+
+
 def test_date_series_diff_agg(scalars_dfs):
     bf_df, pd_df = scalars_dfs
 
diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py
index 723481b1d1..53cb5f7419 100644
--- a/tests/system/small/operations/test_timedeltas.py
+++ b/tests/system/small/operations/test_timedeltas.py
@@ -17,8 +17,10 @@
 import operator
 
 import numpy as np
+from packaging import version
 import pandas as pd
 import pandas.testing
+import pyarrow as pa
 import pytest
 
 from bigframes import dtypes
@@ -38,14 +40,22 @@ def temporal_dfs(session):
                 pd.Timestamp("2024-01-02 02:00:00", tz="UTC"),
                 pd.Timestamp("2005-03-05 02:00:00", tz="UTC"),
             ],
+            "date_col": pd.Series(
+                [
+                    datetime.date(2000, 1, 1),
+                    datetime.date(2001, 2, 3),
+                    datetime.date(2020, 9, 30),
+                ],
+                dtype=pd.ArrowDtype(pa.date32()),  # Arrow date32-backed date column
+            ),
             "timedelta_col_1": [
                 pd.Timedelta(5, "s"),
-                pd.Timedelta(-4, "d"),
+                pd.Timedelta(-4, "m"),
                 pd.Timedelta(5, "h"),
             ],
             "timedelta_col_2": [
                 pd.Timedelta(3, "s"),
-                pd.Timedelta(-4, "d"),
+                pd.Timedelta(-4, "m"),
                 pd.Timedelta(6, "h"),
             ],
             "numeric_col": [1.5, 2, -3],
@@ -365,6 +375,81 @@ def test_timestamp_sub_dataframes(temporal_dfs):
     )
 
 
+@pytest.mark.parametrize(
+    ("left_col", "right_col"),
+    [
+        ("date_col", "timedelta_col_1"),
+        ("timedelta_col_1", "date_col"),
+    ],
+)
+def test_date_add__series_add_series(temporal_dfs, left_col, right_col):  # date + timedelta series, both operand orders
+    if version.Version(pd.__version__) < version.Version("2.1.0"):
+        pytest.skip("not supported by Pandas < 2.1.0")
+
+    bf_df, pd_df = temporal_dfs
+
+    actual_result = (bf_df[left_col] + bf_df[right_col]).to_pandas()
+
+    expected_result = (pd_df[left_col] + pd_df[right_col]).astype(dtypes.DATETIME_DTYPE)
+    pandas.testing.assert_series_equal(
+        actual_result, expected_result, check_index_type=False
+    )
+
+
+# Pandas does not support date literal + timedelta series so we don't test it here.
+def test_date_add__literal_add_series(temporal_dfs):  # timedelta literal + date series
+    bf_df, pd_df = temporal_dfs
+    literal = pd.Timedelta(1, "d")
+
+    actual_result = (literal + bf_df["date_col"]).to_pandas()
+
+    expected_result = (literal + pd_df["date_col"]).astype(dtypes.DATETIME_DTYPE)
+    pandas.testing.assert_series_equal(
+        actual_result, expected_result, check_index_type=False
+    )
+
+
+# Pandas does not support timedelta series + date literal so we don't test it here.
+def test_date_add__series_add_literal(temporal_dfs):  # date series + timedelta literal
+    bf_df, pd_df = temporal_dfs
+    literal = pd.Timedelta(1, "d")
+
+    actual_result = (bf_df["date_col"] + literal).to_pandas()
+
+    expected_result = (pd_df["date_col"] + literal).astype(dtypes.DATETIME_DTYPE)
+    pandas.testing.assert_series_equal(
+        actual_result, expected_result, check_index_type=False
+    )
+
+
+def test_date_sub__series_sub_series(temporal_dfs):  # date series - timedelta series
+    if version.Version(pd.__version__) < version.Version("2.1.0"):
+        pytest.skip("not supported by Pandas < 2.1.0")
+
+    bf_df, pd_df = temporal_dfs
+
+    actual_result = (bf_df["date_col"] - bf_df["timedelta_col_1"]).to_pandas()
+
+    expected_result = (pd_df["date_col"] - pd_df["timedelta_col_1"]).astype(
+        dtypes.DATETIME_DTYPE
+    )
+    pandas.testing.assert_series_equal(
+        actual_result, expected_result, check_index_type=False
+    )
+
+
+def test_date_sub__series_sub_literal(temporal_dfs):  # date series - timedelta literal
+    bf_df, pd_df = temporal_dfs
+    literal = pd.Timedelta(1, "d")
+
+    actual_result = (bf_df["date_col"] - literal).to_pandas()
+
+    expected_result = (pd_df["date_col"] - literal).astype(dtypes.DATETIME_DTYPE)
+    pandas.testing.assert_series_equal(
+        actual_result, expected_result, check_index_type=False
+    )
+
+
 @pytest.mark.parametrize(
     "compare_func",
     [