From 9e467ae419b7636a867cd725d4ff0834edc805a6 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sat, 11 Nov 2023 00:30:23 +0000 Subject: [PATCH] fix: all sort operation are now stable --- bigframes/core/__init__.py | 8 ++--- bigframes/core/block_transforms.py | 4 +-- bigframes/core/blocks.py | 4 +-- bigframes/core/compile/compiled.py | 6 ++-- bigframes/core/compile/compiler.py | 2 +- bigframes/core/groupby/__init__.py | 4 --- bigframes/core/nodes.py | 1 - bigframes/core/ordering.py | 47 ++++++++++++++---------------- bigframes/dataframe.py | 4 +-- bigframes/series.py | 7 +---- 10 files changed, 32 insertions(+), 55 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index b640692bc8..b476961bdc 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -200,12 +200,8 @@ def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: ) ) - def order_by( - self, by: Sequence[OrderingColumnReference], stable: bool = False - ) -> ArrayValue: - return ArrayValue( - nodes.OrderByNode(child=self.node, by=tuple(by), stable=stable) - ) + def order_by(self, by: Sequence[OrderingColumnReference]) -> ArrayValue: + return ArrayValue(nodes.OrderByNode(child=self.node, by=tuple(by))) def reversed(self) -> ArrayValue: return ArrayValue(nodes.ReversedNode(child=self.node)) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index e095f21f6b..ce0fdd219a 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -509,7 +509,7 @@ def nsmallest( ) for col_id in column_ids ] - block = block.order_by(order_refs, stable=True) + block = block.order_by(order_refs) if keep in ("first", "last"): return block.slice(0, n) else: # keep == "all": @@ -541,7 +541,7 @@ def nlargest( ) for col_id in column_ids ] - block = block.order_by(order_refs, stable=True) + block = block.order_by(order_refs) if keep in ("first", "last"): return block.slice(0, n) else: # keep == "all": diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 6358d28e2e..f1113d938e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -235,10 +235,9 @@ def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]: def order_by( self, by: typing.Sequence[ordering.OrderingColumnReference], - stable: bool = False, ) -> Block: return Block( - self._expr.order_by(by, stable=stable), + self._expr.order_by(by), index_columns=self.index_columns, column_labels=self.column_labels, index_labels=self.index.names, @@ -1596,7 +1595,6 @@ def merge( # sort uses coalesced join keys always joined_expr = joined_expr.order_by( [ordering.OrderingColumnReference(col_id) for col_id in coalesced_ids], - stable=True, ) joined_expr = joined_expr.select_columns(result_columns) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 4ba5e6bd08..78050ed4f0 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -753,11 +753,9 @@ def builder(self) -> OrderedIR.Builder: predicates=self._predicates, ) - def order_by( - self, by: Sequence[OrderingColumnReference], stable: bool = False - ) -> OrderedIR: + def order_by(self, by: Sequence[OrderingColumnReference]) -> OrderedIR: expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable) + expr_builder.ordering = self._ordering.with_ordering_columns(by) return expr_builder.build() def reversed(self) -> OrderedIR: diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 662e73a433..39892635f1 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -129,7 +129,7 @@ def compile_filter(node: nodes.FilterNode, ordered: bool = True): @_compile_node.register def compile_orderby(node: nodes.OrderByNode, ordered: bool = True): if ordered: - return compile_ordered(node.child).order_by(node.by, node.stable) + return compile_ordered(node.child).order_by(node.by) else: return compile_unordered(node.child) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 2a19a83dd5..22ef11dd19 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -217,7 +217,6 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: ) block = self._block.order_by( [order.OrderingColumnReference(col) for col in self._by_col_ids], - stable=True, ) return windows.Window( block, window_spec, self._selected_cols, drop_null_groups=self._dropna @@ -231,7 +230,6 @@ def expanding(self, min_periods: int = 1) -> windows.Window: ) block = self._block.order_by( [order.OrderingColumnReference(col) for col in self._by_col_ids], - stable=True, ) return windows.Window( block, window_spec, self._selected_cols, drop_null_groups=self._dropna @@ -552,7 +550,6 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: ) block = self._block.order_by( [order.OrderingColumnReference(col) for col in self._by_col_ids], - stable=True, ) return windows.Window( block, @@ -570,7 +567,6 @@ def expanding(self, min_periods: int = 1) -> windows.Window: ) block = self._block.order_by( [order.OrderingColumnReference(col) for col in self._by_col_ids], - stable=True, ) return windows.Window( block, diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 8f1e2e5e73..050d356239 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -145,7 +145,6 @@ class FilterNode(UnaryNode): @dataclass(frozen=True) class OrderByNode(UnaryNode): by: Tuple[OrderingColumnReference, ...] - stable: bool = False @dataclass(frozen=True) diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py index 2cecd2fe7b..3ab89e0213 100644 --- a/bigframes/core/ordering.py +++ b/bigframes/core/ordering.py @@ -28,8 +28,6 @@ # Sufficient to store any value up to 2^63 DEFAULT_ORDERING_ID_LENGTH: int = math.ceil(63 * math.log(2, ORDERING_ID_STRING_BASE)) -STABLE_SORTS = ["mergesort", "stable"] - class OrderingDirection(Enum): ASC = 1 @@ -113,17 +111,12 @@ def with_non_sequential(self): def with_ordering_columns( self, ordering_value_columns: Sequence[OrderingColumnReference] = (), - stable: bool = False, ) -> ExpressionOrdering: """Creates a new ordering that reorders by the given columns. Args: ordering_value_columns: In decreasing precedence order, the values used to sort the ordering - stable: - If True, will use apply a stable sorting, using the old ordering where - the new ordering produces ties. Otherwise, ties will be resolved in - a performance maximizing way, Returns: Modified ExpressionOrdering @@ -131,29 +124,33 @@ def with_ordering_columns( col_ids_new = [ ordering_ref.column_id for ordering_ref in ordering_value_columns ] - if stable: - # Only reference each column once, so discard old referenc if there is a new reference - old_ordering_keep = [ - ordering_ref - for ordering_ref in self.ordering_value_columns - if ordering_ref.column_id not in col_ids_new - ] - else: - # New ordering needs to keep all total ordering columns no matter what. - # All other old ordering references can be discarded as does not need - # to be a stable sort. - old_ordering_keep = [ - ordering_ref - for ordering_ref in self.ordering_value_columns - if (ordering_ref.column_id not in col_ids_new) - and (ordering_ref.column_id in self.total_ordering_columns) - ] - new_ordering = (*ordering_value_columns, *old_ordering_keep) + old_ordering_keep = [ + ordering_ref + for ordering_ref in self.ordering_value_columns + if ordering_ref.column_id not in col_ids_new + ] + + # Truncate to remove any unneded col references after all total order cols included + new_ordering = self._truncate_ordering( + (*ordering_value_columns, *old_ordering_keep) + ) return ExpressionOrdering( new_ordering, total_ordering_columns=self.total_ordering_columns, ) + def _truncate_ordering( + self, order_refs: tuple[OrderingColumnReference, ...] + ) -> tuple[OrderingColumnReference, ...]: + total_order_cols_remaining = set(self.total_ordering_columns) + for i in range(len(order_refs)): + column = order_refs[i].column_id + if column in total_order_cols_remaining: + total_order_cols_remaining.remove(column) + if len(total_order_cols_remaining) == 0: + return order_refs[: i + 1] + raise ValueError("Ordering did not contain all total_order_cols") + def with_reverse(self): """Reverses the ordering.""" return ExpressionOrdering( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index bdbc00e620..4716de48d6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1262,9 +1262,7 @@ def sort_values( column_id, direction=direction, na_last=na_last ) ) - return DataFrame( - self._block.order_by(ordering, stable=kind in order.STABLE_SORTS) - ) + return DataFrame(self._block.order_by(ordering)) def value_counts( self, diff --git a/bigframes/series.py b/bigframes/series.py index 28290d591e..2cffdd5fce 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -35,11 +35,7 @@ import bigframes.core.groupby as groupby import bigframes.core.indexers import bigframes.core.indexes as indexes -from bigframes.core.ordering import ( - OrderingColumnReference, - OrderingDirection, - STABLE_SORTS, -) +from bigframes.core.ordering import OrderingColumnReference, OrderingDirection import bigframes.core.scalar as scalars import bigframes.core.utils as utils import bigframes.core.window @@ -1067,7 +1063,6 @@ def sort_values( na_last=(na_position == "last"), ) ], - stable=kind in STABLE_SORTS, ) return Series(block)