diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 8fec253b24..3968e98a69 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -87,6 +87,7 @@ def __init__( kms_key_name: Optional[str] = None, skip_bq_connection_check: bool = False, *, + allow_large_results: bool = True, ordering_mode: Literal["strict", "partial"] = "strict", client_endpoints_override: Optional[dict] = None, ): @@ -98,6 +99,7 @@ def __init__( self._application_name = application_name self._kms_key_name = kms_key_name self._skip_bq_connection_check = skip_bq_connection_check + self._allow_large_results = allow_large_results self._session_started = False # Determines the ordering strictness for the session. self._ordering_mode = _validate_ordering_mode(ordering_mode) @@ -232,6 +234,26 @@ def skip_bq_connection_check(self, value: bool): ) self._skip_bq_connection_check = value + @property + def allow_large_results(self) -> bool: + """ + Flag to allow or disallow query results larger than 10 GB. + + The default setting for this flag is True, which allows queries to return results + exceeding 10 GB by creating an explicit destination table. If set to False, it + restricts the result size to 10 GB, and BigQuery raises an error if this limit + is exceeded. + + Returns: + bool: True if large results are allowed with an explicit destination table, + False if results are limited to 10 GB and errors are raised when exceeded. + """ + return self._allow_large_results + + @allow_large_results.setter + def allow_large_results(self, value: bool): + self._allow_large_results = value + @property def use_regional_endpoints(self) -> bool: """Flag to connect to regional API endpoints. diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index bbbd6b10fd..771b068548 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -112,6 +112,7 @@ class MaterializationOptions: downsampling: sampling_options.SamplingOptions = dataclasses.field( default_factory=sampling_options.SamplingOptions ) + allow_large_results: Optional[bool] = None ordered: bool = True @@ -479,9 +480,12 @@ def to_arrow( self, *, ordered: bool = True, + allow_large_results: Optional[bool] = None, ) -> Tuple[pa.Table, bigquery.QueryJob]: """Run query and download results as a pyarrow Table.""" - execute_result = self.session._executor.execute(self.expr, ordered=ordered) + execute_result = self.session._executor.execute( + self.expr, ordered=ordered, use_explicit_destination=allow_large_results + ) pa_table = execute_result.to_arrow_table() pa_index_labels = [] @@ -503,6 +507,7 @@ def to_pandas( random_state: Optional[int] = None, *, ordered: bool = True, + allow_large_results: Optional[bool] = None, ) -> Tuple[pd.DataFrame, Optional[bigquery.QueryJob]]: """Run query and download results as a pandas DataFrame. @@ -545,7 +550,9 @@ def to_pandas( df, query_job = self._materialize_local( materialize_options=MaterializationOptions( - downsampling=sampling, ordered=ordered + downsampling=sampling, + allow_large_results=allow_large_results, + ordered=ordered, ) ) df.set_axis(self.column_labels, axis=1, copy=False) @@ -563,7 +570,10 @@ def try_peek( return None def to_pandas_batches( - self, page_size: Optional[int] = None, max_results: Optional[int] = None + self, + page_size: Optional[int] = None, + max_results: Optional[int] = None, + allow_large_results: Optional[bool] = None, ): """Download results one message at a time.
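A minimal usage sketch of the option wired up above, not part of the patch: the per-call `allow_large_results` arguments default to None, which defers to the new `bigframes.options.bigquery.allow_large_results` flag, while an explicit True or False wins for that one call. This assumes an initialized BigFrames session; the table is the public sample the new system tests query, everything else is illustrative.

import bigframes
import bigframes.pandas as bpd

# The shipped default is True: results over 10 GB are written to an
# explicit destination table instead of failing.
bigframes.options.bigquery.allow_large_results = False  # opt into the 10 GB cap

df = bpd.read_gbq("bigquery-public-data.samples.wikipedia")
pdf = df.head(100).to_pandas()                          # honors the global cap
pdf = df.head(100).to_pandas(allow_large_results=True)  # per-call override wins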
@@ -572,7 +582,7 @@ def to_pandas_batches( execute_result = self.session._executor.execute( self.expr, ordered=True, - use_explicit_destination=True, + use_explicit_destination=allow_large_results, page_size=page_size, max_results=max_results, ) @@ -601,7 +611,10 @@ def _materialize_local( """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. execute_result = self.session._executor.execute( - self.expr, ordered=materialize_options.ordered, get_size_bytes=True + self.expr, + ordered=materialize_options.ordered, + use_explicit_destination=materialize_options.allow_large_results, + get_size_bytes=True, ) assert execute_result.total_bytes is not None table_mb = execute_result.total_bytes / _BYTES_TO_MEGABYTES @@ -1698,7 +1711,7 @@ def transpose( original_row_index = ( original_row_index if original_row_index is not None - else self.index.to_pandas(ordered=True) + else self.index.to_pandas(ordered=True)[0] ) original_row_count = len(original_row_index) if original_row_count > bigframes.constants.MAX_COLUMNS: @@ -2657,14 +2670,22 @@ def column_ids(self) -> Sequence[str]: def is_null(self) -> bool: return len(self._block._index_columns) == 0 - def to_pandas(self, *, ordered: Optional[bool] = None) -> pd.Index: + def to_pandas( + self, + *, + ordered: Optional[bool] = None, + allow_large_results: Optional[bool] = None, + ) -> Tuple[pd.Index, Optional[bigquery.QueryJob]]: """Executes deferred operations and downloads the results.""" if len(self.column_ids) == 0: raise bigframes.exceptions.NullIndexError( "Cannot materialize index, as this object does not have an index. Set index column(s) using set_index." ) ordered = ordered if ordered is not None else True - return self._block.select_columns([]).to_pandas(ordered=ordered)[0].index + df, query_job = self._block.select_columns([]).to_pandas( + ordered=ordered, allow_large_results=allow_large_results + ) + return df.index, query_job def resolve_level(self, level: LevelsType) -> typing.Sequence[str]: if utils.is_list_like(level): diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index b3a07d33bc..3f48fd3db2 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -490,17 +490,28 @@ def __getitem__(self, key: int) -> typing.Any: else: raise NotImplementedError(f"Index key not supported {key}") - def to_pandas(self) -> pandas.Index: + def to_pandas(self, *, allow_large_results: Optional[bool] = None) -> pandas.Index: """Gets the Index as a pandas Index. + Args: + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. + Returns: pandas.Index: A pandas Index with all of the labels from this Index. 
""" - return self._block.index.to_pandas(ordered=True) + df, query_job = self._block.index.to_pandas( + ordered=True, allow_large_results=allow_large_results + ) + self._query_job = query_job + return df - def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: - return self.to_pandas().to_numpy(dtype, **kwargs) + def to_numpy(self, dtype=None, *, allow_large_results=None, **kwargs) -> np.ndarray: + return self.to_pandas(allow_large_results=allow_large_results).to_numpy( + dtype, **kwargs + ) __array__ = to_numpy diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 8a3697c2f9..b1d24cdfa8 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1566,6 +1566,7 @@ def to_arrow( self, *, ordered: bool = True, + allow_large_results: Optional[bool] = None, ) -> pyarrow.Table: """Write DataFrame to an Arrow table / record batch. @@ -1573,6 +1574,9 @@ def to_arrow( ordered (bool, default True): Determines whether the resulting Arrow table will be ordered. In some cases, unordered may result in a faster-executing query. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. Returns: pyarrow.Table: A pyarrow Table with all rows and columns of this DataFrame. @@ -1580,7 +1584,9 @@ def to_arrow( msg = "to_arrow is in preview. Types and unnamed / duplicate name columns may change in future." warnings.warn(msg, category=bfe.PreviewWarning) - pa_table, query_job = self._block.to_arrow(ordered=ordered) + pa_table, query_job = self._block.to_arrow( + ordered=ordered, allow_large_results=allow_large_results + ) self._set_internal_query_job(query_job) return pa_table @@ -1591,6 +1597,7 @@ def to_pandas( random_state: Optional[int] = None, *, ordered: bool = True, + allow_large_results: Optional[bool] = None, ) -> pandas.DataFrame: """Write DataFrame to pandas DataFrame. @@ -1613,6 +1620,9 @@ def to_pandas( ordered (bool, default True): Determines whether the resulting pandas dataframe will be ordered. In some cases, unordered may result in a faster-executing query. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. Returns: pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the @@ -1625,12 +1635,17 @@ def to_pandas( sampling_method=sampling_method, random_state=random_state, ordered=ordered, + allow_large_results=allow_large_results, ) self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) def to_pandas_batches( - self, page_size: Optional[int] = None, max_results: Optional[int] = None + self, + page_size: Optional[int] = None, + max_results: Optional[int] = None, + *, + allow_large_results: Optional[bool] = None, ) -> Iterable[pandas.DataFrame]: """Stream DataFrame results to an iterable of pandas DataFrame. @@ -1642,6 +1657,9 @@ def to_pandas_batches( The size of each batch. max_results (int, default None): If given, only download this many rows at maximum. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. 
Returns: Iterable[pandas.DataFrame]: @@ -1650,7 +1668,9 @@ def to_pandas_batches( see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable """ return self._block.to_pandas_batches( - page_size=page_size, max_results=max_results + page_size=page_size, + max_results=max_results, + allow_large_results=allow_large_results, ) def _compute_dry_run(self) -> bigquery.QueryJob: @@ -3564,6 +3584,7 @@ def to_csv( *, header: bool = True, index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[str]: # TODO(swast): Can we support partition columns argument? # TODO(chelsealin): Support local file paths. @@ -3571,7 +3592,7 @@ def to_csv( # query results? See: # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size if not utils.is_gcs_path(path_or_buf): - pd_df = self.to_pandas() + pd_df = self.to_pandas(allow_large_results=allow_large_results) return pd_df.to_csv(path_or_buf, sep=sep, header=header, index=index) if "*" not in path_or_buf: raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) @@ -3603,10 +3624,11 @@ def to_json( *, lines: bool = False, index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[str]: # TODO(swast): Can we support partition columns argument? if not utils.is_gcs_path(path_or_buf): - pd_df = self.to_pandas() + pd_df = self.to_pandas(allow_large_results=allow_large_results) return pd_df.to_json( path_or_buf, orient=orient, @@ -3634,7 +3656,11 @@ def to_json( ordering_id=bigframes.session._io.bigquery.IO_ORDERING_ID, ) query_job = self._session._executor.export_gcs( - export_array, id_overrides, path_or_buf, format="json", export_options={} + export_array, + id_overrides, + path_or_buf, + format="json", + export_options={}, ) self._set_internal_query_job(query_job) return None @@ -3738,9 +3764,17 @@ def to_gbq( return destination_table def to_numpy( - self, dtype=None, copy=False, na_value=None, **kwargs + self, + dtype=None, + copy=False, + na_value=None, + *, + allow_large_results=None, + **kwargs, ) -> numpy.ndarray: - return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) + return self.to_pandas(allow_large_results=allow_large_results).to_numpy( + dtype, copy, na_value, **kwargs + ) def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: if copy is False: @@ -3755,6 +3789,7 @@ def to_parquet( *, compression: Optional[Literal["snappy", "gzip"]] = "snappy", index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[bytes]: # TODO(swast): Can we support partition columns argument? # TODO(chelsealin): Support local file paths. @@ -3762,7 +3797,7 @@ def to_parquet( # query results? 
See: # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size if not utils.is_gcs_path(path): - pd_df = self.to_pandas() + pd_df = self.to_pandas(allow_large_results=allow_large_results) return pd_df.to_parquet(path, compression=compression, index=index) if "*" not in path: raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) @@ -3794,12 +3829,23 @@ def to_dict( "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", into: type[dict] = dict, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> dict | list[dict]: - return self.to_pandas().to_dict(orient, into, **kwargs) # type: ignore + return self.to_pandas(allow_large_results=allow_large_results).to_dict(orient, into, **kwargs) # type: ignore - def to_excel(self, excel_writer, sheet_name: str = "Sheet1", **kwargs) -> None: - return self.to_pandas().to_excel(excel_writer, sheet_name, **kwargs) + def to_excel( + self, + excel_writer, + sheet_name: str = "Sheet1", + *, + allow_large_results: Optional[bool] = None, + **kwargs, + ) -> None: + return self.to_pandas(allow_large_results=allow_large_results).to_excel( + excel_writer, sheet_name, **kwargs + ) def to_latex( self, @@ -3807,16 +3853,25 @@ def to_latex( columns: Sequence | None = None, header: bool | Sequence[str] = True, index: bool = True, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> str | None: - return self.to_pandas().to_latex( + return self.to_pandas(allow_large_results=allow_large_results).to_latex( buf, columns=columns, header=header, index=index, **kwargs # type: ignore ) def to_records( - self, index: bool = True, column_dtypes=None, index_dtypes=None + self, + index: bool = True, + column_dtypes=None, + index_dtypes=None, + *, + allow_large_results=None, ) -> numpy.recarray: - return self.to_pandas().to_records(index, column_dtypes, index_dtypes) + return self.to_pandas(allow_large_results=allow_large_results).to_records( + index, column_dtypes, index_dtypes + ) def to_string( self, @@ -3839,8 +3894,10 @@ def to_string( min_rows: int | None = None, max_colwidth: int | None = None, encoding: str | None = None, + *, + allow_large_results: Optional[bool] = None, ) -> str | None: - return self.to_pandas().to_string( + return self.to_pandas(allow_large_results=allow_large_results).to_string( buf, columns, # type: ignore col_space, @@ -3887,8 +3944,10 @@ def to_html( table_id: str | None = None, render_links: bool = False, encoding: str | None = None, + *, + allow_large_results: bool | None = None, ) -> str: - return self.to_pandas().to_html( + return self.to_pandas(allow_large_results=allow_large_results).to_html( buf, columns, # type: ignore col_space, @@ -3919,15 +3978,19 @@ def to_markdown( buf=None, mode: str = "wt", index: bool = True, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> str | None: - return self.to_pandas().to_markdown(buf, mode, index, **kwargs) # type: ignore + return self.to_pandas(allow_large_results=allow_large_results).to_markdown(buf, mode, index, **kwargs) # type: ignore - def to_pickle(self, path, **kwargs) -> None: - return self.to_pandas().to_pickle(path, **kwargs) + def to_pickle(self, path, *, allow_large_results=None, **kwargs) -> None: + return self.to_pandas(allow_large_results=allow_large_results).to_pickle( + path, **kwargs + ) - def to_orc(self, path=None, **kwargs) -> bytes | None: - as_pandas = self.to_pandas() + def to_orc(self, path=None, *, allow_large_results=None, **kwargs) -> bytes | None: + as_pandas = 
self.to_pandas(allow_large_results=allow_large_results) # to_orc only works with default index as_pandas_default_index = as_pandas.reset_index() return as_pandas_default_index.to_orc(path, **kwargs) diff --git a/bigframes/series.py b/bigframes/series.py index 5a84dee32f..33ba6f8599 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -381,6 +381,7 @@ def to_pandas( random_state: Optional[int] = None, *, ordered: bool = True, + allow_large_results: Optional[bool] = None, ) -> pandas.Series: """Writes Series to pandas Series. @@ -403,6 +404,9 @@ def to_pandas( ordered (bool, default True): Determines whether the resulting pandas series will be ordered. In some cases, unordered may result in a faster-executing query. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large query results + over the default size limit of 10 GB. Returns: @@ -414,6 +418,7 @@ def to_pandas( sampling_method=sampling_method, random_state=random_state, ordered=ordered, + allow_large_results=allow_large_results, ) self._set_internal_query_job(query_job) series = df.squeeze(axis=1) @@ -1754,22 +1759,36 @@ def to_csv( *, header: bool = True, index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[str]: if utils.is_gcs_path(path_or_buf): return self.to_frame().to_csv( - path_or_buf, sep=sep, header=header, index=index + path_or_buf, + sep=sep, + header=header, + index=index, + allow_large_results=allow_large_results, ) else: - pd_series = self.to_pandas() + pd_series = self.to_pandas(allow_large_results=allow_large_results) return pd_series.to_csv( path_or_buf=path_or_buf, sep=sep, header=header, index=index ) - def to_dict(self, into: type[dict] = dict) -> typing.Mapping: - return typing.cast(dict, self.to_pandas().to_dict(into)) # type: ignore + def to_dict( + self, + into: type[dict] = dict, + *, + allow_large_results: Optional[bool] = None, + ) -> typing.Mapping: + return typing.cast(dict, self.to_pandas(allow_large_results=allow_large_results).to_dict(into)) # type: ignore - def to_excel(self, excel_writer, sheet_name="Sheet1", **kwargs) -> None: - return self.to_pandas().to_excel(excel_writer, sheet_name, **kwargs) + def to_excel( + self, excel_writer, sheet_name="Sheet1", *, allow_large_results=None, **kwargs + ) -> None: + return self.to_pandas(allow_large_results=allow_large_results).to_excel( + excel_writer, sheet_name, **kwargs + ) def to_json( self, @@ -1780,26 +1799,42 @@ def to_json( *, lines: bool = False, index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[str]: if utils.is_gcs_path(path_or_buf): return self.to_frame().to_json( - path_or_buf=path_or_buf, orient=orient, lines=lines, index=index + path_or_buf=path_or_buf, + orient=orient, + lines=lines, + index=index, + allow_large_results=allow_large_results, ) else: - pd_series = self.to_pandas() + pd_series = self.to_pandas(allow_large_results=allow_large_results) return pd_series.to_json( path_or_buf=path_or_buf, orient=orient, lines=lines, index=index # type: ignore ) def to_latex( - self, buf=None, columns=None, header=True, index=True, **kwargs + self, + buf=None, + columns=None, + header=True, + index=True, + *, + allow_large_results=None, + **kwargs, ) -> typing.Optional[str]: - return self.to_pandas().to_latex( + return self.to_pandas(allow_large_results=allow_large_results).to_latex( buf, columns=columns, header=header, index=index, **kwargs ) - def tolist(self) -> _list: - return self.to_pandas().to_list() + def tolist( + self, 
+ *, + allow_large_results: Optional[bool] = None, + ) -> _list: + return self.to_pandas(allow_large_results=allow_large_results).to_list() to_list = tolist to_list.__doc__ = inspect.getdoc(vendored_pandas_series.Series.tolist) @@ -1809,14 +1844,24 @@ def to_markdown( buf: typing.IO[str] | None = None, mode: str = "wt", index: bool = True, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> typing.Optional[str]: - return self.to_pandas().to_markdown(buf, mode=mode, index=index, **kwargs) # type: ignore + return self.to_pandas(allow_large_results=allow_large_results).to_markdown(buf, mode=mode, index=index, **kwargs) # type: ignore def to_numpy( - self, dtype=None, copy=False, na_value=None, **kwargs + self, + dtype=None, + copy=False, + na_value=None, + *, + allow_large_results=None, + **kwargs, ) -> numpy.ndarray: - return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) + return self.to_pandas(allow_large_results=allow_large_results).to_numpy( + dtype, copy, na_value, **kwargs + ) def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: if copy is False: @@ -1825,8 +1870,10 @@ def __array__(self, dtype=None, copy: Optional[bool] = None) -> numpy.ndarray: __array__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__array__) - def to_pickle(self, path, **kwargs) -> None: - return self.to_pandas().to_pickle(path, **kwargs) + def to_pickle(self, path, *, allow_large_results=None, **kwargs) -> None: + return self.to_pandas(allow_large_results=allow_large_results).to_pickle( + path, **kwargs + ) def to_string( self, @@ -1840,8 +1887,10 @@ def to_string( name=False, max_rows=None, min_rows=None, + *, + allow_large_results=None, ) -> typing.Optional[str]: - return self.to_pandas().to_string( + return self.to_pandas(allow_large_results=allow_large_results).to_string( buf, na_rep, float_format, @@ -1854,8 +1903,12 @@ def to_string( min_rows, ) - def to_xarray(self): - return self.to_pandas().to_xarray() + def to_xarray( + self, + *, + allow_large_results: Optional[bool] = None, + ): + return self.to_pandas(allow_large_results=allow_large_results).to_xarray() def _throw_if_index_contains_duplicates( self, error_message: typing.Optional[str] = None diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index a293abea6a..7c9cec1fc5 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -61,6 +61,7 @@ _MAX_CLUSTER_COLUMNS = 4 # TODO: b/338258028 Enable pruning to reduce text size. 
ENABLE_PRUNING = False +MAX_SMALL_RESULT_BYTES = 10 * 1024 * 1024 * 1024 # 10G @dataclasses.dataclass(frozen=True) @@ -105,7 +106,7 @@ def execute( *, ordered: bool = True, col_id_overrides: Mapping[str, str] = {}, - use_explicit_destination: bool = False, + use_explicit_destination: Optional[bool] = False, get_size_bytes: bool = False, page_size: Optional[int] = None, max_results: Optional[int] = None, @@ -242,11 +243,14 @@ def execute( *, ordered: bool = True, col_id_overrides: Mapping[str, str] = {}, - use_explicit_destination: bool = False, + use_explicit_destination: Optional[bool] = False, get_size_bytes: bool = False, page_size: Optional[int] = None, max_results: Optional[int] = None, ): + if use_explicit_destination is None: + use_explicit_destination = bigframes.options.bigquery.allow_large_results + if bigframes.options.compute.enable_multi_query_execution: self._simplify_with_caching(array_value) @@ -274,11 +278,19 @@ def execute( def iterator_supplier(): return iterator.to_arrow_iterable(bqstorage_client=self.bqstoragereadclient) - if get_size_bytes is True: + if get_size_bytes is True or use_explicit_destination: size_bytes = self.bqclient.get_table(query_job.destination).num_bytes else: size_bytes = None + if size_bytes is not None and size_bytes >= MAX_SMALL_RESULT_BYTES: + warnings.warn( + "The query result size has exceeded 10 GB. In BigFrames 2.0 and " + "later, you might need to manually set `allow_large_results=True` in " + "the IO method or adjust the BigFrames option: " + "`bigframes.options.bigquery.allow_large_results=True`.", + FutureWarning, + ) # Runs strict validations to ensure internal type predictions and ibis are completely in sync # Do not execute these validations outside of testing suite. if "PYTEST_CURRENT_TEST" in os.environ and len(col_id_overrides) == 0: @@ -337,6 +349,7 @@ def export_gcs( array_value, ordered=False, col_id_overrides=col_id_overrides, + use_explicit_destination=True, ).query_job result_table = query_job.destination export_data_statement = bq_io.create_export_data_statement( diff --git a/tests/system/large/test_dataframe_io.py b/tests/system/large/test_dataframe_io.py new file mode 100644 index 0000000000..c055babce6 --- /dev/null +++ b/tests/system/large/test_dataframe_io.py @@ -0,0 +1,59 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import warnings + +import google.api_core.exceptions +import pytest + +import bigframes + +WIKIPEDIA_TABLE = "bigquery-public-data.samples.wikipedia" +LARGE_TABLE_OPTION = "bigquery.allow_large_results" + + +def test_to_pandas_batches_raise_when_large_result_not_allowed(session): + with bigframes.option_context(LARGE_TABLE_OPTION, False), pytest.raises( + google.api_core.exceptions.Forbidden + ): + df = session.read_gbq(WIKIPEDIA_TABLE) + next(df.to_pandas_batches(page_size=500, max_results=1500)) + + +def test_to_pandas_batches_override_global_option( + session, +): + with bigframes.option_context(LARGE_TABLE_OPTION, False): + df = session.read_gbq(WIKIPEDIA_TABLE) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + next( + df.to_pandas_batches( + page_size=500, max_results=1500, allow_large_results=True + ) + ) + assert len(w) == 2 + assert issubclass(w[0].category, FutureWarning) + assert str(w[0].message).startswith( + "The query result size has exceeded 10 GB." + ) + + +def test_to_pandas_raise_when_large_result_not_allowed(session): + with bigframes.option_context(LARGE_TABLE_OPTION, False), pytest.raises( + google.api_core.exceptions.Forbidden + ): + df = session.read_gbq(WIKIPEDIA_TABLE) + df.to_pandas() diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ecece3462a..d73a7aaf4f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2223,8 +2223,14 @@ def test_df_corr_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. # - Index types: BigFrames uses string, Pandas uses object. + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + # Only check row order in ordered mode. pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + check_like=not scalars_df._block.session._strictly_ordered, ) @@ -2261,8 +2267,14 @@ def test_cov_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. # - Index types: BigFrames uses string, Pandas uses object. + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + # Only check row order in ordered mode. pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + check_like=not scalars_df._block.session._strictly_ordered, ) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index b07213f943..264ca36157 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -249,6 +249,30 @@ def test_to_pandas_array_struct_correct_result(session): ) + +def test_to_pandas_override_global_option(scalars_df_index): + # Direct call to_pandas uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + scalars_df_index.to_pandas() + assert scalars_df_index._query_job.destination.table_id.startswith("bqdf") + + # When allow_large_results=False, a destination table is implicitly created, + # table has 'anon' prefix.
+ scalars_df_index.to_pandas(allow_large_results=False) + assert scalars_df_index._query_job.destination.table_id.startswith("anon") + + +def test_to_arrow_override_global_option(scalars_df_index): + # Direct call to_arrow uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + scalars_df_index.to_arrow() + assert scalars_df_index._query_job.destination.table_id.startswith("bqdf") + + # When allow_large_results=False, a destination table is implicitly created, + # table has 'anon' prefix. + scalars_df_index.to_arrow(allow_large_results=False) + assert scalars_df_index._query_job.destination.table_id.startswith("anon") + + def test_load_json_w_unboxed_py_value(session): sql = """ SELECT 0 AS id, JSON_OBJECT('boolean', True) AS json_col, diff --git a/tests/system/small/test_index_io.py b/tests/system/small/test_index_io.py new file mode 100644 index 0000000000..31818dfad8 --- /dev/null +++ b/tests/system/small/test_index_io.py @@ -0,0 +1,39 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_to_pandas_override_global_option(scalars_df_index): + bf_index = scalars_df_index.index + # Direct call to_pandas uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + bf_index.to_pandas() + assert bf_index._query_job.destination.table_id.startswith("bqdf") + + # When allow_large_results=False, a destination table is implicitly created, + # table has 'anon' prefix. + bf_index.to_pandas(allow_large_results=False) + assert bf_index._query_job.destination.table_id.startswith("anon") + + +def test_to_numpy_override_global_option(scalars_df_index): + bf_index = scalars_df_index.index + # Direct call to_numpy uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + bf_index.to_numpy() + assert bf_index._query_job.destination.table_id.startswith("bqdf") + + # When allow_large_results=False, a destination table is implicitly created, + # table has 'anon' prefix. + bf_index.to_numpy(allow_large_results=False) + assert bf_index._query_job.destination.table_id.startswith("anon") diff --git a/tests/system/small/test_series_io.py b/tests/system/small/test_series_io.py new file mode 100644 index 0000000000..ed27246a80 --- /dev/null +++ b/tests/system/small/test_series_io.py @@ -0,0 +1,26 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +def test_to_pandas_override_global_option(scalars_df_index): + bf_series = scalars_df_index["int64_col"] + # Direct call to_pandas uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + bf_series.to_pandas() + assert bf_series._query_job.destination.table_id.startswith("bqdf") + + # When allow_large_results=False, a destination table is implicitly created, + # table has 'anon' prefix. + bf_series.to_pandas(allow_large_results=False) + assert bf_series._query_job.destination.table_id.startswith("anon") diff --git a/tests/unit/polars_session.py b/tests/unit/polars_session.py index cffd8ff7ca..6cbb247587 100644 --- a/tests/unit/polars_session.py +++ b/tests/unit/polars_session.py @@ -40,7 +40,7 @@ def execute( *, ordered: bool = True, col_id_overrides: Mapping[str, str] = {}, - use_explicit_destination: bool = False, + use_explicit_destination: Optional[bool] = False, get_size_bytes: bool = False, page_size: Optional[int] = None, max_results: Optional[int] = None, diff --git a/tests/unit/resources.py b/tests/unit/resources.py index c091eac2a2..ebc1243eaf 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -24,6 +24,7 @@ import bigframes.clients import bigframes.core.ordering import bigframes.dataframe +import bigframes.series import bigframes.session.clients import bigframes.session.executor import bigframes.session.metrics diff --git a/tests/unit/test_dataframe_io.py b/tests/unit/test_dataframe_io.py new file mode 100644 index 0000000000..5deb0d7a24 --- /dev/null +++ b/tests/unit/test_dataframe_io.py @@ -0,0 +1,51 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import Mock + +import pytest + +from . 
import resources + + +@pytest.fixture +def mock_df(monkeypatch: pytest.MonkeyPatch): + dataframe = resources.create_dataframe(monkeypatch) + monkeypatch.setattr(dataframe, "to_pandas", Mock()) + return dataframe + + +@pytest.mark.parametrize( + "api_name, kwargs", + [ + ("to_csv", {"allow_large_results": True}), + ("to_json", {"allow_large_results": True}), + ("to_numpy", {"allow_large_results": True}), + ("to_parquet", {"allow_large_results": True}), + ("to_dict", {"allow_large_results": True}), + ("to_excel", {"excel_writer": "abc", "allow_large_results": True}), + ("to_latex", {"allow_large_results": True}), + ("to_records", {"allow_large_results": True}), + ("to_string", {"allow_large_results": True}), + ("to_html", {"allow_large_results": True}), + ("to_markdown", {"allow_large_results": True}), + ("to_pickle", {"path": "abc", "allow_large_results": True}), + ("to_orc", {"allow_large_results": True}), + ], +) +def test_dataframe_to_pandas(mock_df, api_name, kwargs): + getattr(mock_df, api_name)(**kwargs) + mock_df.to_pandas.assert_called_once_with( + allow_large_results=kwargs["allow_large_results"] + ) diff --git a/tests/unit/test_series_io.py b/tests/unit/test_series_io.py new file mode 100644 index 0000000000..a97293d3da --- /dev/null +++ b/tests/unit/test_series_io.py @@ -0,0 +1,50 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import Mock + +import pytest + +from . 
import resources + + +@pytest.fixture +def mock_series(monkeypatch: pytest.MonkeyPatch): + dataframe = resources.create_dataframe(monkeypatch) + series = dataframe["col"] + monkeypatch.setattr(series, "to_pandas", Mock()) + return series + + +@pytest.mark.parametrize( + "api_name, kwargs", + [ + ("to_csv", {"allow_large_results": True}), + ("to_dict", {"allow_large_results": True}), + ("to_excel", {"excel_writer": "abc", "allow_large_results": True}), + ("to_json", {"allow_large_results": True}), + ("to_latex", {"allow_large_results": True}), + ("to_list", {"allow_large_results": True}), + ("to_markdown", {"allow_large_results": True}), + ("to_numpy", {"allow_large_results": True}), + ("to_pickle", {"path": "abc", "allow_large_results": True}), + ("to_string", {"allow_large_results": True}), + ("to_xarray", {"allow_large_results": True}), + ], +) +def test_series_allow_large_results_param_passing(mock_series, api_name, kwargs): + getattr(mock_series, api_name)(**kwargs) + mock_series.to_pandas.assert_called_once_with( + allow_large_results=kwargs["allow_large_results"] + ) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e296dcb9f6..e59232ee85 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -365,7 +365,15 @@ def from_records( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray: + def to_numpy( + self, + dtype=None, + copy=False, + na_value=None, + *, + allow_large_results=None, + **kwargs, + ) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -388,7 +396,9 @@ def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarra na_value (Any, default None): The value to use for missing values. The default value depends on dtype and the dtypes of the DataFrame columns. - + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. Returns: numpy.ndarray: The converted NumPy array. """ @@ -509,6 +519,7 @@ def to_parquet( *, compression: Optional[Literal["snappy", "gzip"]] = "snappy", index: bool = True, + allow_large_results: Optional[bool] = None, ) -> Optional[bytes]: """Write a DataFrame to the binary Parquet format. @@ -534,14 +545,16 @@ def to_parquet( should be formatted ``gs:///``. If the data size is more than 1GB, you must use a wildcard to export the data into multiple files and the size of the files varies. - compression (str, default 'snappy'): Name of the compression to use. Use ``None`` for no compression. Supported options: ``'gzip'``, ``'snappy'``. - index (bool, default True): If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. This parameter has + no effect when results are saved to Google Cloud Storage (GCS). Returns: None or bytes: @@ -560,6 +573,8 @@ def to_dict( "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", into: type[dict] = dict, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> dict | list[dict]: """ @@ -613,11 +628,13 @@ def to_dict( in the return value. 
Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. - index (bool, default True): Whether to include the index item (and index_names item if `orient` is 'tight') in the returned dictionary. Can only be ``False`` when `orient` is 'split' or 'tight'. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. Returns: dict or list of dict: Return a collections.abc.Mapping object representing the DataFrame. @@ -625,7 +642,14 @@ """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_excel(self, excel_writer, sheet_name: str = "Sheet1", **kwargs) -> None: + def to_excel( + self, + excel_writer, + sheet_name: str = "Sheet1", + *, + allow_large_results: Optional[bool] = None, + **kwargs, + ) -> None: """ Write DataFrame to an Excel sheet. @@ -653,11 +677,21 @@ def to_excel(self, excel_writer, sheet_name: str = "Sheet1", **kwargs) -> None: File path or existing ExcelWriter. sheet_name (str, default 'Sheet1'): Name of sheet which will contain DataFrame. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_latex( - self, buf=None, columns=None, header=True, index=True, **kwargs + self, + buf=None, + columns=None, + header=True, + index=True, + *, + allow_large_results=None, + **kwargs, ) -> str | None: r""" Render object to a LaTeX tabular, longtable, or nested table. @@ -693,6 +727,9 @@ def to_latex( it is assumed to be aliases for the column names. index (bool, default True): Write row names (index). + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. Returns: str or None: If buf is None, returns the result as a string. Otherwise returns @@ -701,7 +738,12 @@ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_records( - self, index: bool = True, column_dtypes=None, index_dtypes=None + self, + index: bool = True, + column_dtypes=None, + index_dtypes=None, + *, + allow_large_results=None, ) -> np.recarray: """ Convert DataFrame to a NumPy record array. @@ -731,6 +773,9 @@ def to_records( If a string or type, the data type to store all index levels. If a dictionary, a mapping of index level names and indices (zero-indexed) to specific data types. This mapping is applied only if `index=True`. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. @@ -761,6 +806,8 @@ def to_string( min_rows: int | None = None, max_colwidth: int | None = None, encoding: str | None = None, + *, + allow_large_results: Optional[bool] = None, ): """Render a DataFrame to a console-friendly tabular output. @@ -824,6 +871,9 @@ Max width to truncate each column in characters. By default, no limit. encoding (str, default "utf-8"): Set character encoding. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. Returns: str or None: If buf is None, returns the result as a string.
Otherwise returns @@ -856,6 +906,8 @@ def to_html( table_id: str | None = None, render_links: bool = False, encoding: str | None = None, + *, + allow_large_results: bool | None = None, ): """Render a DataFrame as an HTML table. @@ -948,6 +1000,9 @@ def to_html( Convert URLs to HTML links. encoding (str, default "utf-8"): Set character encoding. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. Returns: str or None: If buf is None, returns the result as a string. Otherwise @@ -960,6 +1015,8 @@ def to_markdown( buf=None, mode: str = "wt", index: bool = True, + *, + allow_large_results: Optional[bool] = None, **kwargs, ): """Print DataFrame in Markdown-friendly format. @@ -983,6 +1040,9 @@ def to_markdown( Mode in which file is opened. index (bool, optional, default True): Add index (row) labels. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. **kwargs These parameters will be passed to `tabulate `_. @@ -992,7 +1052,7 @@ """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_pickle(self, path, **kwargs) -> None: + def to_pickle(self, path, *, allow_large_results=None, **kwargs) -> None: """Pickle (serialize) object to file. **Examples:** @@ -1007,10 +1067,13 @@ Args: path (str): File path where the pickled object will be stored. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_orc(self, path=None, **kwargs) -> bytes | None: + def to_orc(self, path=None, *, allow_large_results=None, **kwargs) -> bytes | None: """ Write a DataFrame to the ORC format. @@ -1030,6 +1093,9 @@ def to_orc(self, path=None, **kwargs) -> bytes | None: we refer to objects with a write() method, such as a file handle (e.g. via builtin open function). If path is None, a bytes object is returned. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. Returns: bytes or None: diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 9dae802b6e..ee35bfa429 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -223,6 +223,7 @@ def to_json( *, index: bool = True, lines: bool = False, + allow_large_results: Optional[bool] = None, ) -> Optional[str]: """Convert the object to a JSON string, written to Cloud Storage. @@ -278,6 +279,11 @@ def to_json( throw ValueError if incorrect 'orient' since others are not list-like. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. This parameter has + no effect when results are saved to Google Cloud Storage (GCS).
+ Returns: None or str: If path_or_buf is None, returns the resulting json format as a @@ -289,7 +295,13 @@ """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_csv(self, path_or_buf, *, index: bool = True) -> Optional[str]: + def to_csv( + self, + path_or_buf, + *, + index: bool = True, + allow_large_results: Optional[bool] = None, + ) -> Optional[str]: """Write object to a comma-separated values (csv) file on Cloud Storage. Args: @@ -313,6 +325,11 @@ def to_csv(self, path_or_buf, *, index: bool = True) -> Optional[str]: index (bool, default True): If True, write row names (index). + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. This parameter has + no effect when results are saved to Google Cloud Storage (GCS). + Returns: None or str: If path_or_buf is None, returns the resulting csv format as a string. Otherwise returns None. diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 59504ee68c..c94f707671 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -1061,13 +1061,16 @@ def drop_duplicates(self, *, keep: str = "first"): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_numpy(self, dtype): + def to_numpy(self, dtype, *, allow_large_results=None): """ A NumPy ndarray representing the values in this Series or Index. Args: dtype: The dtype to pass to :meth:`numpy.asarray`. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. **kwargs: Additional keywords passed through to the ``to_numpy`` method of the underlying array (for extension arrays). diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 5e6f546d09..913a2e7c3e 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -458,6 +458,8 @@ def to_string( name: bool = False, max_rows: int | None = None, min_rows: int | None = None, + *, + allow_large_results: Optional[bool] = None, ) -> str | None: """ Render a string representation of the Series. @@ -486,6 +488,9 @@ def to_string( min_rows (int, optional): The number of rows to display in a truncated repr (when number of rows is above `max_rows`). + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. Returns: str or None: @@ -498,6 +503,8 @@ def to_markdown( buf: IO[str] | None = None, mode: str = "wt", index: bool = True, + *, + allow_large_results: Optional[bool] = None, **kwargs, ) -> str | None: """ @@ -537,6 +544,9 @@ def to_markdown( Buffer to write to. If None, the output is returned as a string. mode (str, optional): Mode in which file is opened, "wt" by default. index (bool, optional, default True): Add index (row) labels. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB.
@@ -546,7 +556,12 @@ def to_markdown( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_dict(self, into: type[dict] = dict) -> Mapping: + def to_dict( + self, + into: type[dict] = dict, + *, + allow_large_results: Optional[bool] = None, + ) -> Mapping: """ Convert Series to {label -> value} dict or dict-like object. @@ -573,6 +588,9 @@ def to_dict(self, into: type[dict] = dict) -> Mapping: object. Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. Returns: collections.abc.Mapping: @@ -611,7 +629,13 @@ def to_frame(self, name=None) -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_excel(self, excel_writer, sheet_name): + def to_excel( + self, + excel_writer, + sheet_name, + *, + allow_large_results=None, + ): """ Write Series to an Excel sheet. @@ -630,10 +654,22 @@ def to_excel(self, excel_writer, sheet_name): File path or existing ExcelWriter. sheet_name (str, default 'Sheet1'): Name of sheet to contain Series. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs): + def to_latex( + self, + buf=None, + columns=None, + header=True, + index=True, + *, + allow_large_results=None, + **kwargs, + ): """ Render object to a LaTeX tabular, longtable, or nested table. @@ -647,6 +683,9 @@ def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs): it is assumed to be aliases for the column names. index (bool, default True): Write row names (index). + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. Returns: str or None: @@ -655,7 +694,7 @@ def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def tolist(self) -> list: + def tolist(self, *, allow_large_results: Optional[bool] = None) -> list: """ Return a list of the values. @@ -678,6 +717,11 @@ def tolist(self) -> list: >>> s.to_list() [1, 2, 3] + Args: + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. + Returns: list: list of the values. @@ -686,7 +730,7 @@ def tolist(self) -> list: to_list = tolist - def to_numpy(self, dtype, copy=False, na_value=None): + def to_numpy(self, dtype, copy=False, na_value=None, *, allow_large_results=None): """ A NumPy ndarray representing the values in this Series or Index. @@ -727,6 +771,9 @@ def to_numpy(self, dtype, copy=False, na_value=None): na_value (Any, optional): The value to use for missing values. The default value depends on `dtype` and the type of the array. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. ``**kwargs``: Additional keywords passed through to the ``to_numpy`` method of the underlying array (for extension arrays). 
@@ -738,7 +785,7 @@ def to_numpy(self, dtype, copy=False, na_value=None): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_pickle(self, path, **kwargs): + def to_pickle(self, path, *, allow_large_results=None, **kwargs): """ Pickle (serialize) object to file. @@ -776,13 +823,16 @@ def to_pickle(self, path, **kwargs): String, path object (implementing ``os.PathLike[str]``), or file-like object implementing a binary ``write()`` function. File path where the pickled object will be stored. + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow + large query results over the default size limit of 10 GB. Returns: None """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_xarray(self): + def to_xarray(self, *, allow_large_results=None): """ Return an xarray object from the pandas object. @@ -791,6 +841,10 @@ Data in the pandas structure converted to Dataset if the object is a DataFrame, or a DataArray if the object is a Series. + Args: + allow_large_results (bool, default None): + If not None, overrides the global setting to allow or disallow large + query results over the default size limit of 10 GB. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
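A note on how the pieces compose: the executor resolves use_explicit_destination=None to bigframes.options.bigquery.allow_large_results, so an explicit method argument always wins over the global option, and export_gcs now forces use_explicit_destination=True, which is why the docstrings say the option has no effect for results saved to GCS. Below is a sketch of the scoped override the new system tests rely on — the same public sample table, with illustrative error handling; it is not part of the patch.

import bigframes
import bigframes.pandas as bpd
import google.api_core.exceptions

df = bpd.read_gbq("bigquery-public-data.samples.wikipedia")

# bigframes.option_context restores the option when the block exits.
with bigframes.option_context("bigquery.allow_large_results", False):
    try:
        # Without an explicit destination table, BigQuery rejects results
        # over the 10 GB cap.
        next(df.to_pandas_batches(page_size=500, max_results=1500))
    except google.api_core.exceptions.Forbidden as exc:
        print(f"large result rejected: {exc}")

    # An explicit True overrides the option for this call only; once the
    # result exceeds 10 GB, expect the FutureWarning added in executor.py.
    next(
        df.to_pandas_batches(
            page_size=500, max_results=1500, allow_large_results=True
        )
    )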