From 78b2349242df3cb45b9c40e1f906bf5a78a07a11 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 19 Mar 2024 00:56:36 +0000 Subject: [PATCH 1/3] feat: add params for LinearRegression model --- bigframes/ml/linear_model.py | 32 +++++++++++++---- bigframes/ml/sql.py | 4 ++- tests/system/large/ml/test_linear_model.py | 42 ++++++++++++++-------- 3 files changed, 57 insertions(+), 21 deletions(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 68d1e12676..62855bae90 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -61,19 +61,25 @@ def __init__( "auto_strategy", "batch_gradient_descent", "normal_equation" ] = "normal_equation", fit_intercept: bool = True, + l1_reg: Optional[float] = None, l2_reg: float = 0.0, max_iterations: int = 20, + warm_start: bool = False, + learn_rate: Optional[float] = None, learn_rate_strategy: Literal["line_search", "constant"] = "line_search", early_stop: bool = True, min_rel_progress: float = 0.01, - ls_init_learn_rate: float = 0.1, + ls_init_learn_rate: Optional[float] = None, calculate_p_values: bool = False, enable_global_explain: bool = False, ): self.optimize_strategy = optimize_strategy self.fit_intercept = fit_intercept + self.l1_reg = l1_reg self.l2_reg = l2_reg self.max_iterations = max_iterations + self.warm_start = warm_start + self.learn_rate = learn_rate self.learn_rate_strategy = learn_rate_strategy self.early_stop = early_stop self.min_rel_progress = min_rel_progress @@ -99,17 +105,21 @@ def _from_bq( for bf_param, bf_value in dummy_linear.__dict__.items(): bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) if bqml_param in last_fitting: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) + # Convert types + kwargs[bf_param] = ( + float(last_fitting[bqml_param]) + if bf_param in ["l1_reg", "learn_rate", "ls_init_learn_rate"] + else type(bf_value)(last_fitting[bqml_param]) + ) new_linear_regression = cls(**kwargs) new_linear_regression._bqml_model = 
core.BqmlModel(session, model) return new_linear_regression @property - def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: + def _bqml_options(self) -> dict: """The model options as they will be set for BQML""" - # TODO: Support l1_reg, warm_start, and learn_rate with error catching. - return { + options = { "model_type": "LINEAR_REG", "data_split_method": "NO_SPLIT", "optimize_strategy": self.optimize_strategy, @@ -119,10 +129,20 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: "learn_rate_strategy": self.learn_rate_strategy, "early_stop": self.early_stop, "min_rel_progress": self.min_rel_progress, - "ls_init_learn_rate": self.ls_init_learn_rate, "calculate_p_values": self.calculate_p_values, "enable_global_explain": self.enable_global_explain, } + if self.l1_reg is not None: + options["l1_reg"] = self.l1_reg + if self.learn_rate is not None: + options["learn_rate"] = self.learn_rate + if self.ls_init_learn_rate is not None: + options["ls_init_learn_rate"] = self.ls_init_learn_rate + # Merely including the warm_start option causes an error with the NORMAL_EQUATION optimizer, so set it only when True. + if self.warm_start is True: + options["warm_start"] = self.warm_start + + return options def _fit( self, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index fa74458e77..807fadc06a 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -38,7 +38,9 @@ def encode_value(self, v: Union[str, int, float, Iterable[str]]) -> str: inner = ", ".join([self.encode_value(x) for x in v]) return f"[{inner}]" else: - raise ValueError(f"Unexpected value type. {constants.FEEDBACK_LINK}") + raise ValueError( + f"Unexpected value type {type(v)}. 
{constants.FEEDBACK_LINK}" + ) def build_parameters(self, **kwargs: Union[str, int, float, Iterable[str]]) -> str: """Encode a dict of values into a formatted Iterable of key-value pairs for SQL""" diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index a0f4182e6f..03fed00770 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -60,9 +60,11 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase assert reloaded_model.calculate_p_values is False assert reloaded_model.early_stop is True assert reloaded_model.enable_global_explain is False + assert reloaded_model.l1_reg is None assert reloaded_model.l2_reg == 0.0 + assert reloaded_model.learn_rate is None assert reloaded_model.learn_rate_strategy == "line_search" - assert reloaded_model.ls_init_learn_rate == 0.1 + assert reloaded_model.ls_init_learn_rate is None assert reloaded_model.max_iterations == 20 assert reloaded_model.min_rel_progress == 0.01 @@ -71,7 +73,14 @@ def test_linear_regression_customized_params_fit_score( penguins_df_default_index, dataset_id ): model = bigframes.ml.linear_model.LinearRegression( - fit_intercept=False, l2_reg=0.1, min_rel_progress=0.01 + fit_intercept=False, + l2_reg=0.2, + min_rel_progress=0.02, + l1_reg=0.2, + max_iterations=30, + optimize_strategy="batch_gradient_descent", + learn_rate_strategy="constant", + learn_rate=0.2, ) df = penguins_df_default_index.dropna() @@ -92,12 +101,12 @@ def test_linear_regression_customized_params_fit_score( result = model.score(X_train, y_train).to_pandas() expected = pd.DataFrame( { - "mean_absolute_error": [226.108411], - "mean_squared_error": [80459.668456], - "mean_squared_log_error": [0.00497], - "median_absolute_error": [171.618872], - "r2_score": [0.875415], - "explained_variance": [0.875417], + "mean_absolute_error": [240], + "mean_squared_error": [91197], + "mean_squared_log_error": [0.00573], + 
"median_absolute_error": [197], + "r2_score": [0.858], + "explained_variance": [0.8588], }, dtype="Float64", ) @@ -109,16 +118,21 @@ def test_linear_regression_customized_params_fit_score( assert ( f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name ) - assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" + assert reloaded_model.optimize_strategy == "BATCH_GRADIENT_DESCENT" assert reloaded_model.fit_intercept is False assert reloaded_model.calculate_p_values is False assert reloaded_model.early_stop is True assert reloaded_model.enable_global_explain is False - assert reloaded_model.l2_reg == 0.1 - assert reloaded_model.learn_rate_strategy == "line_search" - assert reloaded_model.ls_init_learn_rate == 0.1 - assert reloaded_model.max_iterations == 20 - assert reloaded_model.min_rel_progress == 0.01 + assert reloaded_model.l1_reg == 0.2 + assert reloaded_model.l2_reg == 0.2 + assert reloaded_model.ls_init_learn_rate is None + assert reloaded_model.max_iterations == 30 + assert reloaded_model.min_rel_progress == 0.02 + assert reloaded_model.learn_rate_strategy == "CONSTANT" + assert reloaded_model.learn_rate == 0.2 + + +# TODO(garrettwu): add tests for param warm_start. Requires a trained model. 
def test_logistic_regression_configure_fit_score(penguins_df_default_index, dataset_id): From 32a25513f0f864f575c434ad860b0f4da529a9ef Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 19 Mar 2024 17:02:07 +0000 Subject: [PATCH 2/3] fix tests --- tests/unit/ml/test_golden_sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index d63bc7aaa1..8996a9c77f 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -105,7 +105,7 @@ def test_linear_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -115,7 +115,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n 
learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) From c8f62910b9be00619067db3487c73372603e8256 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 19 Mar 2024 21:49:46 +0000 Subject: [PATCH 3/3] update docs --- .../bigframes_vendored/sklearn/linear_model/_base.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index ad2c872468..39012cbe08 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -71,18 +71,24 @@ class LinearRegression(RegressorMixin, LinearModel): Default ``True``. Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). + l1_reg (float or None, default None): + The amount of L1 regularization applied. Default to None. Can't be set in "normal_equation" mode. If unset, value 0 is used. l2_reg (float, default 0.0): The amount of L2 regularization applied. Default to 0. max_iterations (int, default 20): The maximum number of training iterations or steps. Default to 20. + warm_start (bool, default False): + Determines whether to train a model with new training data, new model options, or both. 
Unless you explicitly override them, the initial options used to train the model are used for the warm start run. Default to False. + learn_rate (float or None, default None): + The learn rate for gradient descent when learn_rate_strategy='constant'. If unset, value 0.1 is used. If learn_rate_strategy='line_search', an error is returned. learn_rate_strategy (str, default "line_search"): The strategy for specifying the learning rate during training. Default to "line_search". early_stop (bool, default True): Whether training should stop after the first iteration in which the relative loss improvement is less than the value specified for min_rel_progress. Default to True. min_rel_progress (float, default 0.01): The minimum relative loss improvement that is necessary to continue training when EARLY_STOP is set to true. For example, a value of 0.01 specifies that each iteration must reduce the loss by 1% for training to continue. Default to 0.01. - ls_init_learn_rate (float, default 0.1): - Sets the initial learning rate that learn_rate_strategy='line_search' uses. This option can only be used if line_search is specified. Default to 0.1. + ls_init_learn_rate (float or None, default None): + Sets the initial learning rate that learn_rate_strategy='line_search' uses. This option can only be used if line_search is specified. If unset, value 0.1 is used. calculate_p_values (bool, default False): Specifies whether to compute p-values and standard errors during training. Default to False. enable_global_explain (bool, default False):