diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py
index ec495957d5..69334396b0 100644
--- a/flaml/automl/automl.py
+++ b/flaml/automl/automl.py
@@ -323,6 +323,28 @@ def custom_metric(
             mlflow_logging: boolean, default=True | Whether to log the training results to mlflow.
                 This requires mlflow to be installed and to have an active mlflow run.
                 FLAML will create nested runs.
+            multioutput_train_size: int, default=None | For multi-output tasks, when `eval_method` is set to
+                "holdout" and a validation set is manually specified, set this parameter to the length of
+                the training set. When calling the `fit` method, pass the training set followed by the
+                validation set as one concatenated dataset, and do not pass `X_val`/`y_val`: the first
+                `multioutput_train_size` rows are used for training and the remaining rows for validation.
+                The value must leave at least one row in each part.
+                e.g.,
+
+        ```python
+        model = MultiOutputRegressor(
+            AutoML(
+                task="regression",
+                time_budget=1,
+                eval_method="holdout",
+                multioutput_train_size=len(X_train)
+            )
+        )
+        model.fit(
+            pd.concat([X_train, X_val]),
+            pd.concat([y_train, y_val])
+        )
+        ```
 
         """
         if ERROR:
@@ -375,6 +397,7 @@ def custom_metric(
         settings["custom_hp"] = settings.get("custom_hp", {})
         settings["skip_transform"] = settings.get("skip_transform", False)
         settings["mlflow_logging"] = settings.get("mlflow_logging", True)
+        settings["multioutput_train_size"] = settings.get("multioutput_train_size", None)
 
         self._estimator_type = "classifier" if settings["task"] in CLASSIFICATION else "regressor"
 
@@ -1148,6 +1171,37 @@ def _prepare_data(self, eval_method, split_ratio, n_splits):
         )
         self.data_size_full = self._state.data_size_full
 
+    def _train_val_split(self, train_val_concat, multioutput_train_size):
+        """Split concatenated data into its leading training rows and trailing validation rows.
+
+        Args:
+            train_val_concat: Row-sliceable data holding the training rows followed by the validation rows.
+            multioutput_train_size: int | Number of leading rows that belong to the training set.
+
+        Returns:
+            A tuple (train, val): the first `multioutput_train_size` rows and the remaining rows.
+
+        Raises:
+            ValueError: If the data is None, or if either part of the split would be empty.
+        """
+        if train_val_concat is None:
+            raise ValueError(
+                "multioutput_train_size requires X_train and y_train to hold the concatenated "
+                "training and validation sets."
+            )
+        # Prefer `shape[0]`: some array-likes (e.g., scipy sparse matrices) do not support `len()`.
+        if hasattr(train_val_concat, "shape"):
+            n_rows = train_val_concat.shape[0]
+        else:
+            n_rows = len(train_val_concat)
+        # Rejects 0, negative and too-large sizes, which would otherwise silently yield an empty
+        # (or, for negative values, a differently located) split.
+        if not 0 < multioutput_train_size < n_rows:
+            raise ValueError(
+                f"multioutput_train_size must be between 1 and {n_rows - 1}, got {multioutput_train_size}."
+            )
+        return train_val_concat[:multioutput_train_size], train_val_concat[multioutput_train_size:]
+
     def fit(
         self,
         X_train=None,
@@ -1524,6 +1578,16 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds):
 
         self._state._start_time_flag = self._start_time_flag = time.time()
         task = task or self._settings.get("task")
+        multioutput_train_size = self._settings.get("multioutput_train_size")
+        if multioutput_train_size is not None:
+            if X_val is not None or y_val is not None:
+                raise ValueError(
+                    "multioutput_train_size cannot be used together with explicitly provided "
+                    "validation data (X_val and/or y_val). Please either remove multioutput_train_size "
+                    "from settings or do not pass X_val/y_val."
+                )
+            X_train, X_val = self._train_val_split(X_train, multioutput_train_size)
+            y_train, y_val = self._train_val_split(y_train, multioutput_train_size)
         if isinstance(task, str):
             task = task_factory(task, X_train, y_train)
         self._state.task = task
diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py
index 52c6b1d048..616b65e50d 100644
--- a/test/automl/test_regression.py
+++ b/test/automl/test_regression.py
@@ -230,5 +230,30 @@ def test_multioutput():
     print(model.predict(X_test))
 
 
+def test_multioutput_train_size():
+    import numpy as np
+    from sklearn.datasets import make_regression
+    from sklearn.model_selection import train_test_split
+    from sklearn.multioutput import MultiOutputRegressor
+
+    # create regression data (seeded so the test is reproducible)
+    X, y = make_regression(n_targets=3, random_state=42)
+
+    # split into train and test data
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
+    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
+
+    # train the model
+    model = MultiOutputRegressor(
+        AutoML(task="regression", time_budget=1, eval_method="holdout", multioutput_train_size=len(X_train))
+    )
+    model.fit(np.concatenate([X_train, X_val], axis=0), np.concatenate([y_train, y_val], axis=0))
+
+    # predict: one column per target, one row per test sample
+    y_pred = model.predict(X_test)
+    print(y_pred)
+    assert y_pred.shape == y_test.shape
+
+
 if __name__ == "__main__":
     unittest.main()