diff --git a/docs/whats_new/v_190.rst b/docs/whats_new/v_190.rst index 3ee3222fb..7f9ed486a 100644 --- a/docs/whats_new/v_190.rst +++ b/docs/whats_new/v_190.rst @@ -53,6 +53,8 @@ New transformers Enhancements ~~~~~~~~~~~~ +- Added `errors` parameter to `CategoricalImputer` to handle categorical variables with multiple frequent categories instead of automatically raising a `ValueError`. (`DirekKakkar `_) +- Added ``unseen='warn'`` option to `CountFrequencyEncoder`: unseen categories are encoded as ``NaN`` and a ``UserWarning`` is raised listing the unseen categories per variable. (`DirekKakkar `_) - Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli `_) BUG diff --git a/feature_engine/encoding/base_encoder.py b/feature_engine/encoding/base_encoder.py index b4ae3478f..276bc1e26 100644 --- a/feature_engine/encoding/base_encoder.py +++ b/feature_engine/encoding/base_encoder.py @@ -6,27 +6,21 @@ from sklearn.utils.validation import check_is_fitted from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin -from feature_engine._check_init_parameters.check_variables import ( - _check_variables_input_value, -) +from feature_engine._check_init_parameters.check_variables import \ + _check_variables_input_value from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _missing_values_docstring, - _variables_categorical_docstring, -) -from feature_engine._docstrings.init_parameters.encoders import _ignore_format_docstring + _missing_values_docstring, _variables_categorical_docstring) +from feature_engine._docstrings.init_parameters.encoders import \ + _ignore_format_docstring from feature_engine._docstrings.substitute import Substitution -from feature_engine.dataframe_checks import ( - _check_optional_contains_na, - _check_X_matches_training_df, - check_X, -) +from feature_engine.dataframe_checks import (_check_optional_contains_na, + _check_X_matches_training_df, + check_X) from feature_engine.tags import _return_tags -from feature_engine.variable_handling import ( - check_all_variables, - check_categorical_variables, - find_all_variables, - find_categorical_variables, -) +from feature_engine.variable_handling import (check_all_variables, + check_categorical_variables, + find_all_variables, + find_categorical_variables) @Substitution( @@ -221,6 +215,18 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: def _encode(self, X: pd.DataFrame) -> pd.DataFrame: # replace categories by the learned parameters for feature in self.encoder_dict_.keys(): + # Detect unseen categories BEFORE mapping so we can name them + if self.unseen == "warn": + unseen_cats = set(X[feature].dropna().unique()) - set( + self.encoder_dict_[feature].keys() + ) + if unseen_cats: + warnings.warn( + f"Variable {feature!r} contains unseen categories: " + f"{unseen_cats}. These will be encoded as NaN.", + UserWarning, + ) + X[feature] = X[feature].map(self.encoder_dict_[feature]) # if original variables are cast as categorical, they will remain @@ -266,6 +272,8 @@ def _check_nan_values_after_transformation(self, X): "During the encoding, NaN values were introduced in the feature(s) " f"{nan_columns_str}." ) + # 'warn': per-variable warnings were already issued in _encode before + # the mapping, so nothing more to do here. def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame: """Convert the encoded variable back to the original values. diff --git a/feature_engine/encoding/count_frequency.py b/feature_engine/encoding/count_frequency.py index ae6507627..854e3ea26 100644 --- a/feature_engine/encoding/count_frequency.py +++ b/feature_engine/encoding/count_frequency.py @@ -6,34 +6,26 @@ import pandas as pd from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, - _n_features_in_docstring, - _variables_attribute_docstring, -) + _feature_names_in_docstring, _n_features_in_docstring, + _variables_attribute_docstring) from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _missing_values_docstring, - _variables_categorical_docstring, -) + _missing_values_docstring, _variables_categorical_docstring) from feature_engine._docstrings.init_parameters.encoders import ( - _ignore_format_docstring, - _unseen_docstring, -) -from feature_engine._docstrings.methods import ( - _fit_transform_docstring, - _inverse_transform_docstring, - _transform_encoders_docstring, -) + _ignore_format_docstring, _unseen_docstring) +from feature_engine._docstrings.methods import (_fit_transform_docstring, + _inverse_transform_docstring, + _transform_encoders_docstring) from feature_engine._docstrings.substitute import Substitution from feature_engine.dataframe_checks import check_X from feature_engine.encoding._helper_functions import check_parameter_unseen -from feature_engine.encoding.base_encoder import ( - CategoricalInitMixinNA, - CategoricalMethodsMixin, -) +from feature_engine.encoding.base_encoder import (CategoricalInitMixinNA, + CategoricalMethodsMixin) _unseen_docstring = ( _unseen_docstring + """ If `'encode'`, unseen categories will be encoded as 0 (zero).""" + + """ If `'warn'`, unseen categories will be encoded as NaN and a""" + + """ UserWarning is raised listing the unseen categories per variable.""" ) @@ -166,7 +158,7 @@ def __init__( f"Got {encoding_method} instead." ) - check_parameter_unseen(unseen, ["ignore", "raise", "encode"]) + check_parameter_unseen(unseen, ["ignore", "raise", "encode", "warn"]) super().__init__(variables, missing_values, ignore_format) self.encoding_method = encoding_method self.unseen = unseen diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 8c4000a0c..2d1f48e97 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -1,33 +1,26 @@ # Authors: Soledad Galli # License: BSD 3 clause +import warnings from typing import List, Optional, Union import pandas as pd -from feature_engine._check_init_parameters.check_variables import ( - _check_variables_input_value, -) +from feature_engine._check_init_parameters.check_variables import \ + _check_variables_input_value from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, - _imputer_dict_docstring, - _n_features_in_docstring, - _variables_attribute_docstring, -) -from feature_engine._docstrings.methods import ( - _fit_transform_docstring, - _transform_imputers_docstring, -) + _feature_names_in_docstring, _imputer_dict_docstring, + _n_features_in_docstring, _variables_attribute_docstring) +from feature_engine._docstrings.methods import (_fit_transform_docstring, + _transform_imputers_docstring) from feature_engine._docstrings.substitute import Substitution from feature_engine.dataframe_checks import check_X from feature_engine.imputation.base_imputer import BaseImputer from feature_engine.tags import _return_tags -from feature_engine.variable_handling import ( - check_all_variables, - check_categorical_variables, - find_all_variables, - find_categorical_variables, -) +from feature_engine.variable_handling import (check_all_variables, + check_categorical_variables, + find_all_variables, + find_categorical_variables) @Substitution( @@ -88,6 +81,18 @@ class CategoricalImputer(BaseImputer): type object or categorical. If True, the imputer will select all variables or accept all variables entered by the user, including those cast as numeric. + errors : str, default='raise' + Indicates what to do when the selected imputation_method='frequent' + and a variable has more than 1 mode. + + If 'raise', raises a ValueError and stops the fit. + + If 'warn', raises a UserWarning and continues, imputing using the + first most frequent category found. + + If 'ignore', continues without warnings, imputing using the first + most frequent category found. + Attributes ---------- {imputer_dict_} @@ -135,6 +140,7 @@ def __init__( variables: Union[None, int, str, List[Union[str, int]]] = None, return_object: bool = False, ignore_format: bool = False, + errors: str = "raise", ) -> None: if imputation_method not in ["missing", "frequent"]: raise ValueError( @@ -144,11 +150,18 @@ def __init__( if not isinstance(ignore_format, bool): raise ValueError("ignore_format takes only booleans True and False") + if errors not in ("raise", "warn", "ignore"): + raise ValueError( + "errors takes only values 'raise', 'warn', or 'ignore'. " + f"Got {errors} instead." + ) + self.imputation_method = imputation_method self.fill_value = fill_value self.variables = _check_variables_input_value(variables) self.return_object = return_object self.ignore_format = ignore_format + self.errors = errors def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ @@ -189,9 +202,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # Some variables may contain more than 1 mode: if len(mode_vals) > 1: - raise ValueError( - f"The variable {var} contains multiple frequent categories." - ) + if self.errors == "raise": + raise ValueError( + f"The variable {var} contains multiple " + f"frequent categories. Set errors='warn' or " + f"errors='ignore' to allow imputation using " + f"the first most frequent category found." + ) + elif self.errors == "warn": + warnings.warn( + f"Variable {var} has multiple frequent " + f"categories. The first category found, " + f"{mode_vals[0]}, will be used for imputation.", + UserWarning, + ) self.imputer_dict_ = {var: mode_vals[0]} @@ -208,10 +232,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): varnames_str = ", ".join(varnames) else: varnames_str = varnames[0] - raise ValueError( - f"The variable(s) {varnames_str} contain(s) multiple frequent " - f"categories." - ) + + if self.errors == "raise": + raise ValueError( + f"The variable(s) {varnames_str} contain(s) " + f"multiple frequent categories. Set " + f"errors='warn' or errors='ignore' to allow " + f"imputation using the first most frequent " + f"category found." + ) + elif self.errors == "warn": + warnings.warn( + f"Variable(s) {varnames_str} have multiple " + f"frequent categories. The first category " + f"found will be used for imputation.", + UserWarning, + ) self.imputer_dict_ = mode_vals.iloc[0].to_dict() diff --git a/feature_engine/imputation/drop_missing_data.py b/feature_engine/imputation/drop_missing_data.py index 07c6f3e75..56e77156a 100644 --- a/feature_engine/imputation/drop_missing_data.py +++ b/feature_engine/imputation/drop_missing_data.py @@ -45,15 +45,14 @@ class DropMissingData(BaseImputer, TransformXyMixin): will check missing data in all variables in the dataframe. Alternatively, the imputer will evaluate missing data only in the variables in the list. - Note that if `missing_only=True`, missing data will be removed from variables - that had missing data in the train set. These might be a subset of the - variables indicated in the list. + If a list of variables is provided, ``missing_only`` must be set to ``False``. missing_only: bool, default=True If `True`, rows will be dropped when they show missing data in variables that - had missing data during `fit()`. If `False`, rows will be dropped if there is - missing data in any of the variables. This parameter only works when - `threshold=None`, otherwise it is ignored. + had missing data during `fit()`. Only valid when ``variables=None``. If + `False`, rows will be dropped if there is missing data in any of the + variables. This parameter only works when `threshold=None`, otherwise it is + ignored. threshold: int or float, default=None Require that percentage of non-NA values in a row to keep it. If @@ -131,6 +130,13 @@ def __init__( self.missing_only = missing_only self.threshold = threshold + if self.variables is not None and missing_only is True: + raise ValueError( + "variables and missing_only cannot be used together. " + "Set variables=None to use missing_only=True, or set " + "missing_only=False to pass a list of variables." + ) + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ Find the variables for which missing data should be evaluated to decide if a diff --git a/feature_engine/imputation/missing_indicator.py b/feature_engine/imputation/missing_indicator.py index 01660a654..d7d5fffef 100644 --- a/feature_engine/imputation/missing_indicator.py +++ b/feature_engine/imputation/missing_indicator.py @@ -50,14 +50,15 @@ class AddMissingIndicator(BaseImputer): data or to all variables. **True**: indicators will be created only for those variables that showed - missing data during `fit()`. + missing data during `fit()`. Only valid when ``variables=None``. - **False**: indicators will be created for all variables + **False**: indicators will be created for all variables passed in + ``variables``, or all variables in the dataset if ``variables=None``. variables: list, default=None The list of variables to impute. If None, the imputer will find and - select all variables. - + select all variables. If a list of variables is provided, + ``missing_only`` must be set to ``False``. Attributes ---------- @@ -111,6 +112,13 @@ def __init__( self.variables = _check_variables_input_value(variables) self.missing_only = missing_only + if self.variables is not None and missing_only is True: + raise ValueError( + "variables and missing_only cannot be used together. " + "Set variables=None to use missing_only=True, or set " + "missing_only=False to pass a list of variables." + ) + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ Learn the variables for which the missing indicators will be created. diff --git a/tests/test_encoding/test_count_frequency_encoder.py b/tests/test_encoding/test_count_frequency_encoder.py index 55e13b1cc..feb4a5a5d 100644 --- a/tests/test_encoding/test_count_frequency_encoder.py +++ b/tests/test_encoding/test_count_frequency_encoder.py @@ -237,7 +237,7 @@ def test_no_error_triggered_when_df_contains_unseen_categories_and_unseen_is_enc encoder.transform(df_enc_rare) -@pytest.mark.parametrize("errors", ["raise", "ignore", "encode"]) +@pytest.mark.parametrize("errors", ["raise", "ignore", "encode", "warn"]) def test_fit_raises_error_if_df_contains_na(errors, df_enc_na): # test case 4: when dataset contains na, fit method encoder = CountFrequencyEncoder(unseen=errors) @@ -251,7 +251,7 @@ def test_fit_raises_error_if_df_contains_na(errors, df_enc_na): assert str(record.value) == msg -@pytest.mark.parametrize("errors", ["raise", "ignore", "encode"]) +@pytest.mark.parametrize("errors", ["raise", "ignore", "encode", "warn"]) def test_transform_raises_error_if_df_contains_na(errors, df_enc, df_enc_na): # test case 4: when dataset contains na, transform method encoder = CountFrequencyEncoder(unseen=errors) @@ -476,3 +476,62 @@ def test_inverse_transform_raises_non_fitted_error(): # Test when fit is not called prior to transform. with pytest.raises(NotFittedError): enc.inverse_transform(df1) + + +# --------------------------------------------------------------------------- +# Tests for unseen='warn' +# --------------------------------------------------------------------------- + +@pytest.fixture +def train_test_dfs_warn(): + X_train = pd.DataFrame({"color": ["red", "red", "blue", "green", "blue"]}) + X_test = pd.DataFrame({"color": ["red", "blue", "yellow"]}) # 'yellow' unseen + return X_train, X_test + + +def test_unseen_warn_emits_userwarning(train_test_dfs_warn): + """unseen='warn': UserWarning emitted for unseen categories.""" + X_train, X_test = train_test_dfs_warn + encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn") + encoder.fit(X_train) + with pytest.warns(UserWarning, match="unseen categories"): + encoder.transform(X_test) + + +def test_unseen_warn_encodes_as_nan(train_test_dfs_warn): + """unseen='warn': unseen categories should become NaN.""" + X_train, X_test = train_test_dfs_warn + encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn") + encoder.fit(X_train) + with pytest.warns(UserWarning): + X_tr = encoder.transform(X_test) + # 'yellow' is unseen — should be NaN + assert pd.isna(X_tr.loc[X_tr.index[2], "color"]) + + +def test_unseen_warn_known_categories_encoded_correctly(train_test_dfs_warn): + """unseen='warn': known categories still encoded correctly.""" + X_train, X_test = train_test_dfs_warn + encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn") + encoder.fit(X_train) + with pytest.warns(UserWarning): + X_tr = encoder.transform(X_test) + # 'red' appears 2 times in training + assert X_tr.loc[X_tr.index[0], "color"] == 2 + + +def test_unseen_warn_no_warning_when_no_unseen(train_test_dfs_warn): + """unseen='warn': no warning if all categories were seen during fit.""" + X_train, _ = train_test_dfs_warn + X_test_seen = pd.DataFrame({"color": ["red", "blue"]}) + encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn") + encoder.fit(X_train) + with warnings.catch_warnings(): + warnings.simplefilter("error") # Fail if any warning raised + encoder.transform(X_test_seen) + + +def test_unseen_invalid_value_raises(): + """Invalid unseen value should raise ValueError at init.""" + with pytest.raises(ValueError, match="takes only values"): + CountFrequencyEncoder(unseen="bad_value") diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 182e8826b..1c0640a58 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -1,9 +1,23 @@ +import warnings + +import numpy as np import pandas as pd import pytest from feature_engine.imputation import CategoricalImputer +# --- Shared fixture: perfectly multimodal variable --- +@pytest.fixture +def multimodal_df(): + return pd.DataFrame( + { + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], + "country": ["UK", "UK", "FR", "FR", "DE", "DE"], + } + ) + + def test_impute_with_string_missing_and_automatically_find_variables(df_na): # set up transformer imputer = CategoricalImputer(imputation_method="missing", variables=None) @@ -150,14 +164,22 @@ def test_error_when_imputation_method_not_frequent_or_missing(): def test_error_when_variable_contains_multiple_modes(df_na): - msg = "The variable Name contains multiple frequent categories." + msg = ( + "The variable Name contains multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent", variables="Name") with pytest.raises(ValueError) as record: imputer.fit(df_na) # check that error message matches assert str(record.value) == msg - msg = "The variable(s) Name contain(s) multiple frequent categories." + msg = ( + "The variable(s) Name contain(s) multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent") with pytest.raises(ValueError) as record: imputer.fit(df_na) @@ -166,7 +188,11 @@ def test_error_when_variable_contains_multiple_modes(df_na): df_ = df_na.copy() df_["Name_dup"] = df_["Name"] - msg = "The variable(s) Name, Name_dup contain(s) multiple frequent categories." + msg = ( + "The variable(s) Name, Name_dup contain(s) multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent") with pytest.raises(ValueError) as record: imputer.fit(df_) @@ -305,3 +331,88 @@ def test_error_when_ignore_format_is_not_boolean(ignore_format): # check that error message matches assert str(record.value) == msg + + +def test_errors_raise_on_multimodal_is_default(multimodal_df): + """Default behaviour: raise ValueError on multimodal variable.""" + imputer = CategoricalImputer(imputation_method="frequent") + with pytest.raises(ValueError, match="multiple frequent categories"): + imputer.fit(multimodal_df) + + +def test_errors_warn_emits_userwarning(multimodal_df): + """errors='warn': UserWarning must be emitted.""" + imputer = CategoricalImputer(imputation_method="frequent", errors="warn") + with pytest.warns(UserWarning, match="multiple frequent categories"): + imputer.fit(multimodal_df) + + +def test_errors_warn_uses_first_mode(multimodal_df): + """errors='warn': imputer_dict_ should contain the first mode.""" + imputer = CategoricalImputer(imputation_method="frequent", errors="warn") + with pytest.warns(UserWarning): + imputer.fit(multimodal_df) + expected = multimodal_df["city"].mode()[0] + assert imputer.imputer_dict_["city"] == expected + + +def test_errors_ignore_no_warning_raised(multimodal_df): + """errors='ignore': no warnings should be emitted.""" + imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + with warnings.catch_warnings(): + warnings.simplefilter("error") # Promote all warnings to errors + imputer.fit(multimodal_df) # Should NOT raise + assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] + + +def test_errors_invalid_value_raises(): + """Passing an unsupported value for errors should raise ValueError at init.""" + with pytest.raises(ValueError, match="errors takes only values"): + CategoricalImputer(imputation_method="frequent", errors="bad_value") + + +def test_errors_param_ignored_when_imputation_method_is_missing(): + """errors param has no effect for imputation_method='missing'.""" + df = pd.DataFrame({"city": ["London", np.nan, "Paris"]}) + imputer = CategoricalImputer(imputation_method="missing", errors="warn") + # Should fit without warnings since there's no mode computation + with warnings.catch_warnings(): + warnings.simplefilter("error") + imputer.fit(df) + + +def test_errors_ignore_single_variable(): + """errors='ignore' on single multimodal variable — silent, uses first mode.""" + X = pd.DataFrame( + {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} + ) + imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] + + +def test_errors_ignore_multiple_variables(): + """errors='ignore' on multiple multimodal variables — silent, uses first mode.""" + X = pd.DataFrame( + { + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], + "country": ["UK", "UK", "FR", "FR", "DE", "DE"], + } + ) + imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] + assert imputer.imputer_dict_["country"] == X["country"].mode()[0] + + +def test_errors_warn_single_variable(): + """errors='warn' on single multimodal variable — warns, uses first mode.""" + X = pd.DataFrame( + {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} + ) + imputer = CategoricalImputer( + imputation_method="frequent", variables=["city"], errors="warn" + ) + with pytest.warns(UserWarning, match="Variable city has multiple frequent"): + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] diff --git a/tests/test_imputation/test_drop_missing_data.py b/tests/test_imputation/test_drop_missing_data.py index ee49fee82..413e6a279 100644 --- a/tests/test_imputation/test_drop_missing_data.py +++ b/tests/test_imputation/test_drop_missing_data.py @@ -48,11 +48,11 @@ def test_selelct_all_variables_when_variables_is_none(df_na): def test_detect_variables_with_na_in_variables_entered_by_user(df_na): imputer = DropMissingData( - missing_only=True, variables=["City", "Studies", "Age", "dob"] + missing_only=False, variables=["City", "Studies", "Age", "dob"] ) X_transformed = imputer.fit_transform(df_na) assert imputer.variables == ["City", "Studies", "Age", "dob"] - assert imputer.variables_ == ["City", "Studies", "Age"] + assert imputer.variables_ == ["City", "Studies", "Age", "dob"] assert X_transformed.shape == (6, 6) @@ -60,7 +60,7 @@ def test_return_na_data_method(df_na): # test with vars imputer = DropMissingData( - threshold=0.5, variables=["City", "Studies", "Age", "Marks"] + missing_only=False, threshold=0.5, variables=["City", "Studies", "Age", "Marks"] ) imputer.fit_transform(df_na) X_nona = imputer.return_na_data(df_na) @@ -125,13 +125,37 @@ def test_threshold_value_error(df_na): def test_threshold_with_variables(df_na): # Each row must have 100% data avaiable for columns ['Marks'] - imputer = DropMissingData(threshold=1, variables=["Marks"]) + imputer = DropMissingData(missing_only=False, threshold=1, variables=["Marks"]) X = imputer.fit_transform(df_na) assert list(X.index) == [0, 1, 2, 4, 6, 7] # Each row must have 25% data avaiable for ['City', 'Studies', 'Age', 'Marks'] imputer = DropMissingData( - threshold=0.75, variables=["City", "Studies", "Age", "Marks"] + missing_only=False, + threshold=0.75, + variables=["City", "Studies", "Age", "Marks"] ) X = imputer.fit_transform(df_na) assert list(X.index) == [0, 1, 4, 5, 6, 7] + + +# --------------------------------------------------------------------------- +# Tests for variables + missing_only mutual exclusivity +# --------------------------------------------------------------------------- + +def test_error_when_variables_and_missing_only_true(): + """Passing both variables and missing_only=True should raise ValueError.""" + with pytest.raises(ValueError, match="variables and missing_only"): + DropMissingData(missing_only=True, variables=["Age", "Name"]) + + +def test_no_error_when_variables_and_missing_only_false(): + """variables + missing_only=False is valid — should not raise.""" + imputer = DropMissingData(missing_only=False, variables=["Age"]) + assert imputer.variables is not None + + +def test_no_error_when_variables_none_and_missing_only_true(): + """variables=None + missing_only=True is valid — default case.""" + imputer = DropMissingData(missing_only=True, variables=None) + assert imputer.missing_only is True diff --git a/tests/test_imputation/test_missing_indicator.py b/tests/test_imputation/test_missing_indicator.py index a7f6e9f7c..2dd7af89e 100644 --- a/tests/test_imputation/test_missing_indicator.py +++ b/tests/test_imputation/test_missing_indicator.py @@ -1,8 +1,8 @@ import warnings + import numpy as np import pandas as pd import pytest - from sklearn.pipeline import Pipeline from feature_engine.imputation import AddMissingIndicator @@ -35,7 +35,7 @@ def test_add_indicators_to_all_variables_when_variables_is_none(df_na): def test_add_indicators_to_one_variable(df_na): - imputer = AddMissingIndicator(variables="Name") + imputer = AddMissingIndicator(missing_only=False, variables="Name") X_transformed = imputer.fit_transform(df_na) assert imputer.variables_ == ["Name"] assert X_transformed.shape == (8, 7) @@ -45,14 +45,14 @@ def test_add_indicators_to_one_variable(df_na): def test_detect_variables_with_missing_data_in_variables_entered_by_user(df_na): imputer = AddMissingIndicator( - missing_only=True, variables=["City", "Studies", "Age", "dob"] + missing_only=False, variables=["City", "Studies", "Age", "dob"] ) X_transformed = imputer.fit_transform(df_na) assert imputer.variables == ["City", "Studies", "Age", "dob"] - assert imputer.variables_ == ["City", "Studies", "Age"] - assert X_transformed.shape == (8, 9) + assert imputer.variables_ == ["City", "Studies", "Age", "dob"] + assert X_transformed.shape == (8, 10) assert "City_na" in X_transformed.columns - assert "dob_na" not in X_transformed.columns + assert "dob_na" in X_transformed.columns assert X_transformed["City_na"].sum() == 2 @@ -123,3 +123,25 @@ def test_no_performance_warning_with_many_variables(): issubclass(w.category, pd.errors.PerformanceWarning) for w in captured ), "PerformanceWarning was raised during transform" + + +# --------------------------------------------------------------------------- +# Tests for variables + missing_only mutual exclusivity +# --------------------------------------------------------------------------- + +def test_error_when_variables_and_missing_only_true(): + """Passing both variables and missing_only=True should raise ValueError.""" + with pytest.raises(ValueError, match="variables and missing_only"): + AddMissingIndicator(missing_only=True, variables=["Age", "Name"]) + + +def test_no_error_when_variables_and_missing_only_false(): + """variables + missing_only=False is valid — should not raise.""" + imputer = AddMissingIndicator(missing_only=False, variables=["Age"]) + assert imputer.variables is not None + + +def test_no_error_when_variables_none_and_missing_only_true(): + """variables=None + missing_only=True is valid — default case.""" + imputer = AddMissingIndicator(missing_only=True, variables=None) + assert imputer.missing_only is True