feature-engine · direkkakkar319-ops · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026
diff --git a/.gitignore b/.gitignore
@@ -86,6 +86,7 @@ celerybeat-schedule
 # Environments
 .env
 .venv
+.venv_wsl
 env/
 venv/
 ENV/

diff --git a/docs/whats_new/v_190.rst b/docs/whats_new/v_190.rst
@@ -53,6 +53,8 @@ New transformers
 Enhancements
 ~~~~~~~~~~~~
 
+- Added the `errors` parameter to `CategoricalImputer` to control what happens when a variable has multiple frequent categories: `'raise'` (default) raises a `ValueError`, whereas `'warn'` and `'ignore'` impute with the first most frequent category found. (`DirekKakkar <https://github.com/DirekKakkar>`_)
+- Added the ``unseen='warn'`` option to `CountFrequencyEncoder`: unseen categories are encoded as ``NaN`` and a ``UserWarning`` is issued for each affected variable, listing its unseen categories. (`DirekKakkar <https://github.com/DirekKakkar>`_)
 - Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli <https://github.com/solegalli>`_)
 
 BUG

diff --git a/feature_engine/encoding/base_encoder.py b/feature_engine/encoding/base_encoder.py
@@ -6,27 +6,21 @@
 from sklearn.utils.validation import check_is_fitted
 
 from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin
-from feature_engine._check_init_parameters.check_variables import (
-    _check_variables_input_value,
-)
+from feature_engine._check_init_parameters.check_variables import \
+    _check_variables_input_value
 from feature_engine._docstrings.init_parameters.all_trasnformers import (
-    _missing_values_docstring,
-    _variables_categorical_docstring,
-)
-from feature_engine._docstrings.init_parameters.encoders import _ignore_format_docstring
+    _missing_values_docstring, _variables_categorical_docstring)
+from feature_engine._docstrings.init_parameters.encoders import \
+    _ignore_format_docstring
 from feature_engine._docstrings.substitute import Substitution
-from feature_engine.dataframe_checks import (
-    _check_optional_contains_na,
-    _check_X_matches_training_df,
-    check_X,
-)
+from feature_engine.dataframe_checks import (_check_optional_contains_na,
+                                             _check_X_matches_training_df,
+                                             check_X)
 from feature_engine.tags import _return_tags
-from feature_engine.variable_handling import (
-    check_all_variables,
-    check_categorical_variables,
-    find_all_variables,
-    find_categorical_variables,
-)
+from feature_engine.variable_handling import (check_all_variables,
+                                              check_categorical_variables,
+                                              find_all_variables,
+                                              find_categorical_variables)
 
 
 @Substitution(
@@ -221,6 +215,18 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
     def _encode(self, X: pd.DataFrame) -> pd.DataFrame:
         # replace categories by the learned parameters
         for feature in self.encoder_dict_.keys():
+            # Detect unseen categories BEFORE mapping so we can name them
+            if self.unseen == "warn":
+                unseen_cats = set(X[feature].dropna().unique()) - set(
+                    self.encoder_dict_[feature].keys()
+                )
+                if unseen_cats:
+                    warnings.warn(
+                        f"Variable {feature!r} contains unseen categories: "
+                        f"{unseen_cats}. These will be encoded as NaN.",
+                        UserWarning,
+                    )
+
             X[feature] = X[feature].map(self.encoder_dict_[feature])
 
             # if original variables are cast as categorical, they will remain
@@ -266,6 +272,8 @@ def _check_nan_values_after_transformation(self, X):
                     "During the encoding, NaN values were introduced in the feature(s) "
                     f"{nan_columns_str}."
                 )
+            # 'warn': per-variable warnings were already issued in _encode before
+            # the mapping, so nothing more to do here.
 
     def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
         """Convert the encoded variable back to the original values.

diff --git a/feature_engine/encoding/count_frequency.py b/feature_engine/encoding/count_frequency.py
@@ -6,34 +6,26 @@
 import pandas as pd
 
 from feature_engine._docstrings.fit_attributes import (
-    _feature_names_in_docstring,
-    _n_features_in_docstring,
-    _variables_attribute_docstring,
-)
+    _feature_names_in_docstring, _n_features_in_docstring,
+    _variables_attribute_docstring)
 from feature_engine._docstrings.init_parameters.all_trasnformers import (
-    _missing_values_docstring,
-    _variables_categorical_docstring,
-)
+    _missing_values_docstring, _variables_categorical_docstring)
 from feature_engine._docstrings.init_parameters.encoders import (
-    _ignore_format_docstring,
-    _unseen_docstring,
-)
-from feature_engine._docstrings.methods import (
-    _fit_transform_docstring,
-    _inverse_transform_docstring,
-    _transform_encoders_docstring,
-)
+    _ignore_format_docstring, _unseen_docstring)
+from feature_engine._docstrings.methods import (_fit_transform_docstring,
+                                                _inverse_transform_docstring,
+                                                _transform_encoders_docstring)
 from feature_engine._docstrings.substitute import Substitution
 from feature_engine.dataframe_checks import check_X
 from feature_engine.encoding._helper_functions import check_parameter_unseen
-from feature_engine.encoding.base_encoder import (
-    CategoricalInitMixinNA,
-    CategoricalMethodsMixin,
-)
+from feature_engine.encoding.base_encoder import (CategoricalInitMixinNA,
+                                                  CategoricalMethodsMixin)
 
 _unseen_docstring = (
     _unseen_docstring
     + """ If `'encode'`, unseen categories will be encoded as 0 (zero)."""
+    + """ If `'warn'`, unseen categories will be encoded as NaN and a"""
+    + """ UserWarning is raised listing the unseen categories per variable."""
 )
 
 
@@ -166,7 +158,7 @@ def __init__(
                 f"Got {encoding_method} instead."
             )
 
-        check_parameter_unseen(unseen, ["ignore", "raise", "encode"])
+        check_parameter_unseen(unseen, ["ignore", "raise", "encode", "warn"])
         super().__init__(variables, missing_values, ignore_format)
         self.encoding_method = encoding_method
         self.unseen = unseen

diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py
@@ -1,33 +1,26 @@
 # Authors: Soledad Galli <solegalli@protonmail.com>
 # License: BSD 3 clause
 
+import warnings
 from typing import List, Optional, Union
 
 import pandas as pd
 
-from feature_engine._check_init_parameters.check_variables import (
-    _check_variables_input_value,
-)
+from feature_engine._check_init_parameters.check_variables import \
+    _check_variables_input_value
 from feature_engine._docstrings.fit_attributes import (
-    _feature_names_in_docstring,
-    _imputer_dict_docstring,
-    _n_features_in_docstring,
-    _variables_attribute_docstring,
-)
-from feature_engine._docstrings.methods import (
-    _fit_transform_docstring,
-    _transform_imputers_docstring,
-)
+    _feature_names_in_docstring, _imputer_dict_docstring,
+    _n_features_in_docstring, _variables_attribute_docstring)
+from feature_engine._docstrings.methods import (_fit_transform_docstring,
+                                                _transform_imputers_docstring)
 from feature_engine._docstrings.substitute import Substitution
 from feature_engine.dataframe_checks import check_X
 from feature_engine.imputation.base_imputer import BaseImputer
 from feature_engine.tags import _return_tags
-from feature_engine.variable_handling import (
-    check_all_variables,
-    check_categorical_variables,
-    find_all_variables,
-    find_categorical_variables,
-)
+from feature_engine.variable_handling import (check_all_variables,
+                                              check_categorical_variables,
+                                              find_all_variables,
+                                              find_categorical_variables)
 
 
 @Substitution(
@@ -88,6 +81,18 @@ class CategoricalImputer(BaseImputer):
         type object or categorical. If True, the imputer will select all variables or
         accept all variables entered by the user, including those cast as numeric.
 
+    errors : str, default='raise'
+        Indicates what to do when the selected imputation_method='frequent'
+        and a variable has more than 1 mode.
+
+        If 'raise', raises a ValueError and stops the fit.
+
+        If 'warn', issues a UserWarning and continues, imputing using the
+        first most frequent category found.
+
+        If 'ignore', continues without warnings, imputing using the first
+        most frequent category found.
+
     Attributes
     ----------
     {imputer_dict_}
@@ -135,6 +140,7 @@ def __init__(
         variables: Union[None, int, str, List[Union[str, int]]] = None,
         return_object: bool = False,
         ignore_format: bool = False,
+        errors: str = "raise",
     ) -> None:
         if imputation_method not in ["missing", "frequent"]:
             raise ValueError(
@@ -144,11 +150,18 @@ def __init__(
         if not isinstance(ignore_format, bool):
             raise ValueError("ignore_format takes only booleans True and False")
 
+        if errors not in ("raise", "warn", "ignore"):
+            raise ValueError(
+                "errors takes only values 'raise', 'warn', or 'ignore'. "
+                f"Got {errors} instead."
+            )
+
         self.imputation_method = imputation_method
         self.fill_value = fill_value
         self.variables = _check_variables_input_value(variables)
         self.return_object = return_object
         self.ignore_format = ignore_format
+        self.errors = errors
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -189,9 +202,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
 
                 # Some variables may contain more than 1 mode:
                 if len(mode_vals) > 1:
-                    raise ValueError(
-                        f"The variable {var} contains multiple frequent categories."
-                    )
+                    if self.errors == "raise":
+                        raise ValueError(
+                            f"The variable {var} contains multiple "
+                            f"frequent categories. Set errors='warn' or "
+                            f"errors='ignore' to allow imputation using "
+                            f"the first most frequent category found."
+                        )
+                    elif self.errors == "warn":
+                        warnings.warn(
+                            f"Variable {var} has multiple frequent "
+                            f"categories. The first category found, "
+                            f"{mode_vals[0]}, will be used for imputation.",
+                            UserWarning,
+                        )
 
                 self.imputer_dict_ = {var: mode_vals[0]}
 
@@ -208,10 +232,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
                         varnames_str = ", ".join(varnames)
                     else:
                         varnames_str = varnames[0]
-                    raise ValueError(
-                        f"The variable(s) {varnames_str} contain(s) multiple frequent "
-                        f"categories."
-                    )
+
+                    if self.errors == "raise":
+                        raise ValueError(
+                            f"The variable(s) {varnames_str} contain(s) "
+                            f"multiple frequent categories. Set "
+                            f"errors='warn' or errors='ignore' to allow "
+                            f"imputation using the first most frequent "
+                            f"category found."
+                        )
+                    elif self.errors == "warn":
+                        warnings.warn(
+                            f"Variable(s) {varnames_str} have multiple "
+                            f"frequent categories. The first category "
+                            f"found will be used for imputation.",
+                            UserWarning,
+                        )
 
                 self.imputer_dict_ = mode_vals.iloc[0].to_dict()
 

diff --git a/tests/test_encoding/test_count_frequency_encoder.py b/tests/test_encoding/test_count_frequency_encoder.py
@@ -6,6 +6,7 @@
 from sklearn.exceptions import NotFittedError
 
 from feature_engine.encoding import CountFrequencyEncoder
+from feature_engine.encoding._helper_functions import check_parameter_unseen
 
 
 # init parameters
@@ -237,7 +238,7 @@ def test_no_error_triggered_when_df_contains_unseen_categories_and_unseen_is_enc
         encoder.transform(df_enc_rare)
 
 
-@pytest.mark.parametrize("errors", ["raise", "ignore", "encode"])
+@pytest.mark.parametrize("errors", ["raise", "ignore", "encode", "warn"])
 def test_fit_raises_error_if_df_contains_na(errors, df_enc_na):
     # test case 4: when dataset contains na, fit method
     encoder = CountFrequencyEncoder(unseen=errors)
@@ -251,7 +252,7 @@ def test_fit_raises_error_if_df_contains_na(errors, df_enc_na):
     assert str(record.value) == msg
 
 
-@pytest.mark.parametrize("errors", ["raise", "ignore", "encode"])
+@pytest.mark.parametrize("errors", ["raise", "ignore", "encode", "warn"])
 def test_transform_raises_error_if_df_contains_na(errors, df_enc, df_enc_na):
     # test case 4: when dataset contains na, transform method
     encoder = CountFrequencyEncoder(unseen=errors)
@@ -476,3 +477,88 @@ def test_inverse_transform_raises_non_fitted_error():
     # Test when fit is not called prior to transform.
     with pytest.raises(NotFittedError):
         enc.inverse_transform(df1)
+
+
+# ---------------------------------------------------------------------------
+# Tests for unseen='warn'
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def train_test_dfs_warn():
+    X_train = pd.DataFrame({"color": ["red", "red", "blue", "green", "blue"]})
+    X_test = pd.DataFrame({"color": ["red", "blue", "yellow"]})  # 'yellow' unseen
+    return X_train, X_test
+
+
+def test_unseen_warn_emits_userwarning(train_test_dfs_warn):
+    """unseen='warn': UserWarning emitted for unseen categories."""
+    X_train, X_test = train_test_dfs_warn
+    encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn")
+    encoder.fit(X_train)
+    with pytest.warns(UserWarning, match="unseen categories"):
+        encoder.transform(X_test)
+
+
+def test_unseen_warn_encodes_as_nan(train_test_dfs_warn):
+    """unseen='warn': unseen categories should become NaN."""
+    X_train, X_test = train_test_dfs_warn
+    encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn")
+    encoder.fit(X_train)
+    with pytest.warns(UserWarning):
+        X_tr = encoder.transform(X_test)
+    # 'yellow' is unseen — should be NaN
+    assert pd.isna(X_tr.loc[X_tr.index[2], "color"])
+
+
+def test_unseen_warn_known_categories_encoded_correctly(train_test_dfs_warn):
+    """unseen='warn': known categories still encoded correctly."""
+    X_train, X_test = train_test_dfs_warn
+    encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn")
+    encoder.fit(X_train)
+    with pytest.warns(UserWarning):
+        X_tr = encoder.transform(X_test)
+    # 'red' appears 2 times in training
+    assert X_tr.loc[X_tr.index[0], "color"] == 2
+
+
+def test_unseen_warn_no_warning_when_no_unseen(train_test_dfs_warn):
+    """unseen='warn': no warning if all categories were seen during fit."""
+    X_train, _ = train_test_dfs_warn
+    X_test_seen = pd.DataFrame({"color": ["red", "blue"]})
+    encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn")
+    encoder.fit(X_train)
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")  # Fail if any warning raised
+        encoder.transform(X_test_seen)
+
+
+def test_unseen_invalid_value_raises():
+    """Invalid unseen value should raise ValueError at init."""
+    with pytest.raises(ValueError, match="takes only values"):
+        CountFrequencyEncoder(unseen="bad_value")
+
+
+# =============================================================================
+# Tests for check_parameter_unseen(): validation of `accepted_values`
+# =============================================================================
+
+def test_check_parameter_unseen_raises_when_accepted_values_is_not_a_list():
+    """
+    Covers the first raise ValueError in check_parameter_unseen():
+
+        if not isinstance(accepted_values, list) or not all(
+            isinstance(item, str) for item in accepted_values
+        ):
+            raise ValueError("accepted_values should be a list of strings ...")
+
+    check_parameter_unseen() is an internal helper. CountFrequencyEncoder always
+    calls it with a hardcoded valid list, so the guard is never triggered through
+    normal usage — it must be tested by importing and calling the function directly.
+    """
+    # accepted_values is not a list at all
+    with pytest.raises(ValueError, match="accepted_values should be a list of strings"):
+        check_parameter_unseen("raise", "raise")
+
+    # accepted_values is a list but contains a non-string element
+    with pytest.raises(ValueError, match="accepted_values should be a list of strings"):
+        check_parameter_unseen("raise", ["raise", "ignore", 42])
-Original file line number
+Diff line change
@@ Expand Up / @@ -86,6 +86,7 @@ celerybeat-schedule @@
     # Environments
     .env
     .venv
+    .venv_wsl
     env/
     venv/
     ENV/
@@ Expand Down @@