feature-engine · direkkakkar319-ops · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026
diff --git a/.gitignore b/.gitignore
@@ -86,6 +86,7 @@ celerybeat-schedule
 # Environments
 .env
 .venv
+.venv_wsl
 env/
 venv/
 ENV/

diff --git a/docs/whats_new/v_190.rst b/docs/whats_new/v_190.rst
@@ -53,6 +53,7 @@ New transformers
 Enhancements
 ~~~~~~~~~~~~
 
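+- Added the `errors` parameter to `CategoricalImputer` to control what happens when a variable has multiple most frequent categories and `imputation_method='frequent'`: `'raise'` (default) raises a `ValueError`, whereas `'warn'` and `'ignore'` impute with the first most frequent category found. (`DirekKakkar <https://github.com/DirekKakkar>`_)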
 - Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli <https://github.com/solegalli>`_)
 
 BUG

diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py
@@ -1,33 +1,26 @@
 # Authors: Soledad Galli <solegalli@protonmail.com>
 # License: BSD 3 clause
 
+import warnings
 from typing import List, Optional, Union
 
 import pandas as pd
 
-from feature_engine._check_init_parameters.check_variables import (
-    _check_variables_input_value,
-)
+from feature_engine._check_init_parameters.check_variables import \
+    _check_variables_input_value
 from feature_engine._docstrings.fit_attributes import (
-    _feature_names_in_docstring,
-    _imputer_dict_docstring,
-    _n_features_in_docstring,
-    _variables_attribute_docstring,
-)
-from feature_engine._docstrings.methods import (
-    _fit_transform_docstring,
-    _transform_imputers_docstring,
-)
+    _feature_names_in_docstring, _imputer_dict_docstring,
+    _n_features_in_docstring, _variables_attribute_docstring)
+from feature_engine._docstrings.methods import (_fit_transform_docstring,
+                                                _transform_imputers_docstring)
 from feature_engine._docstrings.substitute import Substitution
 from feature_engine.dataframe_checks import check_X
 from feature_engine.imputation.base_imputer import BaseImputer
 from feature_engine.tags import _return_tags
-from feature_engine.variable_handling import (
-    check_all_variables,
-    check_categorical_variables,
-    find_all_variables,
-    find_categorical_variables,
-)
+from feature_engine.variable_handling import (check_all_variables,
+                                              check_categorical_variables,
+                                              find_all_variables,
+                                              find_categorical_variables)
 
 
 @Substitution(
@@ -88,6 +81,18 @@ class CategoricalImputer(BaseImputer):
         type object or categorical. If True, the imputer will select all variables or
         accept all variables entered by the user, including those cast as numeric.
 
+    errors : str, default='raise'
+        Indicates what to do when the selected imputation_method='frequent'
+        and a variable has more than 1 mode.
+
+        If 'raise', raises a ValueError and stops the fit.
+
+        If 'warn', issues a UserWarning and continues, imputing using the
+        first most frequent category found.
+
+        If 'ignore', continues without warnings, imputing using the first
+        most frequent category found.
+
     Attributes
     ----------
     {imputer_dict_}
@@ -135,6 +140,7 @@ def __init__(
         variables: Union[None, int, str, List[Union[str, int]]] = None,
         return_object: bool = False,
         ignore_format: bool = False,
+        errors: str = "raise",
     ) -> None:
         if imputation_method not in ["missing", "frequent"]:
             raise ValueError(
@@ -144,11 +150,18 @@ def __init__(
         if not isinstance(ignore_format, bool):
             raise ValueError("ignore_format takes only booleans True and False")
 
+        if errors not in ("raise", "warn", "ignore"):
+            raise ValueError(
+                "errors takes only values 'raise', 'warn', or 'ignore'. "
+                f"Got {errors} instead."
+            )
+
         self.imputation_method = imputation_method
         self.fill_value = fill_value
         self.variables = _check_variables_input_value(variables)
         self.return_object = return_object
         self.ignore_format = ignore_format
+        self.errors = errors
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -189,9 +202,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
 
                 # Some variables may contain more than 1 mode:
                 if len(mode_vals) > 1:
-                    raise ValueError(
-                        f"The variable {var} contains multiple frequent categories."
-                    )
+                    if self.errors == "raise":
+                        raise ValueError(
+                            f"The variable {var} contains multiple "
+                            f"frequent categories. Set errors='warn' or "
+                            f"errors='ignore' to allow imputation using "
+                            f"the first most frequent category found."
+                        )
+                    elif self.errors == "warn":
+                        warnings.warn(
+                            f"Variable {var} has multiple frequent "
+                            f"categories. The first category found, "
+                            f"{mode_vals[0]}, will be used for imputation.",
+                            UserWarning,
+                        )
 
                 self.imputer_dict_ = {var: mode_vals[0]}
 
@@ -208,10 +232,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
                         varnames_str = ", ".join(varnames)
                     else:
                         varnames_str = varnames[0]
-                    raise ValueError(
-                        f"The variable(s) {varnames_str} contain(s) multiple frequent "
-                        f"categories."
-                    )
+
+                    if self.errors == "raise":
+                        raise ValueError(
+                            f"The variable(s) {varnames_str} contain(s) "
+                            f"multiple frequent categories. Set "
+                            f"errors='warn' or errors='ignore' to allow "
+                            f"imputation using the first most frequent "
+                            f"category found."
+                        )
+                    elif self.errors == "warn":
+                        warnings.warn(
+                            f"Variable(s) {varnames_str} have multiple "
+                            f"frequent categories. The first category "
+                            f"found will be used for imputation.",
+                            UserWarning,
+                        )
 
                 self.imputer_dict_ = mode_vals.iloc[0].to_dict()
 

diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py
@@ -1,9 +1,23 @@
+import warnings
+
+import numpy as np
 import pandas as pd
 import pytest
 
 from feature_engine.imputation import CategoricalImputer
 
 
+# Shared fixture: both columns have three equally frequent categories (multimodal)
+@pytest.fixture
+def multimodal_df():
+    return pd.DataFrame(
+        {
+            "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"],
+            "country": ["UK", "UK", "FR", "FR", "DE", "DE"],
+        }
+    )
+
+
 def test_impute_with_string_missing_and_automatically_find_variables(df_na):
     # set up transformer
     imputer = CategoricalImputer(imputation_method="missing", variables=None)
@@ -150,14 +164,22 @@ def test_error_when_imputation_method_not_frequent_or_missing():
 
 
 def test_error_when_variable_contains_multiple_modes(df_na):
-    msg = "The variable Name contains multiple frequent categories."
+    msg = (
+        "The variable Name contains multiple frequent categories. "
+        "Set errors='warn' or errors='ignore' to allow imputation "
+        "using the first most frequent category found."
+    )
     imputer = CategoricalImputer(imputation_method="frequent", variables="Name")
     with pytest.raises(ValueError) as record:
         imputer.fit(df_na)
     # check that error message matches
     assert str(record.value) == msg
 
-    msg = "The variable(s) Name contain(s) multiple frequent categories."
+    msg = (
+        "The variable(s) Name contain(s) multiple frequent categories. "
+        "Set errors='warn' or errors='ignore' to allow imputation "
+        "using the first most frequent category found."
+    )
     imputer = CategoricalImputer(imputation_method="frequent")
     with pytest.raises(ValueError) as record:
         imputer.fit(df_na)
@@ -166,7 +188,11 @@ def test_error_when_variable_contains_multiple_modes(df_na):
 
     df_ = df_na.copy()
     df_["Name_dup"] = df_["Name"]
-    msg = "The variable(s) Name, Name_dup contain(s) multiple frequent categories."
+    msg = (
+        "The variable(s) Name, Name_dup contain(s) multiple frequent categories. "
+        "Set errors='warn' or errors='ignore' to allow imputation "
+        "using the first most frequent category found."
+    )
     imputer = CategoricalImputer(imputation_method="frequent")
     with pytest.raises(ValueError) as record:
         imputer.fit(df_)
@@ -305,3 +331,122 @@ def test_error_when_ignore_format_is_not_boolean(ignore_format):
 
     # check that error message matches
     assert str(record.value) == msg
+
+
+def test_errors_raise_on_multimodal_is_default(multimodal_df):
+    """Default behaviour: raise ValueError on multimodal variable."""
+    imputer = CategoricalImputer(imputation_method="frequent")
+    with pytest.raises(ValueError, match="multiple frequent categories"):
+        imputer.fit(multimodal_df)
+
+
+def test_errors_warn_emits_userwarning(multimodal_df):
+    """errors='warn': UserWarning must be emitted."""
+    imputer = CategoricalImputer(imputation_method="frequent", errors="warn")
+    with pytest.warns(UserWarning, match="multiple frequent categories"):
+        imputer.fit(multimodal_df)
+
+
+def test_errors_warn_uses_first_mode(multimodal_df):
+    """errors='warn': imputer_dict_ should contain the first mode."""
+    imputer = CategoricalImputer(imputation_method="frequent", errors="warn")
+    with pytest.warns(UserWarning):
+        imputer.fit(multimodal_df)
+    expected = multimodal_df["city"].mode()[0]
+    assert imputer.imputer_dict_["city"] == expected
+
+
+def test_errors_ignore_no_warning_raised(multimodal_df):
+    """errors='ignore': no warnings should be emitted."""
+    imputer = CategoricalImputer(imputation_method="frequent", errors="ignore")
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")  # Promote all warnings to errors
+        imputer.fit(multimodal_df)  # Should NOT raise
+    assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0]
+
+
+def test_errors_invalid_value_raises():
+    """Passing an unsupported value for errors should raise ValueError at init."""
+    with pytest.raises(ValueError, match="errors takes only values"):
+        CategoricalImputer(imputation_method="frequent", errors="bad_value")
+
+
+def test_errors_param_ignored_when_imputation_method_is_missing():
+    """errors param has no effect for imputation_method='missing'."""
+    df = pd.DataFrame({"city": ["London", np.nan, "Paris"]})
+    imputer = CategoricalImputer(imputation_method="missing", errors="warn")
+    # Should fit without warnings since there's no mode computation
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        imputer.fit(df)
+
+
+def test_errors_ignore_single_variable():
+    """errors='ignore' on single multimodal variable — silent, uses first mode."""
+    X = pd.DataFrame(
+        {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]}
+    )
+    imputer = CategoricalImputer(imputation_method="frequent", errors="ignore")
+    imputer.fit(X)
+    assert imputer.imputer_dict_["city"] == X["city"].mode()[0]
+
+
+def test_errors_ignore_multiple_variables():
+    """errors='ignore' on multiple multimodal variables — silent, uses first mode."""
+    X = pd.DataFrame(
+        {
+            "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"],
+            "country": ["UK", "UK", "FR", "FR", "DE", "DE"],
+        }
+    )
+    imputer = CategoricalImputer(imputation_method="frequent", errors="ignore")
+    imputer.fit(X)
+    assert imputer.imputer_dict_["city"] == X["city"].mode()[0]
+    assert imputer.imputer_dict_["country"] == X["country"].mode()[0]
+
+
+# =============================================================================
+# Branch-coverage tests: the single-variable 'warn' path and the multi-variable
+# 'raise' path when only one of the selected variables is multimodal
+# =============================================================================
+
+def test_errors_warn_single_variable_emits_userwarning():
+    """
+    Covers the warnings.warn() inside the SINGLE-VARIABLE block of fit().
+
+    The existing test_errors_warn_emits_userwarning uses multimodal_df (2 columns),
+    which goes through the multi-variable code path. This test uses variables='city'
+    (a single variable) to hit the separate single-variable warn branch.
+    """
+    X = pd.DataFrame(
+        {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]}
+    )
+    imputer = CategoricalImputer(
+        imputation_method="frequent", variables="city", errors="warn"
+    )
+    with pytest.warns(UserWarning, match="multiple frequent categories"):
+        imputer.fit(X)
+    # First mode is used
+    assert imputer.imputer_dict_["city"] == X["city"].mode()[0]
+
+
+def test_errors_raise_one_multimodal_among_multiple_variables():
+    """
+    Covers the `varnames_str = varnames[0]` else-branch in the MULTI-VARIABLE block.
+
+    This branch is reached when multiple variables are selected but only ONE of them
+    turns out to have multiple modes. Here we select two variables where only 'city'
+    is multimodal, so the singular else-branch runs before the ValueError is raised.
+    The same branch is also exercised with the df_na fixture in
+    test_error_when_variable_contains_multiple_modes; this test uses inline data so
+    that the multimodal and unimodal columns are explicit.
+    """
+    X = pd.DataFrame(
+        {
+            # 'city': 3 equally frequent values → multimodal
+            "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"],
+            # 'country': clear single mode (UK appears 3×, others once)
+            "country": ["UK", "UK", "UK", "FR", "DE", "SE"],
+        }
+    )
+    imputer = CategoricalImputer(imputation_method="frequent", errors="raise")
+    with pytest.raises(ValueError, match="city"):
+        imputer.fit(X)
-Original file line number
+Diff line change
@@ Expand Up / @@ -86,6 +86,7 @@ celerybeat-schedule @@
     # Environments
     .env
     .venv
+    .venv_wsl
     env/
     venv/
     ENV/
@@ Expand Down @@