Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ celerybeat-schedule
# Environments
.env
.venv
.venv_wsl
env/
venv/
ENV/
Expand Down
1 change: 1 addition & 0 deletions docs/whats_new/v_190.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ New transformers
Enhancements
~~~~~~~~~~~~

- Added `errors` parameter to `CategoricalImputer` to handle categorical variables with multiple frequent categories instead of automatically raising a `ValueError`. (`DirekKakkar <https://github.com/DirekKakkar>`_)
- Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli <https://github.com/solegalli>`_)

BUG
Expand Down
86 changes: 61 additions & 25 deletions feature_engine/imputation/categorical.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,26 @@
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

import warnings
from typing import List, Optional, Union

import pandas as pd

from feature_engine._check_init_parameters.check_variables import (
_check_variables_input_value,
)
from feature_engine._check_init_parameters.check_variables import \
_check_variables_input_value
from feature_engine._docstrings.fit_attributes import (
_feature_names_in_docstring,
_imputer_dict_docstring,
_n_features_in_docstring,
_variables_attribute_docstring,
)
from feature_engine._docstrings.methods import (
_fit_transform_docstring,
_transform_imputers_docstring,
)
_feature_names_in_docstring, _imputer_dict_docstring,
_n_features_in_docstring, _variables_attribute_docstring)
from feature_engine._docstrings.methods import (_fit_transform_docstring,
_transform_imputers_docstring)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X
from feature_engine.imputation.base_imputer import BaseImputer
from feature_engine.tags import _return_tags
from feature_engine.variable_handling import (
check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables,
)
from feature_engine.variable_handling import (check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables)


@Substitution(
Expand Down Expand Up @@ -88,6 +81,18 @@ class CategoricalImputer(BaseImputer):
type object or categorical. If True, the imputer will select all variables or
accept all variables entered by the user, including those cast as numeric.

errors : str, default='raise'
Indicates what to do when the selected imputation_method='frequent'
and a variable has more than 1 mode.

If 'raise', raises a ValueError and stops the fit.

If 'warn', issues a UserWarning and continues fitting, imputing with the
first most frequent category found.

If 'ignore', continues without warnings, imputing using the first
most frequent category found.

Attributes
----------
{imputer_dict_}
Expand Down Expand Up @@ -135,6 +140,7 @@ def __init__(
variables: Union[None, int, str, List[Union[str, int]]] = None,
return_object: bool = False,
ignore_format: bool = False,
errors: str = "raise",
) -> None:
if imputation_method not in ["missing", "frequent"]:
raise ValueError(
Expand All @@ -144,11 +150,18 @@ def __init__(
if not isinstance(ignore_format, bool):
raise ValueError("ignore_format takes only booleans True and False")

if errors not in ("raise", "warn", "ignore"):
raise ValueError(
"errors takes only values 'raise', 'warn', or 'ignore'. "
f"Got {errors} instead."
)

self.imputation_method = imputation_method
self.fill_value = fill_value
self.variables = _check_variables_input_value(variables)
self.return_object = return_object
self.ignore_format = ignore_format
self.errors = errors

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Expand Down Expand Up @@ -189,9 +202,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):

# Some variables may contain more than 1 mode:
if len(mode_vals) > 1:
raise ValueError(
f"The variable {var} contains multiple frequent categories."
)
if self.errors == "raise":
raise ValueError(
f"The variable {var} contains multiple "
f"frequent categories. Set errors='warn' or "
f"errors='ignore' to allow imputation using "
f"the first most frequent category found."
)
elif self.errors == "warn":
warnings.warn(
f"Variable {var} has multiple frequent "
f"categories. The first category found, "
f"{mode_vals[0]}, will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = {var: mode_vals[0]}

Expand All @@ -208,10 +232,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
varnames_str = ", ".join(varnames)
else:
varnames_str = varnames[0]
raise ValueError(
f"The variable(s) {varnames_str} contain(s) multiple frequent "
f"categories."
)

if self.errors == "raise":
raise ValueError(
f"The variable(s) {varnames_str} contain(s) "
f"multiple frequent categories. Set "
f"errors='warn' or errors='ignore' to allow "
f"imputation using the first most frequent "
f"category found."
)
elif self.errors == "warn":
warnings.warn(
f"Variable(s) {varnames_str} have multiple "
f"frequent categories. The first category "
f"found will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = mode_vals.iloc[0].to_dict()

Expand Down
151 changes: 148 additions & 3 deletions tests/test_imputation/test_categorical_imputer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,23 @@
import warnings

import numpy as np
import pandas as pd
import pytest

from feature_engine.imputation import CategoricalImputer


# --- Shared fixture: perfectly multimodal variable ---
@pytest.fixture
def multimodal_df():
    """Return a two-column frame in which every category appears exactly twice.

    Because all three values in each column are equally frequent, both
    ``city`` and ``country`` have more than one mode.
    """
    tied_columns = {
        "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"],
        "country": ["UK", "UK", "FR", "FR", "DE", "DE"],
    }
    return pd.DataFrame(tied_columns)


def test_impute_with_string_missing_and_automatically_find_variables(df_na):
# set up transformer
imputer = CategoricalImputer(imputation_method="missing", variables=None)
Expand Down Expand Up @@ -150,14 +164,22 @@ def test_error_when_imputation_method_not_frequent_or_missing():


def test_error_when_variable_contains_multiple_modes(df_na):
msg = "The variable Name contains multiple frequent categories."
msg = (
"The variable Name contains multiple frequent categories. "
"Set errors='warn' or errors='ignore' to allow imputation "
"using the first most frequent category found."
)
imputer = CategoricalImputer(imputation_method="frequent", variables="Name")
with pytest.raises(ValueError) as record:
imputer.fit(df_na)
# check that error message matches
assert str(record.value) == msg

msg = "The variable(s) Name contain(s) multiple frequent categories."
msg = (
"The variable(s) Name contain(s) multiple frequent categories. "
"Set errors='warn' or errors='ignore' to allow imputation "
"using the first most frequent category found."
)
imputer = CategoricalImputer(imputation_method="frequent")
with pytest.raises(ValueError) as record:
imputer.fit(df_na)
Expand All @@ -166,7 +188,11 @@ def test_error_when_variable_contains_multiple_modes(df_na):

df_ = df_na.copy()
df_["Name_dup"] = df_["Name"]
msg = "The variable(s) Name, Name_dup contain(s) multiple frequent categories."
msg = (
"The variable(s) Name, Name_dup contain(s) multiple frequent categories. "
"Set errors='warn' or errors='ignore' to allow imputation "
"using the first most frequent category found."
)
imputer = CategoricalImputer(imputation_method="frequent")
with pytest.raises(ValueError) as record:
imputer.fit(df_)
Expand Down Expand Up @@ -305,3 +331,122 @@ def test_error_when_ignore_format_is_not_boolean(ignore_format):

# check that error message matches
assert str(record.value) == msg


def test_errors_raise_on_multimodal_is_default(multimodal_df):
    """Without an explicit ``errors`` argument, tied modes abort the fit."""
    # ``errors`` is deliberately omitted so the constructor default is used.
    transformer = CategoricalImputer(imputation_method="frequent")
    expected_fragment = "multiple frequent categories"
    with pytest.raises(ValueError, match=expected_fragment):
        transformer.fit(multimodal_df)


def test_errors_warn_emits_userwarning(multimodal_df):
    """With errors='warn', fitting tied modes issues a UserWarning."""
    transformer = CategoricalImputer(imputation_method="frequent", errors="warn")
    expected_fragment = "multiple frequent categories"
    with pytest.warns(UserWarning, match=expected_fragment):
        transformer.fit(multimodal_df)


def test_errors_warn_uses_first_mode(multimodal_df):
    """With errors='warn', the learned value for ``city`` is its first mode."""
    transformer = CategoricalImputer(imputation_method="frequent", errors="warn")
    # The warning itself is expected here; only the learned value is checked.
    with pytest.warns(UserWarning):
        transformer.fit(multimodal_df)
    first_mode = multimodal_df["city"].mode()[0]
    learned_value = transformer.imputer_dict_["city"]
    assert learned_value == first_mode


def test_errors_ignore_no_warning_raised(multimodal_df):
    """With errors='ignore', fitting tied modes emits no warning at all."""
    transformer = CategoricalImputer(imputation_method="frequent", errors="ignore")
    with warnings.catch_warnings():
        # Any warning issued inside this block becomes an exception, so a
        # clean exit proves that fit() stayed silent.
        warnings.simplefilter("error")
        transformer.fit(multimodal_df)
    first_mode = multimodal_df["city"].mode()[0]
    assert transformer.imputer_dict_["city"] == first_mode


def test_errors_invalid_value_raises():
    """An unsupported ``errors`` value is rejected by the constructor."""
    init_kwargs = {"imputation_method": "frequent", "errors": "bad_value"}
    with pytest.raises(ValueError, match="errors takes only values"):
        CategoricalImputer(**init_kwargs)


def test_errors_param_ignored_when_imputation_method_is_missing():
    """With imputation_method='missing', errors='warn' triggers no warning."""
    frame = pd.DataFrame({"city": ["London", np.nan, "Paris"]})
    transformer = CategoricalImputer(imputation_method="missing", errors="warn")
    with warnings.catch_warnings():
        # Escalate every warning to an exception: the fit must complete
        # cleanly even though errors='warn' was requested.
        warnings.simplefilter("error")
        transformer.fit(frame)


def test_errors_ignore_single_variable():
    """errors='ignore' on a single multimodal variable: silent, uses first mode.

    The frame has one column whose three categories are equally frequent.
    The test checks both halves of the contract: no UserWarning is issued
    during fit, and the learned value is the first mode of the column.
    """
    X = pd.DataFrame(
        {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]}
    )
    imputer = CategoricalImputer(imputation_method="frequent", errors="ignore")
    # Escalate UserWarning to an exception so the "silent" claim is actually
    # enforced; previously the test passed even if fit() had warned.
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        imputer.fit(X)
    assert imputer.imputer_dict_["city"] == X["city"].mode()[0]


def test_errors_ignore_multiple_variables():
    """errors='ignore' on several multimodal variables: silent, uses first mode.

    Both columns have three equally frequent categories. The test checks
    that no UserWarning is issued during fit and that each learned value is
    the first mode of its column.
    """
    X = pd.DataFrame(
        {
            "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"],
            "country": ["UK", "UK", "FR", "FR", "DE", "DE"],
        }
    )
    imputer = CategoricalImputer(imputation_method="frequent", errors="ignore")
    # Escalate UserWarning to an exception so the "silent" claim is actually
    # enforced; previously the test passed even if fit() had warned.
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        imputer.fit(X)
    for column in ("city", "country"):
        assert imputer.imputer_dict_[column] == X[column].mode()[0]


# =============================================================================
# Coverage tests for the `errors` parameter: the single-variable warn branch
# and the singular-name branch of the multi-variable error message.
# =============================================================================

def test_errors_warn_single_variable_emits_userwarning():
    """
    With errors='warn' and a single selected variable, a UserWarning is issued.

    The imputer is restricted to ``variables='city'`` so that fit() handles
    exactly one variable, in contrast to tests that fit the two-column
    ``multimodal_df`` fixture. After the warning, the learned value must be
    the first mode of the column.
    """
    cities = ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]
    frame = pd.DataFrame({"city": cities})
    transformer = CategoricalImputer(
        imputation_method="frequent", variables="city", errors="warn"
    )
    expected_fragment = "multiple frequent categories"
    with pytest.warns(UserWarning, match=expected_fragment):
        transformer.fit(frame)
    # The first mode is the value kept for imputation.
    first_mode = frame["city"].mode()[0]
    assert transformer.imputer_dict_["city"] == first_mode


def test_errors_raise_one_multimodal_among_multiple_variables():
    """
    Several variables are fitted but only one of them has multiple modes.

    With two variables selected and only 'city' tied, the error message must
    name 'city' alone. The pattern pins the full
    "The variable(s) city contain(s) ..." phrasing, because a bare
    ``match="city"`` would also accept a message listing both variables and
    so would not prove that only the multimodal variable is reported.
    """
    X = pd.DataFrame(
        {
            # 'city': three equally frequent values, hence multimodal
            "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"],
            # 'country': one clear mode (UK appears 3 times, the others once)
            "country": ["UK", "UK", "UK", "FR", "DE", "SE"],
        }
    )
    imputer = CategoricalImputer(imputation_method="frequent", errors="raise")
    # pytest applies `match` with re.search, so the literal parentheses in
    # the message have to be escaped.
    expected = r"The variable\(s\) city contain\(s\) multiple frequent categories"
    with pytest.raises(ValueError, match=expected):
        imputer.fit(X)