Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ celerybeat-schedule
# Environments
.env
.venv
.venv_wsl
env/
venv/
ENV/
Expand Down
2 changes: 2 additions & 0 deletions docs/whats_new/v_190.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ New transformers
Enhancements
~~~~~~~~~~~~

- Added `errors` parameter to `CategoricalImputer` to handle categorical variables with multiple frequent categories instead of automatically raising a `ValueError`. (`DirekKakkar <https://github.com/DirekKakkar>`_)
- Added ``unseen='warn'`` option to `CountFrequencyEncoder`: unseen categories are encoded as ``NaN`` and a ``UserWarning`` is issued listing the unseen categories per variable. (`DirekKakkar <https://github.com/DirekKakkar>`_)
- Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli <https://github.com/solegalli>`_)

BUG
Expand Down
44 changes: 26 additions & 18 deletions feature_engine/encoding/base_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,21 @@
from sklearn.utils.validation import check_is_fitted

from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin
from feature_engine._check_init_parameters.check_variables import (
_check_variables_input_value,
)
from feature_engine._check_init_parameters.check_variables import \
_check_variables_input_value
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_missing_values_docstring,
_variables_categorical_docstring,
)
from feature_engine._docstrings.init_parameters.encoders import _ignore_format_docstring
_missing_values_docstring, _variables_categorical_docstring)
from feature_engine._docstrings.init_parameters.encoders import \
_ignore_format_docstring
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import (
_check_optional_contains_na,
_check_X_matches_training_df,
check_X,
)
from feature_engine.dataframe_checks import (_check_optional_contains_na,
_check_X_matches_training_df,
check_X)
from feature_engine.tags import _return_tags
from feature_engine.variable_handling import (
check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables,
)
from feature_engine.variable_handling import (check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables)


@Substitution(
Expand Down Expand Up @@ -221,6 +215,18 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
def _encode(self, X: pd.DataFrame) -> pd.DataFrame:
# replace categories by the learned parameters
for feature in self.encoder_dict_.keys():
# Detect unseen categories BEFORE mapping so we can name them
if self.unseen == "warn":
unseen_cats = set(X[feature].dropna().unique()) - set(
self.encoder_dict_[feature].keys()
)
if unseen_cats:
warnings.warn(
f"Variable {feature!r} contains unseen categories: "
f"{unseen_cats}. These will be encoded as NaN.",
UserWarning,
)

X[feature] = X[feature].map(self.encoder_dict_[feature])

# if original variables are cast as categorical, they will remain
Expand Down Expand Up @@ -266,6 +272,8 @@ def _check_nan_values_after_transformation(self, X):
"During the encoding, NaN values were introduced in the feature(s) "
f"{nan_columns_str}."
)
# 'warn': per-variable warnings were already issued in _encode before
# the mapping, so nothing more to do here.

def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""Convert the encoded variable back to the original values.
Expand Down
32 changes: 12 additions & 20 deletions feature_engine/encoding/count_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,26 @@
import pandas as pd

from feature_engine._docstrings.fit_attributes import (
_feature_names_in_docstring,
_n_features_in_docstring,
_variables_attribute_docstring,
)
_feature_names_in_docstring, _n_features_in_docstring,
_variables_attribute_docstring)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_missing_values_docstring,
_variables_categorical_docstring,
)
_missing_values_docstring, _variables_categorical_docstring)
from feature_engine._docstrings.init_parameters.encoders import (
_ignore_format_docstring,
_unseen_docstring,
)
from feature_engine._docstrings.methods import (
_fit_transform_docstring,
_inverse_transform_docstring,
_transform_encoders_docstring,
)
_ignore_format_docstring, _unseen_docstring)
from feature_engine._docstrings.methods import (_fit_transform_docstring,
_inverse_transform_docstring,
_transform_encoders_docstring)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X
from feature_engine.encoding._helper_functions import check_parameter_unseen
from feature_engine.encoding.base_encoder import (
CategoricalInitMixinNA,
CategoricalMethodsMixin,
)
from feature_engine.encoding.base_encoder import (CategoricalInitMixinNA,
CategoricalMethodsMixin)

# Append the options that only this encoder accepts ('encode' and 'warn') to
# the imported description of the `unseen` parameter.
_unseen_docstring += (
    " If `'encode'`, unseen categories will be encoded as 0 (zero)."
    " If `'warn'`, unseen categories will be encoded as NaN and a"
    " UserWarning is raised listing the unseen categories per variable."
)


Expand Down Expand Up @@ -166,7 +158,7 @@ def __init__(
f"Got {encoding_method} instead."
)

check_parameter_unseen(unseen, ["ignore", "raise", "encode"])
check_parameter_unseen(unseen, ["ignore", "raise", "encode", "warn"])
super().__init__(variables, missing_values, ignore_format)
self.encoding_method = encoding_method
self.unseen = unseen
Expand Down
86 changes: 61 additions & 25 deletions feature_engine/imputation/categorical.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,26 @@
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

import warnings
from typing import List, Optional, Union

import pandas as pd

from feature_engine._check_init_parameters.check_variables import (
_check_variables_input_value,
)
from feature_engine._check_init_parameters.check_variables import \
_check_variables_input_value
from feature_engine._docstrings.fit_attributes import (
_feature_names_in_docstring,
_imputer_dict_docstring,
_n_features_in_docstring,
_variables_attribute_docstring,
)
from feature_engine._docstrings.methods import (
_fit_transform_docstring,
_transform_imputers_docstring,
)
_feature_names_in_docstring, _imputer_dict_docstring,
_n_features_in_docstring, _variables_attribute_docstring)
from feature_engine._docstrings.methods import (_fit_transform_docstring,
_transform_imputers_docstring)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X
from feature_engine.imputation.base_imputer import BaseImputer
from feature_engine.tags import _return_tags
from feature_engine.variable_handling import (
check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables,
)
from feature_engine.variable_handling import (check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables)


@Substitution(
Expand Down Expand Up @@ -88,6 +81,18 @@ class CategoricalImputer(BaseImputer):
type object or categorical. If True, the imputer will select all variables or
accept all variables entered by the user, including those cast as numeric.

errors : str, default='raise'
Indicates what to do when the selected imputation_method='frequent'
and a variable has more than 1 mode.

If 'raise', raises a ValueError and stops the fit.

If 'warn', raises a UserWarning and continues, imputing using the
first most frequent category found.

If 'ignore', continues without warnings, imputing using the first
most frequent category found.

Attributes
----------
{imputer_dict_}
Expand Down Expand Up @@ -135,6 +140,7 @@ def __init__(
variables: Union[None, int, str, List[Union[str, int]]] = None,
return_object: bool = False,
ignore_format: bool = False,
errors: str = "raise",
) -> None:
if imputation_method not in ["missing", "frequent"]:
raise ValueError(
Expand All @@ -144,11 +150,18 @@ def __init__(
if not isinstance(ignore_format, bool):
raise ValueError("ignore_format takes only booleans True and False")

if errors not in ("raise", "warn", "ignore"):
raise ValueError(
"errors takes only values 'raise', 'warn', or 'ignore'. "
f"Got {errors} instead."
)

self.imputation_method = imputation_method
self.fill_value = fill_value
self.variables = _check_variables_input_value(variables)
self.return_object = return_object
self.ignore_format = ignore_format
self.errors = errors

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Expand Down Expand Up @@ -189,9 +202,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):

# Some variables may contain more than 1 mode:
if len(mode_vals) > 1:
raise ValueError(
f"The variable {var} contains multiple frequent categories."
)
if self.errors == "raise":
raise ValueError(
f"The variable {var} contains multiple "
f"frequent categories. Set errors='warn' or "
f"errors='ignore' to allow imputation using "
f"the first most frequent category found."
)
elif self.errors == "warn":
warnings.warn(
f"Variable {var} has multiple frequent "
f"categories. The first category found, "
f"{mode_vals[0]}, will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = {var: mode_vals[0]}

Expand All @@ -208,10 +232,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
varnames_str = ", ".join(varnames)
else:
varnames_str = varnames[0]
raise ValueError(
f"The variable(s) {varnames_str} contain(s) multiple frequent "
f"categories."
)

if self.errors == "raise":
raise ValueError(
f"The variable(s) {varnames_str} contain(s) "
f"multiple frequent categories. Set "
f"errors='warn' or errors='ignore' to allow "
f"imputation using the first most frequent "
f"category found."
)
elif self.errors == "warn":
warnings.warn(
f"Variable(s) {varnames_str} have multiple "
f"frequent categories. The first category "
f"found will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = mode_vals.iloc[0].to_dict()

Expand Down
90 changes: 88 additions & 2 deletions tests/test_encoding/test_count_frequency_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from sklearn.exceptions import NotFittedError

from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.encoding._helper_functions import check_parameter_unseen


# init parameters
Expand Down Expand Up @@ -237,7 +238,7 @@ def test_no_error_triggered_when_df_contains_unseen_categories_and_unseen_is_enc
encoder.transform(df_enc_rare)


@pytest.mark.parametrize("errors", ["raise", "ignore", "encode"])
@pytest.mark.parametrize("errors", ["raise", "ignore", "encode", "warn"])
def test_fit_raises_error_if_df_contains_na(errors, df_enc_na):
# test case 4: when dataset contains na, fit method
encoder = CountFrequencyEncoder(unseen=errors)
Expand All @@ -251,7 +252,7 @@ def test_fit_raises_error_if_df_contains_na(errors, df_enc_na):
assert str(record.value) == msg


@pytest.mark.parametrize("errors", ["raise", "ignore", "encode"])
@pytest.mark.parametrize("errors", ["raise", "ignore", "encode", "warn"])
def test_transform_raises_error_if_df_contains_na(errors, df_enc, df_enc_na):
# test case 4: when dataset contains na, transform method
encoder = CountFrequencyEncoder(unseen=errors)
Expand Down Expand Up @@ -476,3 +477,88 @@ def test_inverse_transform_raises_non_fitted_error():
# Test when fit is not called prior to transform.
with pytest.raises(NotFittedError):
enc.inverse_transform(df1)


# ---------------------------------------------------------------------------
# Tests for unseen='warn'
# ---------------------------------------------------------------------------

@pytest.fixture
def train_test_dfs_warn():
    """Return a (train, test) pair of single-column dataframes.

    The test dataframe holds the category 'yellow', which is absent from the
    train dataframe.
    """
    seen_colors = ["red", "red", "blue", "green", "blue"]
    # 'yellow' never appears among the training colors.
    colors_with_unseen = ["red", "blue", "yellow"]
    return (
        pd.DataFrame({"color": seen_colors}),
        pd.DataFrame({"color": colors_with_unseen}),
    )


def test_unseen_warn_emits_userwarning(train_test_dfs_warn):
    """With unseen='warn', transforming unseen categories emits a UserWarning."""
    train_df, test_df = train_test_dfs_warn
    enc = CountFrequencyEncoder(encoding_method="count", unseen="warn")
    enc.fit(train_df)
    # The test dataframe contains a category that was not present at fit time.
    with pytest.warns(UserWarning, match="unseen categories"):
        enc.transform(test_df)


def test_unseen_warn_encodes_as_nan(train_test_dfs_warn):
    """With unseen='warn', unseen categories are encoded as NaN."""
    X_train, X_test = train_test_dfs_warn
    encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn")
    encoder.fit(X_train)
    # Match on the message so that an unrelated UserWarning cannot satisfy
    # the context manager.
    with pytest.warns(UserWarning, match="unseen categories"):
        X_tr = encoder.transform(X_test)
    # 'yellow' (third row of the test dataframe) was not seen during fit.
    assert pd.isna(X_tr["color"].iloc[2])


def test_unseen_warn_known_categories_encoded_correctly(train_test_dfs_warn):
    """With unseen='warn', categories seen during fit keep their encoding."""
    X_train, X_test = train_test_dfs_warn
    encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn")
    encoder.fit(X_train)
    # Match on the message so that an unrelated UserWarning cannot satisfy
    # the context manager.
    with pytest.warns(UserWarning, match="unseen categories"):
        X_tr = encoder.transform(X_test)
    # 'red' (first row of the test dataframe) appears twice in the train data.
    assert X_tr["color"].iloc[0] == 2


def test_unseen_warn_no_warning_when_no_unseen(train_test_dfs_warn):
    """With unseen='warn', no warning is emitted if every category was seen."""
    X_train = train_test_dfs_warn[0]
    only_seen = pd.DataFrame({"color": ["red", "blue"]})
    enc = CountFrequencyEncoder(encoding_method="count", unseen="warn")
    enc.fit(X_train)
    with warnings.catch_warnings():
        # Escalate warnings to exceptions so that any warning fails the test.
        warnings.simplefilter("error")
        enc.transform(only_seen)


def test_unseen_invalid_value_raises():
    """A value of `unseen` outside the accepted ones must fail at init."""
    invalid_unseen = "bad_value"
    with pytest.raises(ValueError, match="takes only values"):
        CountFrequencyEncoder(unseen=invalid_unseen)


# =============================================================================
# Tests for the input validation of check_parameter_unseen
# =============================================================================

def test_check_parameter_unseen_raises_when_accepted_values_is_not_a_list():
    """Exercise the `accepted_values` validation of check_parameter_unseen.

    The helper is called directly because CountFrequencyEncoder always passes
    it a hardcoded list of strings, so this ValueError is not reachable
    through the encoder itself.
    """
    expected_msg = "accepted_values should be a list of strings"
    invalid_accepted_values = (
        "raise",  # not a list at all
        ["raise", "ignore", 42],  # a list holding a non-string element
    )
    for accepted_values in invalid_accepted_values:
        with pytest.raises(ValueError, match=expected_msg):
            check_parameter_unseen("raise", accepted_values)
Loading