diff --git a/deepspeed/runtime/precision_config.py b/deepspeed/runtime/precision_config.py
index efec5c9d00c8..3336056c503b 100644
--- a/deepspeed/runtime/precision_config.py
+++ b/deepspeed/runtime/precision_config.py
@@ -4,7 +4,7 @@
 # DeepSpeed Team
 
 import math
-from pydantic import field_validator
+from pydantic import field_validator, model_validator
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 from .fp16.loss_scaler import (
     INITIAL_LOSS_SCALE,
@@ -160,6 +160,50 @@ def _validate_loss_scale(cls, v):
     Maintain master weights in optimizer state as fp16 instead of fp32 (valid with DeepSpeedCPUAdam only).
     """
 
+    @field_validator("loss_scale_window", "min_loss_scale", mode="before")
+    @classmethod
+    def _reject_non_integer_scale_params(cls, v, info):
+        """Reject bool, non-finite, and non-numeric inputs before pydantic coerces them.
+
+        Returns ``v`` unchanged when it passes; raises ``ValueError`` otherwise.
+        """
+        # Pydantic coerces bool to int (True -> 1, False -> 0) and floats to int,
+        # so a bool or non-finite value would silently pass the positivity check
+        # in _validate_dynamic_loss_scale_params. Reject those here before coercion.
+        field = f"fp16.{info.field_name}"
+        if isinstance(v, bool):
+            raise ValueError(f"{field} must be an integer, not bool")
+        if isinstance(v, float) and not math.isfinite(v):
+            raise ValueError(f"{field} must be a finite number (not inf/-inf/nan)")
+        try:
+            int(v)
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError: infinities that are not `float` instances, e.g.
+            # decimal.Decimal("Infinity"). `from None` drops the int() traceback.
+            raise ValueError(f"{field} must be an integer") from None
+        return v
+
+    @model_validator(mode="after")
+    def _validate_dynamic_loss_scale_params(self):
+        """Require loss_scale_window > 0 and min_loss_scale > 0 under dynamic loss scaling.
+
+        Returns ``self``; raises ``ValueError`` when either check fails.
+        """
+        # loss_scale_window and min_loss_scale only take effect when dynamic loss
+        # scaling is active, i.e. fp16 is enabled and loss_scale == 0 (see
+        # DeepSpeedEngine.dynamic_loss_scale). Validating them otherwise would
+        # reject valid static-loss-scale configs that carry unused values.
+        if self.enabled and self.loss_scale == 0:
+            # loss_scale_window is used as `stable_interval % scale_window` in
+            # DynamicLossScaler.update_scale, so 0 raises ZeroDivisionError.
+            if self.loss_scale_window <= 0:
+                raise ValueError(
+                    "fp16.loss_scale_window must be > 0 when dynamic loss scaling is enabled (loss_scale=0)")
+            # min_loss_scale is the loss-scale floor, which collapses if <= 0.
+            if self.min_loss_scale <= 0:
+                raise ValueError("fp16.min_loss_scale must be > 0 when dynamic loss scaling is enabled (loss_scale=0)")
+        return self
+
     def initial_dynamic_scale(self):
         return 2**self.initial_scale_power
 
diff --git a/tests/unit/runtime/test_precision_config_dynamic_scale.py b/tests/unit/runtime/test_precision_config_dynamic_scale.py
new file mode 100644
index 000000000000..344e3a36f11b
--- /dev/null
+++ b/tests/unit/runtime/test_precision_config_dynamic_scale.py
@@ -0,0 +1,62 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from decimal import Decimal
+
+import pytest
+from pydantic import ValidationError
+
+from deepspeed.runtime.precision_config import DeepSpeedFP16Config
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [0, -1])
+def test_fp16_dynamic_scale_rejects_nonpositive_when_dynamic(field, value):
+    # Dynamic loss scaling is active when fp16 is enabled and loss_scale == 0.
+    with pytest.raises(ValidationError):
+        DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value})
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [1, 1000])
+def test_fp16_dynamic_scale_accepts_positive_when_dynamic(field, value):
+    # Positive values are valid and must be stored unchanged.
+    cfg = DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value})
+    assert getattr(cfg, field) == value
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [0, -1])
+def test_fp16_dynamic_scale_ignored_with_static_loss_scale(field, value):
+    # With a static loss scale (loss_scale > 0) these fields are unused, so a
+    # non-positive value must not fail config construction (compatibility).
+    cfg = DeepSpeedFP16Config(enabled=True, loss_scale=128, **{field: value})
+    assert getattr(cfg, field) == value
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [0, -1])
+def test_fp16_dynamic_scale_ignored_when_fp16_disabled(field, value):
+    # When fp16 is disabled the dynamic scaling fields are unused.
+    cfg = DeepSpeedFP16Config(enabled=False, loss_scale=0, **{field: value})
+    assert getattr(cfg, field) == value
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [True, False])
+def test_fp16_dynamic_scale_rejects_bool(field, value):
+    # Pydantic coerces bool to int (True -> 1), which would otherwise slip past
+    # the positivity check. Bools must be rejected before coercion.
+    with pytest.raises(ValidationError):
+        DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value})
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [float("inf"), float("nan"), Decimal("Infinity"), "abc", None])
+def test_fp16_dynamic_scale_rejects_non_integer(field, value):
+    # Non-finite and non-numeric values must be rejected rather than coerced.
+    # Decimal("Infinity") is not a float, so it exercises the OverflowError path.
+    with pytest.raises(ValidationError):
+        DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value})