From 68eae0263073dfe2ec9a7c2f3ae06406506b39b1 Mon Sep 17 00:00:00 2001 From: Aryan Date: Sat, 6 Jun 2026 15:20:31 -0400 Subject: [PATCH 1/3] Validate fp16 dynamic loss scaling parameters are positive loss_scale_window and min_loss_scale drive dynamic loss scaling but are not validated, so invalid values silently initialize and fail later: - loss_scale_window is used as `stable_interval % scale_window` in DynamicLossScaler.update_scale, so a value of 0 raises ZeroDivisionError during training. - min_loss_scale is the loss-scale floor; a value <= 0 collapses dynamic loss scaling. Add a Pydantic `mode="before"` field validator to DeepSpeedFP16Config that rejects bool, non-numeric, non-finite (inf/-inf/nan), and non-positive values for both fields, raising a clear ValidationError. This follows the same pattern as the fp16.loss_scale validation added in #7889. Add unit tests covering invalid values (0, -1, inf, nan, True, [], {}) and valid values for both fields. Signed-off-by: Aryan --- deepspeed/runtime/precision_config.py | 20 ++++++++++++ .../test_precision_config_dynamic_scale.py | 31 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 tests/unit/runtime/test_precision_config_dynamic_scale.py diff --git a/deepspeed/runtime/precision_config.py b/deepspeed/runtime/precision_config.py index efec5c9d00c8..02f67113939f 100644 --- a/deepspeed/runtime/precision_config.py +++ b/deepspeed/runtime/precision_config.py @@ -130,6 +130,26 @@ def _validate_loss_scale(cls, v): raise ValueError("fp16.loss_scale must be >= 0 (0 enables dynamic loss scaling)") return v + @field_validator("loss_scale_window", "min_loss_scale", mode="before") + @classmethod + def _validate_positive_dynamic_scale_param(cls, v, info): + # Both parameters drive dynamic loss scaling and must be strictly positive. + # loss_scale_window is used as `stable_interval % scale_window` in + # DynamicLossScaler.update_scale, so a value of 0 raises ZeroDivisionError, + # and min_loss_scale is the loss-scale floor, which collapses if <= 0. + name = info.field_name + if isinstance(v, bool): + raise ValueError(f"fp16.{name} must be a number, not bool") + try: + number = float(v) + except (TypeError, ValueError): + raise ValueError(f"fp16.{name} must be a number") + if not math.isfinite(number): + raise ValueError(f"fp16.{name} must be a finite number (not inf/-inf/nan)") + if number <= 0: + raise ValueError(f"fp16.{name} must be > 0") + return v + initial_scale_power: int = 16 """ For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}. diff --git a/tests/unit/runtime/test_precision_config_dynamic_scale.py b/tests/unit/runtime/test_precision_config_dynamic_scale.py new file mode 100644 index 000000000000..533996bf59d3 --- /dev/null +++ b/tests/unit/runtime/test_precision_config_dynamic_scale.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +from pydantic import ValidationError + +from deepspeed.runtime.precision_config import DeepSpeedFP16Config + + +@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"]) +@pytest.mark.parametrize("value", [0, -1, float("inf"), float("nan"), True]) +def test_fp16_dynamic_scale_rejects_invalid_values(field, value): + with pytest.raises(ValidationError): + DeepSpeedFP16Config(**{field: value}) + + +@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"]) +@pytest.mark.parametrize("value", [1, 1000, "2"]) +def test_fp16_dynamic_scale_accepts_valid_values(field, value): + cfg = DeepSpeedFP16Config(**{field: value}) + assert getattr(cfg, field) > 0 + + +@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"]) +@pytest.mark.parametrize("value", [[], {}]) +def test_fp16_dynamic_scale_invalid_type_has_clear_error(field, value): + with pytest.raises(ValidationError) as excinfo: + DeepSpeedFP16Config(**{field: value}) + assert "must be a number" in str(excinfo.value) From 27af87920705605378ac42d358e4aa7e43325cf0 Mon Sep 17 00:00:00 2001 From: Aryan Date: Sat, 6 Jun 2026 15:26:36 -0400 Subject: [PATCH 2/3] Gate fp16 dynamic-scale validation on dynamic loss scaling Address review: loss_scale_window and min_loss_scale only take effect when dynamic loss scaling is active (fp16 enabled and loss_scale == 0, per DeepSpeedEngine.dynamic_loss_scale). Validating them unconditionally rejected otherwise-valid static-loss-scale configs that carry unused values like 0. Replace the per-field validator with a model_validator(mode="after") that checks loss_scale_window > 0 and min_loss_scale > 0 only when fp16 is enabled and loss_scale == 0. Update tests to cover the static and fp16-disabled cases where these fields are ignored. Signed-off-by: Aryan --- deepspeed/runtime/precision_config.py | 39 +++++++++---------- .../test_precision_config_dynamic_scale.py | 32 +++++++++------ 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/deepspeed/runtime/precision_config.py b/deepspeed/runtime/precision_config.py index 02f67113939f..6daaca7f89d1 100644 --- a/deepspeed/runtime/precision_config.py +++ b/deepspeed/runtime/precision_config.py @@ -4,7 +4,7 @@ # DeepSpeed Team import math -from pydantic import field_validator +from pydantic import field_validator, model_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel from .fp16.loss_scaler import ( INITIAL_LOSS_SCALE, @@ -130,26 +130,6 @@ def _validate_loss_scale(cls, v): raise ValueError("fp16.loss_scale must be >= 0 (0 enables dynamic loss scaling)") return v - @field_validator("loss_scale_window", "min_loss_scale", mode="before") - @classmethod - def _validate_positive_dynamic_scale_param(cls, v, info): - # Both parameters drive dynamic loss scaling and must be strictly positive. - # loss_scale_window is used as `stable_interval % scale_window` in - # DynamicLossScaler.update_scale, so a value of 0 raises ZeroDivisionError, - # and min_loss_scale is the loss-scale floor, which collapses if <= 0. - name = info.field_name - if isinstance(v, bool): - raise ValueError(f"fp16.{name} must be a number, not bool") - try: - number = float(v) - except (TypeError, ValueError): - raise ValueError(f"fp16.{name} must be a number") - if not math.isfinite(number): - raise ValueError(f"fp16.{name} must be a finite number (not inf/-inf/nan)") - if number <= 0: - raise ValueError(f"fp16.{name} must be > 0") - return v - initial_scale_power: int = 16 """ For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}. @@ -180,6 +160,23 @@ def _validate_positive_dynamic_scale_param(cls, v, info): Maintain master weights in optimizer state as fp16 instead of fp32 (valid with DeepSpeedCPUAdam only). """ + @model_validator(mode="after") + def _validate_dynamic_loss_scale_params(self): + # loss_scale_window and min_loss_scale only take effect when dynamic loss + # scaling is active, i.e. fp16 is enabled and loss_scale == 0 (see + # DeepSpeedEngine.dynamic_loss_scale). Validating them otherwise would + # reject valid static-loss-scale configs that carry unused values. + if self.enabled and self.loss_scale == 0: + # loss_scale_window is used as `stable_interval % scale_window` in + # DynamicLossScaler.update_scale, so 0 raises ZeroDivisionError. + if self.loss_scale_window <= 0: + raise ValueError( + "fp16.loss_scale_window must be > 0 when dynamic loss scaling is enabled (loss_scale=0)") + # min_loss_scale is the loss-scale floor, which collapses if <= 0. + if self.min_loss_scale <= 0: + raise ValueError("fp16.min_loss_scale must be > 0 when dynamic loss scaling is enabled (loss_scale=0)") + return self + def initial_dynamic_scale(self): return 2**self.initial_scale_power diff --git a/tests/unit/runtime/test_precision_config_dynamic_scale.py b/tests/unit/runtime/test_precision_config_dynamic_scale.py index 533996bf59d3..b828297d6f37 100644 --- a/tests/unit/runtime/test_precision_config_dynamic_scale.py +++ b/tests/unit/runtime/test_precision_config_dynamic_scale.py @@ -10,22 +10,32 @@ @pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"]) -@pytest.mark.parametrize("value", [0, -1, float("inf"), float("nan"), True]) -def test_fp16_dynamic_scale_rejects_invalid_values(field, value): +@pytest.mark.parametrize("value", [0, -1]) +def test_fp16_dynamic_scale_rejects_nonpositive_when_dynamic(field, value): + # Dynamic loss scaling is active when fp16 is enabled and loss_scale == 0. with pytest.raises(ValidationError): - DeepSpeedFP16Config(**{field: value}) + DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value}) @pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"]) -@pytest.mark.parametrize("value", [1, 1000, "2"]) -def test_fp16_dynamic_scale_accepts_valid_values(field, value): - cfg = DeepSpeedFP16Config(**{field: value}) +@pytest.mark.parametrize("value", [1, 1000]) +def test_fp16_dynamic_scale_accepts_positive_when_dynamic(field, value): + cfg = DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value}) assert getattr(cfg, field) > 0 @pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"]) -@pytest.mark.parametrize("value", [[], {}]) -def test_fp16_dynamic_scale_invalid_type_has_clear_error(field, value): - with pytest.raises(ValidationError) as excinfo: - DeepSpeedFP16Config(**{field: value}) - assert "must be a number" in str(excinfo.value) +@pytest.mark.parametrize("value", [0, -1]) +def test_fp16_dynamic_scale_ignored_with_static_loss_scale(field, value): + # With a static loss scale (loss_scale > 0) these fields are unused, so a + # non-positive value must not fail config construction (compatibility). + cfg = DeepSpeedFP16Config(enabled=True, loss_scale=128, **{field: value}) + assert getattr(cfg, field) == value + + +@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"]) +@pytest.mark.parametrize("value", [0, -1]) +def test_fp16_dynamic_scale_ignored_when_fp16_disabled(field, value): + # When fp16 is disabled the dynamic scaling fields are unused. + cfg = DeepSpeedFP16Config(enabled=False, loss_scale=0, **{field: value}) + assert getattr(cfg, field) == value From 427695fa67c55ffbd493fee8e4ecb35ab05ed6c4 Mon Sep 17 00:00:00 2001 From: Aryan Date: Sun, 21 Jun 2026 12:42:14 -0400 Subject: [PATCH 3/3] Reject bool and non-finite values for fp16 dynamic-scale params Pydantic coerces bool to int (True -> 1) and floats to int, so values like loss_scale_window=True or min_loss_scale=inf would silently pass the positivity check in _validate_dynamic_loss_scale_params. Add a before field validator that rejects bool, non-finite, and non-numeric values before coercion, mirroring the existing loss_scale validator. Addresses @tohtana review feedback. Signed-off-by: Aryan --- deepspeed/runtime/precision_config.py | 17 +++++++++++++++++ .../test_precision_config_dynamic_scale.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/deepspeed/runtime/precision_config.py b/deepspeed/runtime/precision_config.py index 6daaca7f89d1..3336056c503b 100644 --- a/deepspeed/runtime/precision_config.py +++ b/deepspeed/runtime/precision_config.py @@ -160,6 +160,23 @@ def _validate_loss_scale(cls, v): Maintain master weights in optimizer state as fp16 instead of fp32 (valid with DeepSpeedCPUAdam only). """ + @field_validator("loss_scale_window", "min_loss_scale", mode="before") + @classmethod + def _reject_non_integer_scale_params(cls, v, info): + # Pydantic coerces bool to int (True -> 1, False -> 0) and floats to int, + # so a bool or non-finite value would silently pass the positivity check + # in _validate_dynamic_loss_scale_params. Reject those here before coercion. + field = f"fp16.{info.field_name}" + if isinstance(v, bool): + raise ValueError(f"{field} must be an integer, not bool") + if isinstance(v, float) and not math.isfinite(v): + raise ValueError(f"{field} must be a finite number (not inf/-inf/nan)") + try: + int(v) + except (TypeError, ValueError): + raise ValueError(f"{field} must be an integer") + return v + @model_validator(mode="after") def _validate_dynamic_loss_scale_params(self): # loss_scale_window and min_loss_scale only take effect when dynamic loss diff --git a/tests/unit/runtime/test_precision_config_dynamic_scale.py b/tests/unit/runtime/test_precision_config_dynamic_scale.py index b828297d6f37..344e3a36f11b 100644 --- a/tests/unit/runtime/test_precision_config_dynamic_scale.py +++ b/tests/unit/runtime/test_precision_config_dynamic_scale.py @@ -39,3 +39,20 @@ def test_fp16_dynamic_scale_ignored_when_fp16_disabled(field, value): # When fp16 is disabled the dynamic scaling fields are unused. cfg = DeepSpeedFP16Config(enabled=False, loss_scale=0, **{field: value}) assert getattr(cfg, field) == value + + +@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"]) +@pytest.mark.parametrize("value", [True, False]) +def test_fp16_dynamic_scale_rejects_bool(field, value): + # Pydantic coerces bool to int (True -> 1), which would otherwise slip past + # the positivity check. Bools must be rejected before coercion. + with pytest.raises(ValidationError): + DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value}) + + +@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"]) +@pytest.mark.parametrize("value", [float("inf"), float("nan"), "abc", None]) +def test_fp16_dynamic_scale_rejects_non_integer(field, value): + # Non-finite and non-numeric values must be rejected rather than coerced. + with pytest.raises(ValidationError): + DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value})