From 68eae0263073dfe2ec9a7c2f3ae06406506b39b1 Mon Sep 17 00:00:00 2001
From: Aryan <aryansputta@gmail.com>
Date: Sat, 6 Jun 2026 15:20:31 -0400
Subject: [PATCH 1/3] Validate fp16 dynamic loss scaling parameters are
 positive

loss_scale_window and min_loss_scale drive dynamic loss scaling but are
not validated, so invalid values are accepted silently when the config is
constructed and only fail later:

- loss_scale_window is used as `stable_interval % scale_window` in
  DynamicLossScaler.update_scale, so a value of 0 raises ZeroDivisionError
  during training.
- min_loss_scale is the loss-scale floor; a value <= 0 is no effective
  lower bound, so the dynamic loss scale is free to decay toward zero.

Add a Pydantic `mode="before"` field validator to DeepSpeedFP16Config that
rejects bool, non-numeric, non-finite (inf/-inf/nan), and non-positive
values for both fields, raising a clear ValidationError. This follows the
same pattern as the fp16.loss_scale validation added in #7889.

Add unit tests covering invalid values (0, -1, inf, nan, True, [], {}) and
valid values for both fields.

Signed-off-by: Aryan <aryansputta@gmail.com>
---
 deepspeed/runtime/precision_config.py         | 20 ++++++++++++
 .../test_precision_config_dynamic_scale.py    | 31 +++++++++++++++++++
 2 files changed, 51 insertions(+)
 create mode 100644 tests/unit/runtime/test_precision_config_dynamic_scale.py

diff --git a/deepspeed/runtime/precision_config.py b/deepspeed/runtime/precision_config.py
index efec5c9d00c8..02f67113939f 100644
--- a/deepspeed/runtime/precision_config.py
+++ b/deepspeed/runtime/precision_config.py
@@ -130,6 +130,26 @@ def _validate_loss_scale(cls, v):
             raise ValueError("fp16.loss_scale must be >= 0 (0 enables dynamic loss scaling)")
         return v
 
+    @field_validator("loss_scale_window", "min_loss_scale", mode="before")
+    @classmethod
+    def _validate_positive_dynamic_scale_param(cls, v, info):
+        # Both parameters drive dynamic loss scaling and must be strictly positive.
+        # loss_scale_window is used as `stable_interval % scale_window` in
+        # DynamicLossScaler.update_scale, so a value of 0 raises ZeroDivisionError,
+        # and min_loss_scale is the loss-scale floor, which collapses if <= 0.
+        name = info.field_name
+        if isinstance(v, bool):
+            raise ValueError(f"fp16.{name} must be a number, not bool")
+        try:
+            number = float(v)
+        except (TypeError, ValueError):
+            raise ValueError(f"fp16.{name} must be a number")
+        if not math.isfinite(number):
+            raise ValueError(f"fp16.{name} must be a finite number (not inf/-inf/nan)")
+        if number <= 0:
+            raise ValueError(f"fp16.{name} must be > 0")
+        return v
+
     initial_scale_power: int = 16
     """
     For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}.
diff --git a/tests/unit/runtime/test_precision_config_dynamic_scale.py b/tests/unit/runtime/test_precision_config_dynamic_scale.py
new file mode 100644
index 000000000000..533996bf59d3
--- /dev/null
+++ b/tests/unit/runtime/test_precision_config_dynamic_scale.py
@@ -0,0 +1,31 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+from pydantic import ValidationError
+
+from deepspeed.runtime.precision_config import DeepSpeedFP16Config
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [0, -1, float("inf"), float("nan"), True])
+def test_fp16_dynamic_scale_rejects_invalid_values(field, value):
+    with pytest.raises(ValidationError):
+        DeepSpeedFP16Config(**{field: value})
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [1, 1000, "2"])
+def test_fp16_dynamic_scale_accepts_valid_values(field, value):
+    cfg = DeepSpeedFP16Config(**{field: value})
+    assert getattr(cfg, field) > 0
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [[], {}])
+def test_fp16_dynamic_scale_invalid_type_has_clear_error(field, value):
+    with pytest.raises(ValidationError) as excinfo:
+        DeepSpeedFP16Config(**{field: value})
+    assert "must be a number" in str(excinfo.value)

From 27af87920705605378ac42d358e4aa7e43325cf0 Mon Sep 17 00:00:00 2001
From: Aryan <aryansputta@gmail.com>
Date: Sat, 6 Jun 2026 15:26:36 -0400
Subject: [PATCH 2/3] Gate fp16 dynamic-scale validation on dynamic loss
 scaling

Address review: loss_scale_window and min_loss_scale only take effect when
dynamic loss scaling is active (fp16 enabled and loss_scale == 0, per
DeepSpeedEngine.dynamic_loss_scale). Validating them unconditionally rejected
otherwise-valid static-loss-scale configs that carry unused values like 0.

Replace the per-field validator with a model_validator(mode="after") that
checks loss_scale_window > 0 and min_loss_scale > 0 only when fp16 is enabled
and loss_scale == 0. Update tests to cover the static and fp16-disabled cases
where these fields are ignored.

Signed-off-by: Aryan <aryansputta@gmail.com>
---
 deepspeed/runtime/precision_config.py         | 39 +++++++++----------
 .../test_precision_config_dynamic_scale.py    | 32 +++++++++------
 2 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/deepspeed/runtime/precision_config.py b/deepspeed/runtime/precision_config.py
index 02f67113939f..6daaca7f89d1 100644
--- a/deepspeed/runtime/precision_config.py
+++ b/deepspeed/runtime/precision_config.py
@@ -4,7 +4,7 @@
 # DeepSpeed Team
 
 import math
-from pydantic import field_validator
+from pydantic import field_validator, model_validator
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 from .fp16.loss_scaler import (
     INITIAL_LOSS_SCALE,
@@ -130,26 +130,6 @@ def _validate_loss_scale(cls, v):
             raise ValueError("fp16.loss_scale must be >= 0 (0 enables dynamic loss scaling)")
         return v
 
-    @field_validator("loss_scale_window", "min_loss_scale", mode="before")
-    @classmethod
-    def _validate_positive_dynamic_scale_param(cls, v, info):
-        # Both parameters drive dynamic loss scaling and must be strictly positive.
-        # loss_scale_window is used as `stable_interval % scale_window` in
-        # DynamicLossScaler.update_scale, so a value of 0 raises ZeroDivisionError,
-        # and min_loss_scale is the loss-scale floor, which collapses if <= 0.
-        name = info.field_name
-        if isinstance(v, bool):
-            raise ValueError(f"fp16.{name} must be a number, not bool")
-        try:
-            number = float(v)
-        except (TypeError, ValueError):
-            raise ValueError(f"fp16.{name} must be a number")
-        if not math.isfinite(number):
-            raise ValueError(f"fp16.{name} must be a finite number (not inf/-inf/nan)")
-        if number <= 0:
-            raise ValueError(f"fp16.{name} must be > 0")
-        return v
-
     initial_scale_power: int = 16
     """
     For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}.
@@ -180,6 +160,23 @@ def _validate_positive_dynamic_scale_param(cls, v, info):
     Maintain master weights in optimizer state as fp16 instead of fp32 (valid with DeepSpeedCPUAdam only).
     """
 
+    @model_validator(mode="after")
+    def _validate_dynamic_loss_scale_params(self):
+        # loss_scale_window and min_loss_scale only take effect when dynamic loss
+        # scaling is active, i.e. fp16 is enabled and loss_scale == 0 (see
+        # DeepSpeedEngine.dynamic_loss_scale). Validating them otherwise would
+        # reject valid static-loss-scale configs that carry unused values.
+        if self.enabled and self.loss_scale == 0:
+            # loss_scale_window is used as `stable_interval % scale_window` in
+            # DynamicLossScaler.update_scale, so 0 raises ZeroDivisionError.
+            if self.loss_scale_window <= 0:
+                raise ValueError(
+                    "fp16.loss_scale_window must be > 0 when dynamic loss scaling is enabled (loss_scale=0)")
+            # min_loss_scale is the loss-scale floor, which collapses if <= 0.
+            if self.min_loss_scale <= 0:
+                raise ValueError("fp16.min_loss_scale must be > 0 when dynamic loss scaling is enabled (loss_scale=0)")
+        return self
+
     def initial_dynamic_scale(self):
         return 2**self.initial_scale_power
 
diff --git a/tests/unit/runtime/test_precision_config_dynamic_scale.py b/tests/unit/runtime/test_precision_config_dynamic_scale.py
index 533996bf59d3..b828297d6f37 100644
--- a/tests/unit/runtime/test_precision_config_dynamic_scale.py
+++ b/tests/unit/runtime/test_precision_config_dynamic_scale.py
@@ -10,22 +10,32 @@
 
 
 @pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
-@pytest.mark.parametrize("value", [0, -1, float("inf"), float("nan"), True])
-def test_fp16_dynamic_scale_rejects_invalid_values(field, value):
+@pytest.mark.parametrize("value", [0, -1])
+def test_fp16_dynamic_scale_rejects_nonpositive_when_dynamic(field, value):
+    # Dynamic loss scaling is active when fp16 is enabled and loss_scale == 0.
     with pytest.raises(ValidationError):
-        DeepSpeedFP16Config(**{field: value})
+        DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value})
 
 
 @pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
-@pytest.mark.parametrize("value", [1, 1000, "2"])
-def test_fp16_dynamic_scale_accepts_valid_values(field, value):
-    cfg = DeepSpeedFP16Config(**{field: value})
+@pytest.mark.parametrize("value", [1, 1000])
+def test_fp16_dynamic_scale_accepts_positive_when_dynamic(field, value):
+    cfg = DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value})
     assert getattr(cfg, field) > 0
 
 
 @pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
-@pytest.mark.parametrize("value", [[], {}])
-def test_fp16_dynamic_scale_invalid_type_has_clear_error(field, value):
-    with pytest.raises(ValidationError) as excinfo:
-        DeepSpeedFP16Config(**{field: value})
-    assert "must be a number" in str(excinfo.value)
+@pytest.mark.parametrize("value", [0, -1])
+def test_fp16_dynamic_scale_ignored_with_static_loss_scale(field, value):
+    # With a static loss scale (loss_scale > 0) these fields are unused, so a
+    # non-positive value must not fail config construction (compatibility).
+    cfg = DeepSpeedFP16Config(enabled=True, loss_scale=128, **{field: value})
+    assert getattr(cfg, field) == value
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [0, -1])
+def test_fp16_dynamic_scale_ignored_when_fp16_disabled(field, value):
+    # When fp16 is disabled the dynamic scaling fields are unused.
+    cfg = DeepSpeedFP16Config(enabled=False, loss_scale=0, **{field: value})
+    assert getattr(cfg, field) == value

From 427695fa67c55ffbd493fee8e4ecb35ab05ed6c4 Mon Sep 17 00:00:00 2001
From: Aryan <aryansputta@gmail.com>
Date: Sun, 21 Jun 2026 12:42:14 -0400
Subject: [PATCH 3/3] Reject bool and non-finite values for fp16 dynamic-scale
 params

Pydantic coerces bool to int (True -> 1, False -> 0), so a value like
loss_scale_window=True would silently pass the positivity check in
_validate_dynamic_loss_scale_params. Non-finite floats (inf/-inf/nan) are
already refused by Pydantic's int coercion, but only with a generic
message. Add a `mode="before"` field validator that rejects bool,
non-finite, and non-numeric values before coercion with a field-specific
error, mirroring the existing loss_scale validator. Addresses @tohtana
review feedback.

Signed-off-by: Aryan <aryansputta@gmail.com>
---
 deepspeed/runtime/precision_config.py           | 17 +++++++++++++++++
 .../test_precision_config_dynamic_scale.py      | 17 +++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/deepspeed/runtime/precision_config.py b/deepspeed/runtime/precision_config.py
index 6daaca7f89d1..3336056c503b 100644
--- a/deepspeed/runtime/precision_config.py
+++ b/deepspeed/runtime/precision_config.py
@@ -160,6 +160,23 @@ def _validate_loss_scale(cls, v):
     Maintain master weights in optimizer state as fp16 instead of fp32 (valid with DeepSpeedCPUAdam only).
     """
 
+    @field_validator("loss_scale_window", "min_loss_scale", mode="before")
+    @classmethod
+    def _reject_non_integer_scale_params(cls, v, info):
+        # Pydantic coerces bool to int (True -> 1, False -> 0), so a bool would
+        # silently pass the positivity check in _validate_dynamic_loss_scale_params.
+        # Reject bool, non-finite, and non-numeric input here, before coercion.
+        field = f"fp16.{info.field_name}"
+        if isinstance(v, bool):
+            raise ValueError(f"{field} must be an integer, not bool")
+        if isinstance(v, float) and not math.isfinite(v):
+            raise ValueError(f"{field} must be a finite number (not inf/-inf/nan)")
+        try:
+            int(v)
+        except (TypeError, ValueError):
+            raise ValueError(f"{field} must be an integer")
+        return v
+
     @model_validator(mode="after")
     def _validate_dynamic_loss_scale_params(self):
         # loss_scale_window and min_loss_scale only take effect when dynamic loss
diff --git a/tests/unit/runtime/test_precision_config_dynamic_scale.py b/tests/unit/runtime/test_precision_config_dynamic_scale.py
index b828297d6f37..344e3a36f11b 100644
--- a/tests/unit/runtime/test_precision_config_dynamic_scale.py
+++ b/tests/unit/runtime/test_precision_config_dynamic_scale.py
@@ -39,3 +39,20 @@ def test_fp16_dynamic_scale_ignored_when_fp16_disabled(field, value):
     # When fp16 is disabled the dynamic scaling fields are unused.
     cfg = DeepSpeedFP16Config(enabled=False, loss_scale=0, **{field: value})
     assert getattr(cfg, field) == value
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [True, False])
+def test_fp16_dynamic_scale_rejects_bool(field, value):
+    # Pydantic coerces bool to int (True -> 1), which would otherwise slip past
+    # the positivity check. Bools must be rejected before coercion.
+    with pytest.raises(ValidationError):
+        DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value})
+
+
+@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
+@pytest.mark.parametrize("value", [float("inf"), float("nan"), "abc", None])
+def test_fp16_dynamic_scale_rejects_non_integer(field, value):
+    # Non-finite and non-numeric values must be rejected rather than coerced.
+    with pytest.raises(ValidationError):
+        DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value})