Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion deepspeed/runtime/precision_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# DeepSpeed Team

import math
from pydantic import field_validator
from pydantic import field_validator, model_validator
from deepspeed.runtime.config_utils import DeepSpeedConfigModel
from .fp16.loss_scaler import (
INITIAL_LOSS_SCALE,
Expand Down Expand Up @@ -160,6 +160,40 @@ def _validate_loss_scale(cls, v):
Maintain master weights in optimizer state as fp16 instead of fp32 (valid with DeepSpeedCPUAdam only).
"""

@field_validator("loss_scale_window", "min_loss_scale", mode="before")
@classmethod
def _reject_non_integer_scale_params(cls, v, info):
    """Reject raw inputs that cannot stand in for an integer scale parameter.

    Registered with ``mode="before"`` for ``loss_scale_window`` and
    ``min_loss_scale``, so it sees the value as supplied by the user. The
    positivity check in ``_validate_dynamic_loss_scale_params`` only sees the
    already-coerced value, which is why bools and non-finite numbers are
    filtered out here.

    Args:
        v: Raw value supplied for the field.
        info: Validation info object; only ``info.field_name`` is read, to
            build the ``fp16.<field>`` prefix of the error message.

    Returns:
        ``v`` unchanged when it passes every check.

    Raises:
        ValueError: If ``v`` is a bool, a non-finite float, or something
            ``int()`` cannot convert.
    """
    field = f"fp16.{info.field_name}"
    # bool is a subclass of int, so int(True) below would succeed; reject it
    # explicitly instead of letting True/False masquerade as 1/0.
    if isinstance(v, bool):
        raise ValueError(f"{field} must be an integer, not bool")
    if isinstance(v, float) and not math.isfinite(v):
        raise ValueError(f"{field} must be a finite number (not inf/-inf/nan)")
    try:
        int(v)
    except (TypeError, ValueError, OverflowError) as err:
        # OverflowError covers infinities that are not built-in floats (e.g.
        # decimal.Decimal("Infinity")), which the isfinite guard above does not
        # see. Re-raise as ValueError so every rejection has the same type.
        raise ValueError(f"{field} must be an integer") from err
    return v

@model_validator(mode="after")
def _validate_dynamic_loss_scale_params(self):
    """Require positive dynamic-loss-scaling parameters when they are in use.

    The check only applies when fp16 is enabled and ``loss_scale == 0``
    (the dynamic loss scaling case). In any other configuration the two
    fields are left unchecked so that static-loss-scale configs carrying
    unused values keep validating.

    Returns:
        The model instance itself, unchanged.

    Raises:
        ValueError: If dynamic loss scaling is active and either
            ``loss_scale_window`` or ``min_loss_scale`` is ``<= 0``.
    """
    dynamic_scaling_active = self.enabled and self.loss_scale == 0
    if not dynamic_scaling_active:
        # Static loss scale or fp16 disabled: the fields below are unused.
        return self

    # A zero window would be used as a modulus by the dynamic loss scaler
    # (`stable_interval % scale_window`) and raise ZeroDivisionError.
    if self.loss_scale_window <= 0:
        raise ValueError("fp16.loss_scale_window must be > 0 when dynamic loss scaling is enabled (loss_scale=0)")

    # min_loss_scale is the floor of the loss scale; a non-positive floor
    # lets the scale collapse.
    if self.min_loss_scale <= 0:
        raise ValueError("fp16.min_loss_scale must be > 0 when dynamic loss scaling is enabled (loss_scale=0)")

    return self

def initial_dynamic_scale(self):
    """Return the starting dynamic loss scale, 2 raised to ``initial_scale_power``."""
    exponent = self.initial_scale_power
    return pow(2, exponent)

Expand Down
58 changes: 58 additions & 0 deletions tests/unit/runtime/test_precision_config_dynamic_scale.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
from pydantic import ValidationError

from deepspeed.runtime.precision_config import DeepSpeedFP16Config


@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
@pytest.mark.parametrize("value", [0, -1])
def test_fp16_dynamic_scale_rejects_nonpositive_when_dynamic(field, value):
    """Zero or negative values are refused while dynamic loss scaling is on."""
    # fp16 enabled together with loss_scale == 0 selects dynamic loss scaling.
    overrides = {field: value}
    with pytest.raises(ValidationError):
        DeepSpeedFP16Config(enabled=True, loss_scale=0, **overrides)


@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
@pytest.mark.parametrize("value", [1, 1000])
def test_fp16_dynamic_scale_accepts_positive_when_dynamic(field, value):
    """Positive values are accepted and stored as given under dynamic loss scaling."""
    cfg = DeepSpeedFP16Config(enabled=True, loss_scale=0, **{field: value})
    # Compare against the supplied value rather than just `> 0`, so the test
    # cannot pass on a positive default if the input were silently dropped.
    assert getattr(cfg, field) == value


@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
@pytest.mark.parametrize("value", [0, -1])
def test_fp16_dynamic_scale_ignored_with_static_loss_scale(field, value):
    """A static loss scale leaves the dynamic-scaling fields unchecked."""
    # loss_scale > 0 means static scaling, so a non-positive value in an
    # unused field must still construct successfully (compatibility).
    overrides = {field: value}
    cfg = DeepSpeedFP16Config(enabled=True, loss_scale=128, **overrides)
    stored = getattr(cfg, field)
    assert stored == value


@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
@pytest.mark.parametrize("value", [0, -1])
def test_fp16_dynamic_scale_ignored_when_fp16_disabled(field, value):
    """With fp16 disabled the dynamic-scaling fields are not validated."""
    overrides = {field: value}
    cfg = DeepSpeedFP16Config(enabled=False, loss_scale=0, **overrides)
    stored = getattr(cfg, field)
    assert stored == value


@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
@pytest.mark.parametrize("value", [True, False])
def test_fp16_dynamic_scale_rejects_bool(field, value):
    """Boolean inputs are refused for the integer scale parameters."""
    # A bool coerced to int (True -> 1) would slip past the positivity check,
    # so construction is expected to fail before any coercion happens.
    overrides = {field: value}
    with pytest.raises(ValidationError):
        DeepSpeedFP16Config(enabled=True, loss_scale=0, **overrides)


@pytest.mark.parametrize("field", ["loss_scale_window", "min_loss_scale"])
@pytest.mark.parametrize("value", [float("inf"), float("nan"), "abc", None])
def test_fp16_dynamic_scale_rejects_non_integer(field, value):
    """Non-finite floats, non-numeric strings and None are all refused."""
    # These inputs must raise instead of being coerced into an integer.
    overrides = {field: value}
    with pytest.raises(ValidationError):
        DeepSpeedFP16Config(enabled=True, loss_scale=0, **overrides)
Loading