From 37932542078f60959b19bc94c05a094901e73fbf Mon Sep 17 00:00:00 2001 From: Kymi808 Date: Mon, 1 Jun 2026 18:17:52 -0500 Subject: [PATCH 1/2] activation_checkpointing: default num_layers to None so the configure() assert fires The module-level default for ``num_layers`` is ``None`` (line 46), and ``configure()`` asserts ``num_layers is not None`` with the message "Must specify the number of layers with contiguous memory checkpointing" when ``CONTIGUOUS_CHECKPOINTING`` is enabled (line 1108). ``_configure_defaults()`` was initializing ``num_layers = False`` instead. ``False is not None`` evaluates to ``True``, so the assert silently passed when ``configure(contiguous_checkpointing=True)`` was called without ``num_checkpoints``. Because ``False`` behaves as ``0``, ``range(num_layers)`` was empty and ``numel * num_layers`` was zero, so the user then hit a cryptic ``IndexError`` later on rather than the intended helpful assertion message. Set the default to ``None`` so the assert behaves as documented. Add a regression test in tests/unit/runtime/activation_checkpointing/. 
Signed-off-by: Kymi808 --- .../activation_checkpointing/checkpointing.py | 2 +- .../test_activation_checkpointing.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index fae0148ba887..826d7c14bf06 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -1019,7 +1019,7 @@ def _configure_defaults(): PARTITION_ACTIVATIONS = False CONTIGUOUS_CHECKPOINTING = False - num_layers = False + num_layers = None CPU_CHECKPOINT = False SYNCHRONIZE = False PROFILE_TIME = False diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py index dd3bcd7fb6bd..ae775461b88d 100644 --- a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py +++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py @@ -309,3 +309,18 @@ def __init__(self): assert model._is_checkpointable([layers[0]]) == True # ParallelTransformerLayerPipe assert model._is_checkpointable([layers[1]]) == True # GMLPBlock assert model._is_checkpointable([layers[2]]) == False # Linear layer + + +def test_configure_with_contiguous_checkpointing_requires_num_checkpoints(): + # Regression: ``_configure_defaults`` previously initialized ``num_layers`` + # to ``False`` while the assert below uses ``is not None``; ``False is not + # None`` is True, so the missing-config assert silently passed and a + # cryptic ``IndexError`` surfaced later from ``range(num_layers)``. With + # the default switched to ``None`` (matching the module-level default), + # the helpful assert message fires at the configure() call site. 
+ with pytest.raises(AssertionError, match="number of layers"): + deepspeed.checkpointing.configure( + mpu_=None, + partition_activations=True, + contiguous_checkpointing=True, + ) From 41f3fd17833329e1f2f059831894901e46d1921d Mon Sep 17 00:00:00 2001 From: Kymi808 Date: Mon, 1 Jun 2026 18:26:03 -0500 Subject: [PATCH 2/2] test: restore module globals around configure() assert test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review (#8041) noted that `configure()` mutates module globals (`PARTITION_ACTIVATIONS`, `CONTIGUOUS_CHECKPOINTING`, etc.) before the assertion fires, leaving them set when control unwinds. Subsequent activation-checkpointing tests sharing the same pytest worker then enter `partition_activations(..., contiguous_checkpoint=True)` and hit `range(num_layers)` with `num_layers=None`, causing order-dependent failures — exactly what surfaced as the `modal-torch-latest` CI failure. Snapshot and restore the relevant module globals around the call so the test cleans up after itself. Signed-off-by: Kymi808 --- .../test_activation_checkpointing.py | 39 ++++++++++++++++--- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py index ae775461b88d..2a8aa5c14358 100644 --- a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py +++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py @@ -318,9 +318,36 @@ def test_configure_with_contiguous_checkpointing_requires_num_checkpoints(): # cryptic ``IndexError`` surfaced later from ``range(num_layers)``. With # the default switched to ``None`` (matching the module-level default), # the helpful assert message fires at the configure() call site. 
- with pytest.raises(AssertionError, match="number of layers"): - deepspeed.checkpointing.configure( - mpu_=None, - partition_activations=True, - contiguous_checkpointing=True, - ) + # + # ``configure()`` mutates module globals before raising, so snapshot and + # restore them around the call to avoid order-dependent failures in other + # activation-checkpointing tests sharing the same pytest worker. + cp = deepspeed.checkpointing + saved = ( + cp.PARTITION_ACTIVATIONS, + cp.CONTIGUOUS_CHECKPOINTING, + cp.num_layers, + cp.CPU_CHECKPOINT, + cp.SYNCHRONIZE, + cp.PROFILE_TIME, + cp.mpu, + cp.deepspeed_checkpointing_enabled, + ) + try: + with pytest.raises(AssertionError, match="number of layers"): + deepspeed.checkpointing.configure( + mpu_=None, + partition_activations=True, + contiguous_checkpointing=True, + ) + finally: + ( + cp.PARTITION_ACTIVATIONS, + cp.CONTIGUOUS_CHECKPOINTING, + cp.num_layers, + cp.CPU_CHECKPOINT, + cp.SYNCHRONIZE, + cp.PROFILE_TIME, + cp.mpu, + cp.deepspeed_checkpointing_enabled, + ) = saved