diff --git a/.github/workflows/rerun-gpu-failures.yml b/.github/workflows/rerun-gpu-failures.yml
new file mode 100644
index 000000000..927dbca06
--- /dev/null
+++ b/.github/workflows/rerun-gpu-failures.yml
@@ -0,0 +1,40 @@
+name: Rerun GPU failures
+
+on:
+  workflow_run:
+    workflows: [Tests]
+    types: [completed]
+
+jobs:
+  rerun-failed:
+    if: >-
+      github.event.workflow_run.conclusion == 'failure' &&
+      github.event.workflow_run.run_attempt < 3
+    runs-on: ubuntu-latest
+    permissions:
+      actions: write
+    steps:
+      - name: Re-run failed jobs (GPU health only)
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -eux
+          RUN_ID=${{ github.event.workflow_run.id }}
+
+          # Only re-run if a job failed due to a GPU health check, not a real test failure
+          FAILED_STEPS=$(gh api \
+            "repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs" \
+            --paginate \
+            --jq '.jobs[] | select(.conclusion == "failure") | .steps[] | select(.conclusion == "failure") | .name')
+
+          echo "Failed steps:"
+          echo "$FAILED_STEPS"
+
+          if echo "$FAILED_STEPS" | grep -q "GPU health check"; then
+            echo "GPU health check failure detected (attempt ${{ github.event.workflow_run.run_attempt }}), re-running failed jobs..."
+            gh api \
+              "repos/${{ github.repository }}/actions/runs/${RUN_ID}/rerun-failed-jobs" \
+              --method POST
+          else
+            echo "No GPU health check failures found, not retrying."
+          fi
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 839648c54..9eda600d2 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -127,6 +127,27 @@ jobs:
             fi
           fi
 
+      - name: GPU health check # DO NOT CHANGE THIS NAME: used by rerun-failed job
+        if: startsWith(matrix.image, 'nvidia')
+        run: |
+          source .venv/bin/activate
+          python -c "
+          import torch, sys
+
+          assert torch.cuda.is_available(), 'FATAL: CUDA not available'
+          n = torch.cuda.device_count()
+          assert n > 0, 'FATAL: No CUDA devices found'
+          print(f'CUDA devices: {n}')
+
+          for i in range(n):
+              dev = torch.device('cuda', i)
+              a = torch.randn(256, 256, device=dev)
+              b = (a @ a).sum().item()
+              print(f' Device {i} ({torch.cuda.get_device_name(i)}): OK')
+
+          print(f'All {n} devices healthy')
+          "
+
       - name: Install Triton
         if: matrix.backend == 'tileir' || (matrix.backend == 'triton' && steps.cache.outputs.cache-hit != 'true' && matrix.pytorch-version != 'pytorch-2.9')
         run: |
@@ -223,26 +244,6 @@ jobs:
 
           source .venv/bin/activate
           SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0" uv pip install .'[cute-cu12]'
-      - name: CUDA Compute Check
-        if: startsWith(matrix.image, 'nvidia')
-        run: |
-          source .venv/bin/activate
-          python -c "
-          import torch, sys
-
-          assert torch.cuda.is_available(), 'FATAL: CUDA not available'
-          n = torch.cuda.device_count()
-          assert n > 0, 'FATAL: No CUDA devices found'
-          print(f'CUDA devices: {n}')
-
-          for i in range(n):
-              dev = torch.device('cuda', i)
-              a = torch.randn(256, 256, device=dev)
-              b = (a @ a).sum().item()
-              print(f' Device {i} ({torch.cuda.get_device_name(i)}): OK')
-
-          print(f'All {n} devices healthy')
-          "
       - name: Run Tests
         run: |