From f631b5bca8c02b394b7b4435596b5b2d7ad39eff Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Wed, 24 Jun 2026 09:14:38 +0900 Subject: [PATCH] [SPARK-57650][YARN][TESTS] Allow AMs to use the whole queue in BaseYarnClusterSuite YarnClusterSuite tests intermittently fail on memory-constrained CI runners with 'handle.getState().isFinal() was false' after a 3-minute timeout. The mini CapacityScheduler set up in BaseYarnClusterSuite never sets maximum-am-resource-percent, so it defaults to 0.1: the queue's total AM resource budget becomes ~10% of capacity (~1GB on CI), which is smaller than the 1-2GB AM/driver memory the tests request. Applications then wedge in the ACCEPTED state (never activated) and the suite times out. Set maximum-am-resource-percent to 1.0 (global and root.default) so AMs can use the whole test queue and applications are always activated. Co-authored-by: Isaac --- .../apache/spark/deploy/yarn/BaseYarnClusterSuite.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala index 1f755ba5efee5..69c515d6a3812 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala @@ -103,6 +103,13 @@ abstract class BaseYarnClusterSuite extends SparkFunSuite with Matchers { yarnConf.set("yarn.scheduler.capacity.root.default.acl_submit_applications", "*") yarnConf.set("yarn.scheduler.capacity.root.default.acl_administer_queue", "*") yarnConf.setInt("yarn.scheduler.capacity.node-locality-delay", -1) + // `maximum-am-resource-percent` defaults to 0.1, which caps the queue's total AM resource + // usage to 10% of its capacity. 
On memory-constrained CI runners this becomes ~1GB, smaller + // than the AM/driver memory these tests request (1-2GB), so applications get stuck in the + // ACCEPTED state (never activated) and the suite times out waiting for a final state. Let + // AMs use the whole queue in tests so applications are always activated. + yarnConf.setFloat("yarn.scheduler.capacity.maximum-am-resource-percent", 1.0f) + yarnConf.setFloat("yarn.scheduler.capacity.root.default.maximum-am-resource-percent", 1.0f) // Support both IPv4 and IPv6 yarnConf.set("yarn.resourcemanager.hostname", Utils.localHostNameForURI())