From 7dd83bb43522bd0c55b833b0674f1b536a56f92c Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Thu, 25 Jun 2026 18:57:34 +0900 Subject: [PATCH 1/2] [SPARK-57650][YARN][TESTS][FOLLOWUP] Reduce YarnClusterSuite flakiness from runner contention SPARK-57650 fixed the deterministic ACCEPTED-state hang in BaseYarnClusterSuite (maximum-am-resource-percent). The master Build/Java21 and Build/Java25 `yarn` lanes still go red in ~50% of runs: YarnClusterSuite tests intermittently time out (`handle.getState().isFinal() was false`) because the AM/executor containers fail to connect to the driver's RPC server on localhost (Connection refused). The in-JVM mini RM+NM, the driver subprocess and the AM/executor JVMs all contend for CPU on a single CI runner, so the driver's accept loop occasionally stalls; an executor that loses the race exits after the default 3 connection retries, and the application can then never finish. Two test-only mitigations in BaseYarnClusterSuite: - Give the mini NodeManager 8GB (and matching max-allocation) so executor allocation is never starved once the ~1.4GB AM is running. - Raise the executor->driver connection retry budget (spark.rpc.io.maxRetries=10, spark.rpc.io.retryWait=2s) so a transient accept stall does not permanently fail the executor. Individual tests can still override these via extraConf. 
Co-authored-by: Isaac --- .../deploy/yarn/BaseYarnClusterSuite.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala index 69c515d6a381..217790676cda 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala @@ -111,6 +111,14 @@ abstract class BaseYarnClusterSuite extends SparkFunSuite with Matchers { yarnConf.setFloat("yarn.scheduler.capacity.maximum-am-resource-percent", 1.0f) yarnConf.setFloat("yarn.scheduler.capacity.root.default.maximum-am-resource-percent", 1.0f) + // Give the single mini NodeManager generous memory. By default the mini cluster advertises + // only a small amount of memory, so once the AM (~1.4GB) is running there is barely enough + // headroom left for the executors these tests request. On a busy CI runner that makes + // executor allocation slow/racy and the YarnClusterSuite apps time out waiting to finish. + // The CI hosts have plenty of RAM, so let the NM offer enough for the AM plus a few executors. + yarnConf.setInt("yarn.nodemanager.resource.memory-mb", 8192) + yarnConf.setInt("yarn.scheduler.maximum-allocation-mb", 8192) + // Support both IPv4 and IPv6 yarnConf.set("yarn.resourcemanager.hostname", Utils.localHostNameForURI()) @@ -261,6 +269,15 @@ abstract class BaseYarnClusterSuite extends SparkFunSuite with Matchers { val props = new Properties() props.put(SPARK_JARS.key, "local:" + fakeSparkJar.getAbsolutePath()) + // On a busy CI runner the in-JVM mini cluster, the driver and the container JVMs all compete + // for CPU, and the driver's RPC server occasionally cannot accept a connection in time. 
With + // the default of 3 retries an executor that loses this race gives up and exits, which leaves + // the application unable to finish and the suite times out. Give the executor->driver + // connection a larger retry budget so a transient stall does not permanently fail the app. + // These are defaults; individual tests can still override them via extraConf below. + props.setProperty("spark.rpc.io.maxRetries", "10") + props.setProperty("spark.rpc.io.retryWait", "2s") + val testClasspath = new TestClasspathBuilder() .buildClassPath( logConfDir.getAbsolutePath() + From bf5996dd5536601b5986b529248d35c0b9eaa43a Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Fri, 26 Jun 2026 19:57:43 +0900 Subject: [PATCH 2/2] [YARN][TESTS] Set spark.rpc.io.* defaults after copying spark.* JVM props Address review feedback: previously the rpc retry defaults were set before the loop that copies inherited spark.* JVM system properties, so a -Dspark.rpc.io.* flag would silently override them, contradicting the comment that only extraConf overrides. Move the two setProperty calls to just after the JVM-property copy loop and just before extraConf.foreach, so the defaults win over inherited flags while extraConf remains the sole override. 
Co-authored-by: Isaac --- .../deploy/yarn/BaseYarnClusterSuite.scala | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala index 217790676cda..5112e62d838c 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala @@ -269,15 +269,6 @@ abstract class BaseYarnClusterSuite extends SparkFunSuite with Matchers { val props = new Properties() props.put(SPARK_JARS.key, "local:" + fakeSparkJar.getAbsolutePath()) - // On a busy CI runner the in-JVM mini cluster, the driver and the container JVMs all compete - // for CPU, and the driver's RPC server occasionally cannot accept a connection in time. With - // the default of 3 retries an executor that loses this race gives up and exits, which leaves - // the application unable to finish and the suite times out. Give the executor->driver - // connection a larger retry budget so a transient stall does not permanently fail the app. - // These are defaults; individual tests can still override them via extraConf below. - props.setProperty("spark.rpc.io.maxRetries", "10") - props.setProperty("spark.rpc.io.retryWait", "2s") - val testClasspath = new TestClasspathBuilder() .buildClassPath( logConfDir.getAbsolutePath() + @@ -309,6 +300,18 @@ abstract class BaseYarnClusterSuite extends SparkFunSuite with Matchers { props.setProperty(k, v) } } + + // On a busy CI runner the in-JVM mini cluster, the driver and the container JVMs all compete + // for CPU, and the driver's RPC server occasionally cannot accept a connection in time. 
With + // the default of 3 retries an executor that loses this race gives up and exits, which leaves + // the application unable to finish and the suite times out. Give the executor->driver + // connection a larger retry budget so a transient stall does not permanently fail the app. + // Set after the spark.* JVM properties are copied above so these values are not silently + // overridden by an inherited -Dspark.rpc.io.* flag; individual tests can still override them + // via extraConf below. + props.setProperty("spark.rpc.io.maxRetries", "10") + props.setProperty("spark.rpc.io.retryWait", "2s") + extraConf.foreach { case (k, v) => props.setProperty(k, v) } val propsFile = File.createTempFile("spark", ".properties", tempDir)