From 7dd83bb43522bd0c55b833b0674f1b536a56f92c Mon Sep 17 00:00:00 2001
From: Hyukjin Kwon <hyukjin.kwon@databricks.com>
Date: Thu, 25 Jun 2026 18:57:34 +0900
Subject: [PATCH 1/2] [SPARK-57650][YARN][TESTS][FOLLOWUP] Reduce
 YarnClusterSuite flakiness from runner contention

SPARK-57650 fixed the deterministic ACCEPTED-state hang in BaseYarnClusterSuite
(via the maximum-am-resource-percent settings). The master Build/Java21 and
Build/Java25 `yarn` lanes still go red in ~50% of runs: YarnClusterSuite tests
intermittently time out (`handle.getState().isFinal() was false`) because the
AM/executor containers fail to connect to the driver's RPC server on localhost
(Connection refused). The in-JVM mini RM+NM, the driver subprocess and the
AM/executor JVMs all contend for CPU on a single CI runner, so the driver's
accept loop occasionally stalls; an executor that loses the race exits after the
default 3 connection retries, and the application can then never finish.

Two test-only mitigations in BaseYarnClusterSuite:
- Give the mini NodeManager 8GB (and matching max-allocation) so executor
  allocation is never starved once the ~1.4GB AM is running.
- Raise the executor->driver connection retry budget
  (spark.rpc.io.maxRetries=10, spark.rpc.io.retryWait=2s) so a transient accept
  stall does not permanently fail the executor. Individual tests can still
  override these values.

Co-authored-by: Isaac
---
 .../deploy/yarn/BaseYarnClusterSuite.scala      | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
index 69c515d6a381..217790676cda 100644
--- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
@@ -111,6 +111,14 @@ abstract class BaseYarnClusterSuite extends SparkFunSuite with Matchers {
     yarnConf.setFloat("yarn.scheduler.capacity.maximum-am-resource-percent", 1.0f)
     yarnConf.setFloat("yarn.scheduler.capacity.root.default.maximum-am-resource-percent", 1.0f)
 
+    // Give the single mini NodeManager generous memory. By default the mini cluster advertises
+    // only a small amount of memory, so once the AM (~1.4GB) is running there is barely enough
+    // headroom left for the executors these tests request. On a busy CI runner that makes
+    // executor allocation slow/racy and the YarnClusterSuite apps time out waiting to finish.
+    // The CI hosts have plenty of RAM, so let the NM offer enough for the AM plus a few executors.
+    yarnConf.setInt("yarn.nodemanager.resource.memory-mb", 8192)
+    yarnConf.setInt("yarn.scheduler.maximum-allocation-mb", 8192)
+
     // Support both IPv4 and IPv6
     yarnConf.set("yarn.resourcemanager.hostname", Utils.localHostNameForURI())
 
@@ -261,6 +269,15 @@ abstract class BaseYarnClusterSuite extends SparkFunSuite with Matchers {
     val props = new Properties()
     props.put(SPARK_JARS.key, "local:" + fakeSparkJar.getAbsolutePath())
 
+    // On a busy CI runner the in-JVM mini cluster, the driver and the container JVMs all compete
+    // for CPU, and the driver's RPC server occasionally cannot accept a connection in time. With
+    // the default of 3 retries an executor that loses this race gives up and exits, which leaves
+    // the application unable to finish and the suite times out. Give the executor->driver
+    // connection a larger retry budget so a transient stall does not permanently fail the app.
+    // These are defaults; individual tests can still override them via extraConf below.
+    props.setProperty("spark.rpc.io.maxRetries", "10")
+    props.setProperty("spark.rpc.io.retryWait", "2s")
+
     val testClasspath = new TestClasspathBuilder()
       .buildClassPath(
         logConfDir.getAbsolutePath() +

From bf5996dd5536601b5986b529248d35c0b9eaa43a Mon Sep 17 00:00:00 2001
From: Hyukjin Kwon <hyukjin.kwon@databricks.com>
Date: Fri, 26 Jun 2026 19:57:43 +0900
Subject: [PATCH 2/2] [YARN][TESTS] Set spark.rpc.io.* defaults after copying
 spark.* JVM props

Address review feedback: previously the RPC retry defaults were set before the
loop that copies inherited spark.* JVM system properties, so a -Dspark.rpc.io.*
flag would silently override them, contradicting the code comment, which names
extraConf as the way to override them. Move the two setProperty calls to just
after the JVM-property copy loop and just before extraConf.foreach, so the
defaults win over inherited flags while extraConf remains the sole override.

Co-authored-by: Isaac
---
 .../deploy/yarn/BaseYarnClusterSuite.scala    | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
index 217790676cda..5112e62d838c 100644
--- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
@@ -269,15 +269,6 @@ abstract class BaseYarnClusterSuite extends SparkFunSuite with Matchers {
     val props = new Properties()
     props.put(SPARK_JARS.key, "local:" + fakeSparkJar.getAbsolutePath())
 
-    // On a busy CI runner the in-JVM mini cluster, the driver and the container JVMs all compete
-    // for CPU, and the driver's RPC server occasionally cannot accept a connection in time. With
-    // the default of 3 retries an executor that loses this race gives up and exits, which leaves
-    // the application unable to finish and the suite times out. Give the executor->driver
-    // connection a larger retry budget so a transient stall does not permanently fail the app.
-    // These are defaults; individual tests can still override them via extraConf below.
-    props.setProperty("spark.rpc.io.maxRetries", "10")
-    props.setProperty("spark.rpc.io.retryWait", "2s")
-
     val testClasspath = new TestClasspathBuilder()
       .buildClassPath(
         logConfDir.getAbsolutePath() +
@@ -309,6 +300,18 @@ abstract class BaseYarnClusterSuite extends SparkFunSuite with Matchers {
         props.setProperty(k, v)
       }
     }
+
+    // On a busy CI runner the in-JVM mini cluster, the driver and the container JVMs all compete
+    // for CPU, and the driver's RPC server occasionally cannot accept a connection in time. With
+    // the default of 3 retries an executor that loses this race gives up and exits, which leaves
+    // the application unable to finish and the suite times out. Give the executor->driver
+    // connection a larger retry budget so a transient stall does not permanently fail the app.
+    // Set after the spark.* JVM properties are copied above so these values are not silently
+    // overridden by an inherited -Dspark.rpc.io.* flag; individual tests can still override them
+    // via extraConf below.
+    props.setProperty("spark.rpc.io.maxRetries", "10")
+    props.setProperty("spark.rpc.io.retryWait", "2s")
+
     extraConf.foreach { case (k, v) => props.setProperty(k, v) }
 
     val propsFile = File.createTempFile("spark", ".properties", tempDir)