Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,14 @@ abstract class BaseYarnClusterSuite extends SparkFunSuite with Matchers {
yarnConf.setFloat("yarn.scheduler.capacity.maximum-am-resource-percent", 1.0f)
yarnConf.setFloat("yarn.scheduler.capacity.root.default.maximum-am-resource-percent", 1.0f)

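// Give the single mini NodeManager generous memory. By default the mini cluster advertises
// only a small amount of memory, so once the AM (~1.4GB) is running there is barely enough
// headroom left for the executors these tests request. On a busy CI runner that makes
// executor allocation slow and racy, and the YarnClusterSuite apps time out waiting to finish.
// The CI hosts have plenty of RAM, so let the NM offer enough for the AM plus a few executors.
// NOTE(review): MiniYARNCluster appears to reset yarn.nodemanager.resource.memory-mb from
// yarn.minicluster.yarn.nodemanager.resource.memory-mb (default 4096) when it initializes each
// NodeManager; if so, that key must be set as well for this to take effect -- TODO confirm.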
yarnConf.setInt("yarn.nodemanager.resource.memory-mb", 8192)
yarnConf.setInt("yarn.scheduler.maximum-allocation-mb", 8192)

// Support both IPv4 and IPv6
yarnConf.set("yarn.resourcemanager.hostname", Utils.localHostNameForURI())

Expand Down Expand Up @@ -292,6 +300,18 @@ abstract class BaseYarnClusterSuite extends SparkFunSuite with Matchers {
props.setProperty(k, v)
}
}

// On a busy CI runner the in-JVM mini cluster, the driver and the container JVMs all compete
// for CPU, and the driver's RPC server occasionally cannot accept a connection in time. With
// the default of 3 retries an executor that loses this race gives up and exits, which leaves
// the application unable to finish and the suite times out. Give the executor->driver
// connection a larger retry budget so a transient stall does not permanently fail the app.
// Set after the spark.* JVM properties are copied above so these values are not silently
// overridden by an inherited -Dspark.rpc.io.* flag; individual tests can still override them
// via extraConf below.
props.setProperty("spark.rpc.io.maxRetries", "10")
props.setProperty("spark.rpc.io.retryWait", "2s")

extraConf.foreach { case (k, v) => props.setProperty(k, v) }

val propsFile = File.createTempFile("spark", ".properties", tempDir)
Expand Down