From bfaff1760d2346c12be9014a507b74a050d1ff3b Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 11 Jun 2026 15:36:18 -0400 Subject: [PATCH 1/3] Recover from protected mode on fleet start by resetting inactive partition nodes When protected mode disables a partition it sets the partition to INACTIVE but does not reset its nodes in the same iteration; that cleanup is normally done by a later clustermgtd iteration. Dynamic nodes that were powering up without a backing instance can therefore be left with a stale nodeaddr. Starting the compute fleet brought the partitions back UP without resetting these nodes, so right after start clustermgtd saw them again as bootstrap failures and sent the cluster straight back into protected mode, leaving the partitions INACTIVE. As documented, starting the fleet alone should be enough to recover from protected mode. Reset the nodes of INACTIVE partitions on fleet start, before bringing the partitions back UP. Only INACTIVE partitions are reset, so nodes of active partitions that may be running jobs are left untouched. --- CHANGELOG.md | 1 + src/common/schedulers/slurm_commands.py | 21 +++++++++++ src/slurm_plugin/fleet_status_manager.py | 7 +++- .../common/schedulers/test_slurm_commands.py | 35 +++++++++++++++++++ .../slurm_plugin/test_fleet_status_manager.py | 6 ++++ 5 files changed, 69 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4734402a8..857a77e00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ This file is used to list changes made in each version of the aws-parallelcluste **BUG FIXES** - Fix clustermgtd failing to detect compute node bootstrap timeouts, which prevented the cluster from entering protected mode. - Fix an issue where compute nodes are incorrectly replaced when launching a large number of nodes due to eventual consistency. +- Fix an issue where starting the compute fleet may not reliably recover the cluster from protected mode. 3.15.0 ------ diff --git a/src/common/schedulers/slurm_commands.py b/src/common/schedulers/slurm_commands.py index 3404e1aca..63fb71c11 100644 --- a/src/common/schedulers/slurm_commands.py +++ b/src/common/schedulers/slurm_commands.py @@ -221,6 +221,27 @@ def update_all_partitions(state, reset_node_addrs_hostname): return False +def reset_nodes_in_inactive_partitions(): + """ + Reset nodeaddr/nodehostname of the nodes belonging to INACTIVE partitions and set them back to idle. + + This is meant to be called when starting the compute fleet, before bringing partitions back UP, to clean up nodes + that an INACTIVE partition may have left behind (e.g. dynamic nodes that were powering up without a backing + instance when protected mode disabled the partition). If not reset, those nodes would be detected as bootstrap + failures right after start and send the cluster back into protected mode. Only INACTIVE partitions are considered, + so nodes of active partitions (which may be running jobs) are not affected. + """ + try: + inactive_nodes = [ + part.nodenames for part in get_partitions_info() if PartitionStatus(part.state) == PartitionStatus.INACTIVE + ] + if inactive_nodes: + log.info("Resetting nodes of INACTIVE partitions: %s", inactive_nodes) + set_nodes_idle(",".join(inactive_nodes), reset_node_addrs_hostname=True) + except Exception as e: + log.error("Failed when resetting nodes of INACTIVE partitions with error %s", e) + + def _batch_attribute(attribute, batch_size, expected_length=None): """Parse an attribute into batches.""" if type(attribute) is str: diff --git a/src/slurm_plugin/fleet_status_manager.py b/src/slurm_plugin/fleet_status_manager.py index ae58b0508..3ac35f173 100644 --- a/src/slurm_plugin/fleet_status_manager.py +++ b/src/slurm_plugin/fleet_status_manager.py @@ -19,7 +19,11 @@ from logging.config import fileConfig from botocore.config import Config -from common.schedulers.slurm_commands import resume_powering_down_nodes, update_all_partitions +from common.schedulers.slurm_commands import ( + reset_nodes_in_inactive_partitions, + resume_powering_down_nodes, + update_all_partitions, +) from slurm_plugin.clustermgtd import ComputeFleetStatus, ComputeFleetStatusManager from slurm_plugin.common import log_exception from slurm_plugin.instance_manager import InstanceManager @@ -91,6 +95,7 @@ def _manage_fleet_status_transition(config, computefleet_status_data_path): def _start_partitions(): log.info("Setting slurm partitions to UP and resuming nodes...") + reset_nodes_in_inactive_partitions() update_all_partitions(PartitionStatus.UP, reset_node_addrs_hostname=False) # TODO: This function was added due to Slurm ticket 12915. The bug is not reproducible and the ticket was then # closed. This operation may now be useless: we need to check this. diff --git a/tests/common/schedulers/test_slurm_commands.py b/tests/common/schedulers/test_slurm_commands.py index 083026f01..b748fcbdd 100644 --- a/tests/common/schedulers/test_slurm_commands.py +++ b/tests/common/schedulers/test_slurm_commands.py @@ -29,6 +29,7 @@ get_nodes_info, is_static_node, parse_nodename, + reset_nodes_in_inactive_partitions, resume_powering_down_nodes, set_nodes_down, set_nodes_drain, @@ -976,6 +977,40 @@ def test_update_all_partitions( update_partitions_spy.assert_called_with(partitions_to_update, state) +@pytest.mark.parametrize( + ("mock_partitions", "expected_idle_call"), + [ + # Only INACTIVE partitions are reset; the UP partition's nodes are left untouched. + ( + [ + SlurmPartition("part-1", "node-1,node-2", "INACTIVE"), + SlurmPartition("part-2", "node-3,node-4", "UP"), + SlurmPartition("part-3", "node-5,node-6", "INACTIVE"), + ], + call("node-1,node-2,node-5,node-6", reset_node_addrs_hostname=True), + ), + # No INACTIVE partition: nothing is reset. + ( + [ + SlurmPartition("part-1", "node-1,node-2", "UP"), + SlurmPartition("part-2", "node-3,node-4", "UP"), + ], + None, + ), + ], +) +def test_reset_nodes_in_inactive_partitions(mock_partitions, expected_idle_call, mocker): + set_nodes_idle_spy = mocker.patch("common.schedulers.slurm_commands.set_nodes_idle", autospec=True) + mocker.patch("common.schedulers.slurm_commands.get_partitions_info", return_value=mock_partitions, autospec=True) + + reset_nodes_in_inactive_partitions() + + if expected_idle_call: + set_nodes_idle_spy.assert_has_calls([expected_idle_call]) + else: + set_nodes_idle_spy.assert_not_called() + + def test_resume_powering_down_nodes(mocker): get_slurm_nodes_mocked = mocker.patch("common.schedulers.slurm_commands._get_slurm_nodes", autospec=True) update_nodes_mocked = mocker.patch("common.schedulers.slurm_commands.update_nodes", autospec=True) diff --git a/tests/slurm_plugin/test_fleet_status_manager.py b/tests/slurm_plugin/test_fleet_status_manager.py index ce272341d..24d63500c 100644 --- a/tests/slurm_plugin/test_fleet_status_manager.py +++ b/tests/slurm_plugin/test_fleet_status_manager.py @@ -130,11 +130,17 @@ def test_get_computefleet_status(test_datadir, config_file, expected_status): def test_start_partitions(mocker): + reset_nodes_in_inactive_partitions_mocked = mocker.patch( + "slurm_plugin.fleet_status_manager.reset_nodes_in_inactive_partitions" + ) update_all_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager.update_all_partitions") resume_powering_down_nodes_mocked = mocker.patch("slurm_plugin.fleet_status_manager.resume_powering_down_nodes") _start_partitions() + # INACTIVE partition nodes must be reset before partitions are brought back UP, so that nodes left behind by + # protected mode do not re-trigger protected mode right after start. + reset_nodes_in_inactive_partitions_mocked.assert_called_once() update_all_partitions_mocked.assert_called_once_with(PartitionStatus.UP, reset_node_addrs_hostname=False) resume_powering_down_nodes_mocked.assert_called_once() From c33aa5d5b79947c388eaa9a3b80732b36f4bafec Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Fri, 12 Jun 2026 16:05:54 -0400 Subject: [PATCH 2/3] Address review comments: reset inactive partition nodes to down with per-node filtering Set the nodes to 'down' instead of 'resume': while a partition is INACTIVE, Slurm does not seem to apply power-state transitions (clustermgtd's inactive cleanup uses 'down' for the same reason). Once the partitions are brought back UP, clustermgtd reconciles these down nodes to powered_down. Filter the nodes through needs_reset_when_inactive() instead of resetting all nodes of the inactive partitions, and skip partitions with no nodes to avoid building a malformed node list. Also assert in unit tests that the reset runs before partitions are brought back UP. --- src/common/schedulers/slurm_commands.py | 26 ++++++-- .../common/schedulers/test_slurm_commands.py | 62 ++++++++++++++----- .../slurm_plugin/test_fleet_status_manager.py | 29 ++++++--- 3 files changed, 88 insertions(+), 29 deletions(-) diff --git a/src/common/schedulers/slurm_commands.py b/src/common/schedulers/slurm_commands.py index 63fb71c11..136f0f1be 100644 --- a/src/common/schedulers/slurm_commands.py +++ b/src/common/schedulers/slurm_commands.py @@ -223,7 +223,7 @@ def update_all_partitions(state, reset_node_addrs_hostname): def reset_nodes_in_inactive_partitions(): """ - Reset nodeaddr/nodehostname of the nodes belonging to INACTIVE partitions and set them back to idle. + Reset nodeaddr/nodehostname and set to down the nodes belonging to INACTIVE partitions. This is meant to be called when starting the compute fleet, before bringing partitions back UP, to clean up nodes that an INACTIVE partition may have left behind (e.g. dynamic nodes that were powering up without a backing @@ -232,12 +232,26 @@ def reset_nodes_in_inactive_partitions(): so nodes of active partitions (which may be running jobs) are not affected. """ try: - inactive_nodes = [ - part.nodenames for part in get_partitions_info() if PartitionStatus(part.state) == PartitionStatus.INACTIVE + # Collect the node names of INACTIVE partitions, skipping partitions with no nodes to avoid building a + # malformed (e.g. empty or with consecutive commas) node list. + inactive_partition_nodes = [ + part.nodenames + for part in get_partitions_info() + if PartitionStatus(part.state) == PartitionStatus.INACTIVE and part.nodenames ] - if inactive_nodes: - log.info("Resetting nodes of INACTIVE partitions: %s", inactive_nodes) - set_nodes_idle(",".join(inactive_nodes), reset_node_addrs_hostname=True) + if not inactive_partition_nodes: + return + # Reset only the nodes that actually need it (e.g. nodeaddr still set), mirroring clustermgtd's inactive + # cleanup. + nodes_to_reset = { + node.name for node in get_nodes_info(",".join(inactive_partition_nodes)) if node.needs_reset_when_inactive() + } + if nodes_to_reset: + log.info( + "Resetting nodeaddr/nodehostname and setting to down the following nodes of INACTIVE partitions: %s", + sorted(nodes_to_reset), + ) + reset_nodes(nodes_to_reset, state="down", reason="inactive partition", raise_on_error=False) except Exception as e: log.error("Failed when resetting nodes of INACTIVE partitions with error %s", e) diff --git a/tests/common/schedulers/test_slurm_commands.py b/tests/common/schedulers/test_slurm_commands.py index b748fcbdd..f27c59fd7 100644 --- a/tests/common/schedulers/test_slurm_commands.py +++ b/tests/common/schedulers/test_slurm_commands.py @@ -978,37 +978,71 @@ def test_update_all_partitions( @pytest.mark.parametrize( - ("mock_partitions", "expected_idle_call"), + ("mock_partitions", "expected_get_nodes_arg", "inactive_nodes_info", "expected_reset_nodes"), [ - # Only INACTIVE partitions are reset; the UP partition's nodes are left untouched. + # Only INACTIVE partitions are considered; the UP partition's nodes are not queried/reset. Among the INACTIVE + # nodes, only those needing reset (nodeaddr still set) are reset. ( [ - SlurmPartition("part-1", "node-1,node-2", "INACTIVE"), - SlurmPartition("part-2", "node-3,node-4", "UP"), - SlurmPartition("part-3", "node-5,node-6", "INACTIVE"), + SlurmPartition("queue1", "queue1-dy-c5xlarge-1,queue1-dy-c5xlarge-2", "INACTIVE"), + SlurmPartition("queue2", "queue2-dy-c5xlarge-1", "UP"), + SlurmPartition("queue3", "queue3-dy-c5xlarge-1", "INACTIVE"), + ], + "queue1-dy-c5xlarge-1,queue1-dy-c5xlarge-2,queue3-dy-c5xlarge-1", + [ + # nodeaddr set (dirty) -> needs reset + DynamicNode("queue1-dy-c5xlarge-1", "1.2.3.4", "1.2.3.4", "IDLE+CLOUD+POWERING_UP", "queue1"), + # nodeaddr already equal to name and powered down (clean) -> does not need reset + DynamicNode( + "queue1-dy-c5xlarge-2", "queue1-dy-c5xlarge-2", "queue1-dy-c5xlarge-2", "DOWN+CLOUD", "queue1" + ), + DynamicNode("queue3-dy-c5xlarge-1", "1.2.3.5", "1.2.3.5", "IDLE+CLOUD+POWERING_UP", "queue3"), ], - call("node-1,node-2,node-5,node-6", reset_node_addrs_hostname=True), + {"queue1-dy-c5xlarge-1", "queue3-dy-c5xlarge-1"}, ), - # No INACTIVE partition: nothing is reset. + # No INACTIVE partition: get_nodes_info / reset_nodes are not called at all. ( [ - SlurmPartition("part-1", "node-1,node-2", "UP"), - SlurmPartition("part-2", "node-3,node-4", "UP"), + SlurmPartition("queue1", "queue1-dy-c5xlarge-1", "UP"), + SlurmPartition("queue2", "queue2-dy-c5xlarge-1", "UP"), + ], + None, + [], + None, + ), + # INACTIVE partition with no nodes: skipped so the node list is never malformed, nothing is queried/reset. + ( + [ + SlurmPartition("queue1", "", "INACTIVE"), ], None, + [], + None, ), ], ) -def test_reset_nodes_in_inactive_partitions(mock_partitions, expected_idle_call, mocker): - set_nodes_idle_spy = mocker.patch("common.schedulers.slurm_commands.set_nodes_idle", autospec=True) +def test_reset_nodes_in_inactive_partitions( + mock_partitions, expected_get_nodes_arg, inactive_nodes_info, expected_reset_nodes, mocker +): + reset_nodes_spy = mocker.patch("common.schedulers.slurm_commands.reset_nodes", autospec=True) + get_nodes_info_spy = mocker.patch( + "common.schedulers.slurm_commands.get_nodes_info", return_value=inactive_nodes_info, autospec=True + ) mocker.patch("common.schedulers.slurm_commands.get_partitions_info", return_value=mock_partitions, autospec=True) reset_nodes_in_inactive_partitions() - if expected_idle_call: - set_nodes_idle_spy.assert_has_calls([expected_idle_call]) + if expected_get_nodes_arg is None: + get_nodes_info_spy.assert_not_called() + reset_nodes_spy.assert_not_called() else: - set_nodes_idle_spy.assert_not_called() + get_nodes_info_spy.assert_called_once_with(expected_get_nodes_arg) + if expected_reset_nodes: + reset_nodes_spy.assert_called_once_with( + expected_reset_nodes, state="down", reason="inactive partition", raise_on_error=False + ) + else: + reset_nodes_spy.assert_not_called() def test_resume_powering_down_nodes(mocker): diff --git a/tests/slurm_plugin/test_fleet_status_manager.py b/tests/slurm_plugin/test_fleet_status_manager.py index 24d63500c..7f5093849 100644 --- a/tests/slurm_plugin/test_fleet_status_manager.py +++ b/tests/slurm_plugin/test_fleet_status_manager.py @@ -130,19 +130,30 @@ def test_get_computefleet_status(test_datadir, config_file, expected_status): def test_start_partitions(mocker): - reset_nodes_in_inactive_partitions_mocked = mocker.patch( - "slurm_plugin.fleet_status_manager.reset_nodes_in_inactive_partitions" + # Attach all mocks to a single manager so we can assert the order in which they are called. + manager = mocker.MagicMock() + manager.attach_mock( + mocker.patch("slurm_plugin.fleet_status_manager.reset_nodes_in_inactive_partitions"), + "reset_nodes_in_inactive_partitions", + ) + manager.attach_mock( + mocker.patch("slurm_plugin.fleet_status_manager.update_all_partitions"), "update_all_partitions" + ) + manager.attach_mock( + mocker.patch("slurm_plugin.fleet_status_manager.resume_powering_down_nodes"), "resume_powering_down_nodes" ) - update_all_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager.update_all_partitions") - resume_powering_down_nodes_mocked = mocker.patch("slurm_plugin.fleet_status_manager.resume_powering_down_nodes") _start_partitions() - # INACTIVE partition nodes must be reset before partitions are brought back UP, so that nodes left behind by - # protected mode do not re-trigger protected mode right after start. - reset_nodes_in_inactive_partitions_mocked.assert_called_once() - update_all_partitions_mocked.assert_called_once_with(PartitionStatus.UP, reset_node_addrs_hostname=False) - resume_powering_down_nodes_mocked.assert_called_once() + # INACTIVE partition nodes must be reset BEFORE partitions are brought back UP, otherwise nodes left behind by + # protected mode would be re-detected as bootstrap failures right after start and re-trigger protected mode. + assert_that(manager.mock_calls).is_equal_to( + [ + mocker.call.reset_nodes_in_inactive_partitions(), + mocker.call.update_all_partitions(PartitionStatus.UP, reset_node_addrs_hostname=False), + mocker.call.resume_powering_down_nodes(), + ] + ) def test_stop_partitions(mocker): From a167842d87be21957c928b7b235cd45dad5cf613 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Fri, 12 Jun 2026 17:33:23 -0400 Subject: [PATCH 3/3] Add log when there is no nodes in inactive partitions to reset for better debug --- src/common/schedulers/slurm_commands.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/common/schedulers/slurm_commands.py b/src/common/schedulers/slurm_commands.py index 136f0f1be..ee45b0a92 100644 --- a/src/common/schedulers/slurm_commands.py +++ b/src/common/schedulers/slurm_commands.py @@ -240,6 +240,7 @@ def reset_nodes_in_inactive_partitions(): if PartitionStatus(part.state) == PartitionStatus.INACTIVE and part.nodenames ] if not inactive_partition_nodes: + log.info("No nodes in INACTIVE partitions, nothing to reset.") return # Reset only the nodes that actually need it (e.g. nodeaddr still set), mirroring clustermgtd's inactive # cleanup. @@ -252,6 +253,8 @@ def reset_nodes_in_inactive_partitions(): sorted(nodes_to_reset), ) reset_nodes(nodes_to_reset, state="down", reason="inactive partition", raise_on_error=False) + else: + log.info("No nodes of INACTIVE partitions need to be reset.") except Exception as e: log.error("Failed when resetting nodes of INACTIVE partitions with error %s", e)