From 91b28bc43bc2c14b485098c64851d455614c103c Mon Sep 17 00:00:00 2001
From: Julia Kreger <juliaashleykreger@gmail.com>
Date: Wed, 14 May 2025 10:11:35 -0700
Subject: [PATCH] Fix agent get_XXX_steps retries from being treated as not
 fresh agents

It is possible that an agent is booting in an environment with firewalls
doing evil things like not closing sockets, or where a FIN-ACK never makes
it to the conductor, or whatever.

This can result in the client hanging and eventually timing out.

Ironic's agent client code automatically retries. Which is cool.

The agent records it got a command from the first attempt, and then again
from the retry. Everything goes swimmingly until Ironic goes to assess
if the agent is a "freshly booted" agent, or not. At which point, the
check logic would see the multiple "get_xxx_steps" calls in the agent
logs, and declare the agent not to be freshly booted due to the retry
attempts.

So instead, we now explicitly evaluate the results of the command in
whole to account for retires. This commit also adds additional tests
as the helper was previously only really being exercised with empty
lists in unit tests.

Closes-Bug: 2110698
Change-Id: I460751b761462dbb630368e474e207fed90f289a
---
 ironic/drivers/modules/agent_base.py          | 15 +++++---
 .../unit/drivers/modules/test_agent_base.py   | 38 +++++++++++++++++++
 ...s-with-agent-startup-aebfc36a775794c3.yaml | 11 ++++++
 3 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100644 releasenotes/notes/permit-retries-with-agent-startup-aebfc36a775794c3.yaml

diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py
index a5cae441da..3d15cdfb3b 100644
--- a/ironic/drivers/modules/agent_base.py
+++ b/ironic/drivers/modules/agent_base.py
@@ -224,11 +224,16 @@ def _freshly_booted(commands, step_type):
     agent executed will be get_XXX_steps. For later reboots the list of
     commands will be empty.
     """
-    return (
-        not commands
-        or (len(commands) == 1
-            and commands[0]['command_name'] == 'get_%s_steps' % step_type)
-    )
+    if not commands:
+        # Empty list, most likely unit testing or immediately after a reboot.
+        return True
+    step_name = 'get_%s_steps' % step_type
+    # Make a list of resulting commands which do not match the expected
+    # get_XXX_steps command.
+    result = [x for x in commands if not x['command_name'] == step_name]
+    # If the length of the result is greater than 0, then this is not a freshly
+    # booted agent.
+    return not len(result) > 0
 
 
 def _get_completed_command(task, commands, step_type):
diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py
index 62ee99cb1f..010d24f40f 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_base.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_base.py
@@ -1689,3 +1689,41 @@ class StepMethodsTestCase(db_base.DbTestCase):
             response = agent_base.execute_step(
                 task, self.clean_steps['deploy'][0], 'clean')
             self.assertEqual(states.CLEANWAIT, response)
+
+
+class FreshlyBootedTestCase(db_base.DbTestCase):
+
+    def setUp(self):
+        super(FreshlyBootedTestCase, self).setUp()
+
+    def test__freshly_booted_empty_result(self):
+        commands = []
+        self.assertTrue(agent_base._freshly_booted(commands, 'deploy'))
+
+    def test__freshly_booted_single_command(self):
+        commands = [{'command_name': 'get_deploy_steps'}]
+        self.assertTrue(agent_base._freshly_booted(commands, 'deploy'))
+
+    def test__freshly_booted_single_command_mismatch(self):
+        commands = [{'command_name': 'get_service_steps'}]
+        self.assertFalse(agent_base._freshly_booted(commands, 'deploy'))
+
+    def test__freshly_booted_has_retries(self):
+        # NOTE(TheJulia): this is just an arbitrary number
+        # of retires to account for lossy/problematic networks
+        commands = [
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'}]
+        self.assertTrue(agent_base._freshly_booted(commands, 'deploy'))
+
+    def test__freshly_booted_multi_command(self):
+        commands = [
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_service_steps'}]
+        self.assertFalse(agent_base._freshly_booted(commands, 'deploy'))
diff --git a/releasenotes/notes/permit-retries-with-agent-startup-aebfc36a775794c3.yaml b/releasenotes/notes/permit-retries-with-agent-startup-aebfc36a775794c3.yaml
new file mode 100644
index 0000000000..e358e75a68
--- /dev/null
+++ b/releasenotes/notes/permit-retries-with-agent-startup-aebfc36a775794c3.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    Fixes an issue with agent startup where the workflow from the first
+    agent heartbeat interaction could fail due to a transient networking
+    issue leaving the Agent and Ironic in a state where the node cannot be
+    deployed and continues to record errors upon each additional heartbeat
+    operation. Logic to check the state of the agent has been adjusted to
+    ignore retry operations which were recorded by the agent.
+    More information on this issue can be found in
+    `bug 2110698 <https://bugs.launchpad.net/ironic/+bug/2110698>`_.