diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py
index a5cae441da..3d15cdfb3b 100644
--- a/ironic/drivers/modules/agent_base.py
+++ b/ironic/drivers/modules/agent_base.py
@@ -224,11 +224,13 @@ def _freshly_booted(commands, step_type):
     agent executed will be get_XXX_steps. For later reboots the list of
     commands will be empty.
     """
-    return (
-        not commands
-        or (len(commands) == 1
-            and commands[0]['command_name'] == 'get_%s_steps' % step_type)
-    )
+    step_name = 'get_%s_steps' % step_type
+    # A retried first request can leave several get_XXX_steps entries in
+    # the command list. The agent still counts as freshly booted as long
+    # as no other command has been recorded.
+    # NOTE: all() is True for an empty list, which covers an agent that
+    # has not executed any command yet.
+    return all(cmd['command_name'] == step_name for cmd in commands)
 
 
 def _get_completed_command(task, commands, step_type):
diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py
index 62ee99cb1f..010d24f40f 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_base.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_base.py
@@ -1689,3 +1689,38 @@ class StepMethodsTestCase(db_base.DbTestCase):
         response = agent_base.execute_step(
             task, self.clean_steps['deploy'][0], 'clean')
         self.assertEqual(states.CLEANWAIT, response)
+
+
+class FreshlyBootedTestCase(db_base.DbTestCase):
+
+    def test__freshly_booted_empty_result(self):
+        commands = []
+        self.assertTrue(agent_base._freshly_booted(commands, 'deploy'))
+
+    def test__freshly_booted_single_command(self):
+        commands = [{'command_name': 'get_deploy_steps'}]
+        self.assertTrue(agent_base._freshly_booted(commands, 'deploy'))
+
+    def test__freshly_booted_single_command_mismatch(self):
+        commands = [{'command_name': 'get_service_steps'}]
+        self.assertFalse(agent_base._freshly_booted(commands, 'deploy'))
+
+    def test__freshly_booted_has_retries(self):
+        # NOTE(TheJulia): this is just an arbitrary number
+        # of retries to account for lossy/problematic networks
+        commands = [
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'}]
+        self.assertTrue(agent_base._freshly_booted(commands, 'deploy'))
+
+    def test__freshly_booted_multi_command(self):
+        commands = [
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_deploy_steps'},
+            {'command_name': 'get_service_steps'}]
+        self.assertFalse(agent_base._freshly_booted(commands, 'deploy'))
diff --git a/releasenotes/notes/permit-retries-with-agent-startup-aebfc36a775794c3.yaml b/releasenotes/notes/permit-retries-with-agent-startup-aebfc36a775794c3.yaml
new file mode 100644
index 0000000000..e358e75a68
--- /dev/null
+++ b/releasenotes/notes/permit-retries-with-agent-startup-aebfc36a775794c3.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    Fixes an issue with agent startup where the workflow from the first
+    agent heartbeat interaction could fail due to a transient networking
+    issue leaving the Agent and Ironic in a state where the node cannot be
+    deployed and continues to record errors upon each additional heartbeat
+    operation. Logic to check the state of the agent has been adjusted to
+    ignore retry operations which were recorded by the agent.
+    More information on this issue can be found in
+    `bug 2110698 <https://bugs.launchpad.net/ironic/+bug/2110698>`_.