Don't fail when node is in CLEANFAIL state
When a timeout occurs when a node is in CLEANWAIT state, the conductor puts it into the CLEANFAIL state. However, it tries to do that twice, and our state machine doesn't support moving from a CLEANFAIL state to another state via the 'fail' verb/event. The code was changed so that it doesn't try to move it to CLEANFAIL twice, and a check is put to prevent the node from being 'failed' frome a CLEANFAIL state. Change-Id: Ieeb77dd28a5d3053588c46fe2a700b5e6ceabbd7 Story: 2004299 Task: 27855
This commit is contained in:
@@ -394,8 +394,10 @@ def cleanup_cleanwait_timeout(task):
|
|||||||
"check if the ramdisk responsible for the cleaning is "
|
"check if the ramdisk responsible for the cleaning is "
|
||||||
"running on the node. Failed on step %(step)s.") %
|
"running on the node. Failed on step %(step)s.") %
|
||||||
{'step': task.node.clean_step})
|
{'step': task.node.clean_step})
|
||||||
cleaning_error_handler(task, msg=last_error,
|
# NOTE(rloo): this is called from the periodic task for cleanwait timeouts,
|
||||||
set_fail_state=True)
|
# via the task manager's process_event(). The node has already been moved
|
||||||
|
# to CLEANFAIL, so the error handler doesn't need to set the fail state.
|
||||||
|
cleaning_error_handler(task, msg=last_error, set_fail_state=False)
|
||||||
|
|
||||||
|
|
||||||
def cleaning_error_handler(task, msg, tear_down_cleaning=True,
|
def cleaning_error_handler(task, msg, tear_down_cleaning=True,
|
||||||
@@ -432,7 +434,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
|
|||||||
node.fault = faults.CLEAN_FAILURE
|
node.fault = faults.CLEAN_FAILURE
|
||||||
node.save()
|
node.save()
|
||||||
|
|
||||||
if set_fail_state:
|
if set_fail_state and node.provision_state != states.CLEANFAIL:
|
||||||
target_state = states.MANAGEABLE if manual_clean else None
|
target_state = states.MANAGEABLE if manual_clean else None
|
||||||
task.process_event('fail', target_state=target_state)
|
task.process_event('fail', target_state=target_state)
|
||||||
|
|
||||||
|
@@ -1335,7 +1335,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
|
|||||||
msg="Timeout reached while cleaning the node. Please "
|
msg="Timeout reached while cleaning the node. Please "
|
||||||
"check if the ramdisk responsible for the cleaning is "
|
"check if the ramdisk responsible for the cleaning is "
|
||||||
"running on the node. Failed on step {}.",
|
"running on the node. Failed on step {}.",
|
||||||
set_fail_state=True)
|
set_fail_state=False)
|
||||||
|
|
||||||
def test_cleanup_cleanwait_timeout(self):
|
def test_cleanup_cleanwait_timeout(self):
|
||||||
self.node.provision_state = states.CLEANFAIL
|
self.node.provision_state = states.CLEANFAIL
|
||||||
@@ -1352,17 +1352,19 @@ class ErrorHandlersTestCase(tests_base.TestCase):
|
|||||||
conductor_utils.cleanup_cleanwait_timeout(self.task)
|
conductor_utils.cleanup_cleanwait_timeout(self.task)
|
||||||
self.assertEqual({}, self.node.clean_step)
|
self.assertEqual({}, self.node.clean_step)
|
||||||
self.assertNotIn('clean_step_index', self.node.driver_internal_info)
|
self.assertNotIn('clean_step_index', self.node.driver_internal_info)
|
||||||
self.task.process_event.assert_called_once_with('fail',
|
self.assertFalse(self.task.process_event.called)
|
||||||
target_state=None)
|
|
||||||
self.assertTrue(self.node.maintenance)
|
self.assertTrue(self.node.maintenance)
|
||||||
self.assertEqual(clean_error, self.node.maintenance_reason)
|
self.assertEqual(clean_error, self.node.maintenance_reason)
|
||||||
self.assertEqual('clean failure', self.node.fault)
|
self.assertEqual('clean failure', self.node.fault)
|
||||||
|
|
||||||
def test_cleaning_error_handler(self):
|
def _test_cleaning_error_handler(self, prov_state=states.CLEANING):
|
||||||
self.node.provision_state = states.CLEANING
|
self.node.provision_state = prov_state
|
||||||
target = 'baz'
|
target = 'baz'
|
||||||
self.node.target_provision_state = target
|
self.node.target_provision_state = target
|
||||||
self.node.driver_internal_info = {}
|
self.node.clean_step = {'key': 'val'}
|
||||||
|
self.node.driver_internal_info = {
|
||||||
|
'cleaning_reboot': True,
|
||||||
|
'clean_step_index': 0}
|
||||||
msg = 'error bar'
|
msg = 'error bar'
|
||||||
conductor_utils.cleaning_error_handler(self.task, msg)
|
conductor_utils.cleaning_error_handler(self.task, msg)
|
||||||
self.node.save.assert_called_once_with()
|
self.node.save.assert_called_once_with()
|
||||||
@@ -1374,8 +1376,20 @@ class ErrorHandlersTestCase(tests_base.TestCase):
|
|||||||
self.assertEqual('clean failure', self.node.fault)
|
self.assertEqual('clean failure', self.node.fault)
|
||||||
driver = self.task.driver.deploy
|
driver = self.task.driver.deploy
|
||||||
driver.tear_down_cleaning.assert_called_once_with(self.task)
|
driver.tear_down_cleaning.assert_called_once_with(self.task)
|
||||||
self.task.process_event.assert_called_once_with('fail',
|
if prov_state == states.CLEANFAIL:
|
||||||
target_state=None)
|
self.assertFalse(self.task.process_event.called)
|
||||||
|
else:
|
||||||
|
self.task.process_event.assert_called_once_with('fail',
|
||||||
|
target_state=None)
|
||||||
|
|
||||||
|
def test_cleaning_error_handler(self):
|
||||||
|
self._test_cleaning_error_handler()
|
||||||
|
|
||||||
|
def test_cleaning_error_handler_cleanwait(self):
|
||||||
|
self._test_cleaning_error_handler(prov_state=states.CLEANWAIT)
|
||||||
|
|
||||||
|
def test_cleaning_error_handler_cleanfail(self):
|
||||||
|
self._test_cleaning_error_handler(prov_state=states.CLEANFAIL)
|
||||||
|
|
||||||
def test_cleaning_error_handler_manual(self):
|
def test_cleaning_error_handler_manual(self):
|
||||||
target = states.MANAGEABLE
|
target = states.MANAGEABLE
|
||||||
|
@@ -0,0 +1,8 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Fixes an issue with a baremetal node that times out during cleaning.
|
||||||
|
The ironic-conductor was attempting to change the node's provision state
|
||||||
|
to 'clean failed' twice, resulting in the node's ``last_error`` being set
|
||||||
|
incorrectly. This no longer happens. For more information, see
|
||||||
|
`story 2004299 <https://storyboard.openstack.org/#!/story/2004299>`_.
|
Reference in New Issue
Block a user