Add cleanwait timeout cleanup process
Previously, if a node in a cleaning state timed out, the timeout process would not purge certain items from the node's configuration which resulted in a short circuiting of the logic cleaning being retried. This was a result of the node clean_step configuration not being purged upon a timeout occuring. This change adds a wrapper method around the cleaning failure error handler to allow the _fail_if_in_state method to call the error handler, since error handler syntax is not uniform and the _fail_if_in_state cannot pass arguments. It also changes the cleaning error handler to permit the error handler to delete the node clean_step, and cleaning related driver_internal_info configuration from a node in the event the node in in CLEANFAIL state. Change-Id: I9ee5c0b385648c9b7e1d330d5d1af9b2c486a436 Closes-Bug: #1590146
This commit is contained in:
@@ -1356,13 +1356,10 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
'provision_state': states.CLEANWAIT,
|
||||
'maintenance': False,
|
||||
'provisioned_before': callback_timeout}
|
||||
last_error = _("Timeout reached while cleaning the node. Please "
|
||||
"check if the ramdisk responsible for the cleaning is "
|
||||
"running on the node.")
|
||||
self._fail_if_in_state(context, filters, states.CLEANWAIT,
|
||||
'provision_updated_at',
|
||||
last_error=last_error,
|
||||
keep_target_state=True)
|
||||
keep_target_state=True,
|
||||
callback_method=utils.cleanup_cleanwait_timeout)
|
||||
|
||||
@periodics.periodic(spacing=CONF.conductor.sync_local_state_interval)
|
||||
def _sync_local_state(self, context):
|
||||
|
@@ -202,11 +202,27 @@ def provisioning_error_handler(e, node, provision_state,
|
||||
'tgt_prov_state': target_provision_state})
|
||||
|
||||
|
||||
def cleanup_cleanwait_timeout(task):
|
||||
"""Cleanup a cleaning task after timeout.
|
||||
|
||||
:param task: a TaskManager instance.
|
||||
"""
|
||||
last_error = (_("Timeout reached while cleaning the node. Please "
|
||||
"check if the ramdisk responsible for the cleaning is "
|
||||
"running on the node. Failed on step %(step)s.") %
|
||||
{'step': task.node.clean_step})
|
||||
cleaning_error_handler(task, msg=last_error,
|
||||
set_fail_state=True)
|
||||
|
||||
|
||||
def cleaning_error_handler(task, msg, tear_down_cleaning=True,
|
||||
set_fail_state=True):
|
||||
"""Put a failed node in CLEANFAIL and maintenance."""
|
||||
node = task.node
|
||||
if node.provision_state in (states.CLEANING, states.CLEANWAIT):
|
||||
if node.provision_state in (
|
||||
states.CLEANING,
|
||||
states.CLEANWAIT,
|
||||
states.CLEANFAIL):
|
||||
# Clear clean step, msg should already include current step
|
||||
node.clean_step = {}
|
||||
info = node.driver_internal_info
|
||||
|
@@ -1100,7 +1100,13 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANWAIT,
|
||||
target_provision_state=tgt_prov_state,
|
||||
provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0))
|
||||
provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0),
|
||||
clean_step={
|
||||
'interface': 'deploy',
|
||||
'step': 'erase_devices'},
|
||||
driver_internal_info={
|
||||
'cleaning_reboot': manual,
|
||||
'clean_step_index': 0})
|
||||
|
||||
self.service._check_cleanwait_timeouts(self.context)
|
||||
self._stop_service()
|
||||
@@ -1108,6 +1114,11 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
self.assertEqual(states.CLEANFAIL, node.provision_state)
|
||||
self.assertEqual(tgt_prov_state, node.target_provision_state)
|
||||
self.assertIsNotNone(node.last_error)
|
||||
# Test that cleaning parameters have been purged in order
|
||||
# to prevent looping of the cleaning sequence
|
||||
self.assertEqual({}, node.clean_step)
|
||||
self.assertNotIn('clean_step_index', node.driver_internal_info)
|
||||
self.assertNotIn('cleaning_reboot', node.driver_internal_info)
|
||||
|
||||
def test__check_cleanwait_timeouts_automated_clean(self):
|
||||
self._check_cleanwait_timeouts()
|
||||
|
@@ -572,6 +572,37 @@ class ErrorHandlersTestCase(tests_base.TestCase):
|
||||
self.assertFalse(self.node.save.called)
|
||||
self.assertFalse(log_mock.warning.called)
|
||||
|
||||
@mock.patch.object(conductor_utils, 'cleaning_error_handler')
|
||||
def test_cleanup_cleanwait_timeout_handler_call(self, mock_error_handler):
|
||||
self.node.clean_step = {}
|
||||
conductor_utils.cleanup_cleanwait_timeout(self.task)
|
||||
|
||||
mock_error_handler.assert_called_once_with(
|
||||
self.task,
|
||||
msg="Timeout reached while cleaning the node. Please "
|
||||
"check if the ramdisk responsible for the cleaning is "
|
||||
"running on the node. Failed on step {}.",
|
||||
set_fail_state=True)
|
||||
|
||||
def test_cleanup_cleanwait_timeout(self):
|
||||
self.node.provision_state = states.CLEANFAIL
|
||||
target = 'baz'
|
||||
self.node.target_provision_state = target
|
||||
self.node.driver_internal_info = {}
|
||||
self.node.clean_step = {'key': 'val'}
|
||||
clean_error = ("Timeout reached while cleaning the node. Please "
|
||||
"check if the ramdisk responsible for the cleaning is "
|
||||
"running on the node. Failed on step {'key': 'val'}.")
|
||||
self.node.driver_internal_info = {
|
||||
'cleaning_reboot': True,
|
||||
'clean_step_index': 0}
|
||||
conductor_utils.cleanup_cleanwait_timeout(self.task)
|
||||
self.assertEqual({}, self.node.clean_step)
|
||||
self.assertNotIn('clean_step_index', self.node.driver_internal_info)
|
||||
self.task.process_event.assert_called_once()
|
||||
self.assertTrue(self.node.maintenance)
|
||||
self.assertEqual(clean_error, self.node.maintenance_reason)
|
||||
|
||||
def test_cleaning_error_handler(self):
|
||||
self.node.provision_state = states.CLEANING
|
||||
target = 'baz'
|
||||
|
@@ -0,0 +1,7 @@
|
||||
---
|
||||
fixes:
|
||||
- A bug has been corrected where a node's current clean_step
|
||||
was not purged upon that node timing out from a CLEANWAIT state.
|
||||
Previously, this bug would prevent a user from retrying cleaning
|
||||
operations. For more information,
|
||||
see https://bugs.launchpad.net/ironic/+bug/1590146.
|
Reference in New Issue
Block a user