From 2411779ee847f68e4e73f373d8e79e264f9c2d8b Mon Sep 17 00:00:00 2001 From: Iury Gregory Melo Ferreira Date: Fri, 20 Dec 2024 00:16:08 -0300 Subject: [PATCH] Handle unresponsive BMC during Firmware Updates When doing firmware updates for BMC, we saw cases where Ironic wouldn't be able to contact the BMC, marking the node in a failed state because of it. This patch adds a configuration option that tell for how long ironic should wait before proceeding with the reboot to finish the update. We will attempt to improve the waiting time in a follow-up, trying to identify when the bmc was unresponsive and when it was back. Closes-Bug: #2092398 Change-Id: I53ffc8a06d5af8b0751553c3d4a9bb1c000027ae Signed-off-by: Iury Gregory Melo Ferreira --- ironic/conf/redfish.py | 5 ++ ironic/drivers/modules/redfish/firmware.py | 22 +++++ ironic/drivers/modules/redfish/utils.py | 11 ++- .../drivers/modules/redfish/test_firmware.py | 83 +++++++++++++++++++ .../drivers/modules/redfish/test_utils.py | 10 +++ .../notes/bug-2092398-45f65c06a84d396a.yaml | 13 +++ 6 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/bug-2092398-45f65c06a84d396a.yaml diff --git a/ironic/conf/redfish.py b/ironic/conf/redfish.py index 7ef0ff4b61..6fc023b9ca 100644 --- a/ironic/conf/redfish.py +++ b/ironic/conf/redfish.py @@ -90,6 +90,11 @@ opts = [ default=60, help=_('Number of seconds to wait between checking for ' 'failed firmware update tasks')), + cfg.IntOpt('firmware_update_wait_unresponsive_bmc', + min=0, + default=300, + help=_('Number of seconds to wait before proceeding with the ' + 'reboot to finish the BMC firmware update step')), cfg.StrOpt('firmware_source', choices=[('http', _('If firmware source URL is also HTTP, then ' 'serve from original location, otherwise ' diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 2500cd54a6..e921aee579 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -11,6 +11,7 @@ # License for the specific language governing permissions and limitations # under the License. +import time from urllib.parse import urlparse from oslo_log import log @@ -232,6 +233,27 @@ class RedfishFirmware(base.FirmwareInterface): {'node': node.uuid, 'error': e.message}) raise exception.RedfishError(error=e) + # NOTE(iurygregory): In case we are doing firmware updates we need to + # account for unresponsive BMC, in this case we wait for a set of + # minutes before proceeding to the power actions. + # In case the node has firmware_update_unresponsive_bmc_wait set we + # give priority over the configuration option. + wait_unres_bmc = ( + node.driver_info.get('firmware_update_unresponsive_bmc_wait') + or CONF.redfish.firmware_update_wait_unresponsive_bmc + ) + LOG.debug('BMC firmware update in progress. Waiting %(wait_time)s ' + 'seconds before proceeding to reboot the node %(node_uuid)s ' + 'to complete the step', {'node_uuid': node.uuid, + 'wait_time': wait_unres_bmc}) + # TODO(iurygregory): Improve the logic here to identify if the BMC + # is back, so we don't have to unconditionally wait. + # The wait_unres_bmc will be the maximum time to wait. + time.sleep(wait_unres_bmc) + LOG.debug('Wait completed. Proceeding to reboot the node ' + '%(node_uuid)s to complete the step.', + {'node_uuid': node.uuid}) + fw_upd['task_monitor'] = task_monitor.task_monitor_uri node.set_driver_internal_info('redfish_fw_updates', settings) diff --git a/ironic/drivers/modules/redfish/utils.py b/ironic/drivers/modules/redfish/utils.py index b00352492e..e1c95d470f 100644 --- a/ironic/drivers/modules/redfish/utils.py +++ b/ironic/drivers/modules/redfish/utils.py @@ -71,7 +71,12 @@ OPTIONAL_PROPERTIES = { 'redfish_auth_type': _('Redfish HTTP client authentication method. Can be ' '"basic", "session" or "auto". If not set, the ' 'default value is taken from Ironic ' - 'configuration as ``[redfish]auth_type`` option.') + 'configuration as ``[redfish]auth_type`` option.'), + 'firmware_update_unresponsive_bmc_wait': _( + 'Number of seconds to wait to avoid the BMC becoming unresponsive ' + 'during firmware updates. If not set, the default value is taken from ' + 'the Ironic configuration ``firmware_update_wait_unresponsive_bmc`` ' + 'in the ``[redfish]`` section.') } COMMON_PROPERTIES = REQUIRED_PROPERTIES.copy() @@ -193,6 +198,10 @@ def parse_driver_info(node): if root_prefix: sushy_params['root_prefix'] = root_prefix + unres_bmc_wait = driver_info.get('firmware_update_unresponsive_bmc_wait') + if unres_bmc_wait: + sushy_params['firmware_update_unresponsive_bmc_wait'] = unres_bmc_wait + return sushy_params diff --git a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py index dbdb35351d..867b47b0e6 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_firmware.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_firmware.py @@ -13,8 +13,10 @@ import datetime import json +import time from unittest import mock +from oslo_config import cfg from oslo_utils import timeutils import sushy @@ -352,6 +354,7 @@ class RedfishFirmwareTestCase(db_base.DbTestCase): self.node.save() self._test_invalid_settings_service() + @mock.patch.object(time, 'sleep', lambda seconds: None) def _generate_new_driver_internal_info(self, components=[], invalid=False, add_wait=False, wait=1): bmc_component = {'component': 'bmc', 'url': 'https://bmc/v1.0.1'} @@ -795,6 +798,8 @@ class RedfishFirmwareTestCase(db_base.DbTestCase): def test_continue_updates_more_updates(self, get_system_collection_mock, node_power_action_mock, log_mock): + cfg.CONF.set_override('firmware_update_wait_unresponsive_bmc', 0, + 'redfish') self._generate_new_driver_internal_info(['bmc', 'bios']) task_monitor_mock = mock.Mock() @@ -829,6 +834,7 @@ class RedfishFirmwareTestCase(db_base.DbTestCase): @mock.patch.object(redfish_utils, 'get_system', autospec=True) @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) + @mock.patch.object(time, 'sleep', lambda seconds: None) def test__execute_firmware_update_no_targets(self, get_system_collection_mock, system_mock): @@ -855,6 +861,7 @@ class RedfishFirmwareTestCase(db_base.DbTestCase): @mock.patch.object(redfish_utils, 'get_system', autospec=True) @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) + @mock.patch.object(time, 'sleep', lambda seconds: None) def test__execute_firmware_update_targets(self, get_system_collection_mock, system_mock): @@ -878,3 +885,79 @@ class RedfishFirmwareTestCase(db_base.DbTestCase): settings) update_service_mock.simple_update.assert_called_once_with( 'https://bios/v1.0.1', targets=[mock.ANY]) + + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) + @mock.patch.object(time, 'sleep', autospec=True) + def test__execute_firmware_update_unresponsive_bmc(self, sleep_mock, + get_sys_collec_mock, + system_mock): + cfg.CONF.set_override('firmware_update_wait_unresponsive_bmc', 1, + 'redfish') + self._generate_new_driver_internal_info(['bmc']) + with open( + 'ironic/tests/json_samples/systems_collection_single.json' + ) as f: + resp_obj = json.load(f) + system_collection_mock = mock.MagicMock() + system_collection_mock.get_members.return_value = resp_obj['Members'] + get_sys_collec_mock.return_value = system_collection_mock + + task_monitor_mock = mock.Mock() + task_monitor_mock.task_monitor_uri = '/task/2' + update_service_mock = mock.Mock() + update_service_mock.simple_update.return_value = task_monitor_mock + + firmware = redfish_fw.RedfishFirmware() + + settings = [{'component': 'bmc', 'url': 'https://bmc/v1.2.3'}] + firmware._execute_firmware_update(self.node, update_service_mock, + settings) + + update_service_mock.simple_update.assert_called_once_with( + 'https://bmc/v1.2.3') + sleep_mock.assert_called_once_with( + CONF.redfish.firmware_update_wait_unresponsive_bmc) + + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + @mock.patch.object(redfish_utils, 'get_system_collection', autospec=True) + @mock.patch.object(time, 'sleep', autospec=True) + def test__execute_firmware_update_unresponsive_bmc_node_override( + self, sleep_mock, get_sys_collec_mock, system_mock): + self._generate_new_driver_internal_info(['bmc']) + # Set a specific value for firmware_update_unresponsive_bmc_wait for + # the node + with mock.patch('time.sleep', lambda x: None): + d_info = self.node.driver_info.copy() + d_info['firmware_update_unresponsive_bmc_wait'] = 1 + self.node.driver_info = d_info + self.node.save() + + self.assertNotEqual( + CONF.redfish.firmware_update_wait_unresponsive_bmc, + self.node.driver_info.get('firmware_update_unresponsive_bmc_wait') + ) + + with open( + 'ironic/tests/json_samples/systems_collection_single.json' + ) as f: + resp_obj = json.load(f) + system_collection_mock = mock.MagicMock() + system_collection_mock.get_members.return_value = resp_obj['Members'] + get_sys_collec_mock.return_value = system_collection_mock + + task_monitor_mock = mock.Mock() + task_monitor_mock.task_monitor_uri = '/task/2' + update_service_mock = mock.Mock() + update_service_mock.simple_update.return_value = task_monitor_mock + + firmware = redfish_fw.RedfishFirmware() + settings = [{'component': 'bmc', 'url': 'https://bmc/v1.2.3'}] + firmware._execute_firmware_update(self.node, update_service_mock, + settings) + + update_service_mock.simple_update.assert_called_once_with( + 'https://bmc/v1.2.3') + sleep_mock.assert_called_once_with( + self.node.driver_info.get('firmware_update_unresponsive_bmc_wait') + ) diff --git a/ironic/tests/unit/drivers/modules/redfish/test_utils.py b/ironic/tests/unit/drivers/modules/redfish/test_utils.py index 4d2597e1c0..c8a694593f 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_utils.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_utils.py @@ -59,6 +59,14 @@ class RedfishUtilsTestCase(db_base.DbTestCase): response = redfish_utils.parse_driver_info(self.node) self.assertEqual(self.parsed_driver_info, response) + def test_parse_driver_info_firmware_update_unresponsive_bmc_wait_set( + self): + self.node.driver_info['firmware_update_unresponsive_bmc_wait'] = 30 + self.parsed_driver_info = redfish_utils.parse_driver_info(self.node) + self.assertEqual( + self.parsed_driver_info['firmware_update_unresponsive_bmc_wait'], + 30) + def test_parse_driver_info_default_scheme(self): self.node.driver_info['redfish_address'] = 'example.com' response = redfish_utils.parse_driver_info(self.node) @@ -267,6 +275,7 @@ class RedfishUtilsAuthTestCase(db_base.DbTestCase): 'password': 'password', 'verify_ca': True, 'auth_type': 'auto', + 'firmware_update_unresponsive_bmc_wait': 300, 'node_uuid': self.node.uuid } @@ -421,6 +430,7 @@ class RedfishUtilsSystemTestCase(db_base.DbTestCase): 'password': 'password', 'verify_ca': True, 'auth_type': 'auto', + 'firmware_update_unresponsive_bmc_wait': 300, 'node_uuid': self.node.uuid } diff --git a/releasenotes/notes/bug-2092398-45f65c06a84d396a.yaml b/releasenotes/notes/bug-2092398-45f65c06a84d396a.yaml new file mode 100644 index 0000000000..ec6d8de6d2 --- /dev/null +++ b/releasenotes/notes/bug-2092398-45f65c06a84d396a.yaml @@ -0,0 +1,13 @@ +--- +fixes: + - | + [`Bug 2092398 `_] + Fixes an issue with node servicing/cleaning that caused the node to enter + into `service failed` or `clean failed` state after doing a bmc firmware + update, due to the BMC being unresponsive to requests during the update. + Now when doing a BMC update, we wait some time before proceeding with the + reboot to finish the update. + The time is configurable and can be changed via the config option + ``[redfish]firmware_update_wait_unresponsive_bmc`` (default, 300 seconds) + or by setting ``firmware_update_unresponsive_bmc_wait`` in the + ``driver-info``.