Handle unresponsive BMC during Firmware Updates

When doing firmware updates for BMC, we saw cases where Ironic wouldn't
be able to contact the BMC, marking the node in a failed state because
of it.
This patch adds a configuration option that tell for how long ironic
should wait before proceeding with the reboot to finish the update.
We will attempt to improve the waiting time in a follow-up, trying
to identify when the bmc was unresponsive and when it was back.

Closes-Bug: #2092398

Change-Id: I53ffc8a06d5af8b0751553c3d4a9bb1c000027ae
Signed-off-by: Iury Gregory Melo Ferreira <imelofer@redhat.com>
This commit is contained in:
Iury Gregory Melo Ferreira
2024-12-20 00:16:08 -03:00
parent d4b2ce44fc
commit 2411779ee8
6 changed files with 143 additions and 1 deletions

View File

@@ -90,6 +90,11 @@ opts = [
default=60,
help=_('Number of seconds to wait between checking for '
'failed firmware update tasks')),
cfg.IntOpt('firmware_update_wait_unresponsive_bmc',
min=0,
default=300,
help=_('Number of seconds to wait before proceeding with the '
'reboot to finish the BMC firmware update step')),
cfg.StrOpt('firmware_source',
choices=[('http', _('If firmware source URL is also HTTP, then '
'serve from original location, otherwise '

View File

@@ -11,6 +11,7 @@
# License for the specific language governing permissions and limitations
# under the License.
import time
from urllib.parse import urlparse
from oslo_log import log
@@ -232,6 +233,27 @@ class RedfishFirmware(base.FirmwareInterface):
{'node': node.uuid, 'error': e.message})
raise exception.RedfishError(error=e)
# NOTE(iurygregory): In case we are doing firmware updates we need to
# account for unresponsive BMC, in this case we wait for a set of
# minutes before proceeding to the power actions.
# In case the node has firmware_update_unresponsive_bmc_wait set we
# give priority over the configuration option.
wait_unres_bmc = (
node.driver_info.get('firmware_update_unresponsive_bmc_wait')
or CONF.redfish.firmware_update_wait_unresponsive_bmc
)
LOG.debug('BMC firmware update in progress. Waiting %(wait_time)s '
'seconds before proceeding to reboot the node %(node_uuid)s '
'to complete the step', {'node_uuid': node.uuid,
'wait_time': wait_unres_bmc})
# TODO(iurygregory): Improve the logic here to identify if the BMC
# is back, so we don't have to unconditionally wait.
# The wait_unres_bmc will be the maximum time to wait.
time.sleep(wait_unres_bmc)
LOG.debug('Wait completed. Proceeding to reboot the node '
'%(node_uuid)s to complete the step.',
{'node_uuid': node.uuid})
fw_upd['task_monitor'] = task_monitor.task_monitor_uri
node.set_driver_internal_info('redfish_fw_updates', settings)

View File

@@ -71,7 +71,12 @@ OPTIONAL_PROPERTIES = {
'redfish_auth_type': _('Redfish HTTP client authentication method. Can be '
'"basic", "session" or "auto". If not set, the '
'default value is taken from Ironic '
'configuration as ``[redfish]auth_type`` option.')
'configuration as ``[redfish]auth_type`` option.'),
'firmware_update_unresponsive_bmc_wait': _(
'Number of seconds to wait to avoid the BMC becoming unresponsive '
'during firmware updates. If not set, the default value is taken from '
'the Ironic configuration ``firmware_update_wait_unresponsive_bmc`` '
'in the ``[redfish]`` section.')
}
COMMON_PROPERTIES = REQUIRED_PROPERTIES.copy()
@@ -193,6 +198,10 @@ def parse_driver_info(node):
if root_prefix:
sushy_params['root_prefix'] = root_prefix
unres_bmc_wait = driver_info.get('firmware_update_unresponsive_bmc_wait')
if unres_bmc_wait:
sushy_params['firmware_update_unresponsive_bmc_wait'] = unres_bmc_wait
return sushy_params

View File

@@ -13,8 +13,10 @@
import datetime
import json
import time
from unittest import mock
from oslo_config import cfg
from oslo_utils import timeutils
import sushy
@@ -352,6 +354,7 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
self.node.save()
self._test_invalid_settings_service()
@mock.patch.object(time, 'sleep', lambda seconds: None)
def _generate_new_driver_internal_info(self, components=[], invalid=False,
add_wait=False, wait=1):
bmc_component = {'component': 'bmc', 'url': 'https://bmc/v1.0.1'}
@@ -795,6 +798,8 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
def test_continue_updates_more_updates(self, get_system_collection_mock,
node_power_action_mock,
log_mock):
cfg.CONF.set_override('firmware_update_wait_unresponsive_bmc', 0,
'redfish')
self._generate_new_driver_internal_info(['bmc', 'bios'])
task_monitor_mock = mock.Mock()
@@ -829,6 +834,7 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
@mock.patch.object(redfish_utils, 'get_system_collection', autospec=True)
@mock.patch.object(time, 'sleep', lambda seconds: None)
def test__execute_firmware_update_no_targets(self,
get_system_collection_mock,
system_mock):
@@ -855,6 +861,7 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
@mock.patch.object(redfish_utils, 'get_system_collection', autospec=True)
@mock.patch.object(time, 'sleep', lambda seconds: None)
def test__execute_firmware_update_targets(self,
get_system_collection_mock,
system_mock):
@@ -878,3 +885,79 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
settings)
update_service_mock.simple_update.assert_called_once_with(
'https://bios/v1.0.1', targets=[mock.ANY])
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
@mock.patch.object(redfish_utils, 'get_system_collection', autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
def test__execute_firmware_update_unresponsive_bmc(self, sleep_mock,
get_sys_collec_mock,
system_mock):
cfg.CONF.set_override('firmware_update_wait_unresponsive_bmc', 1,
'redfish')
self._generate_new_driver_internal_info(['bmc'])
with open(
'ironic/tests/json_samples/systems_collection_single.json'
) as f:
resp_obj = json.load(f)
system_collection_mock = mock.MagicMock()
system_collection_mock.get_members.return_value = resp_obj['Members']
get_sys_collec_mock.return_value = system_collection_mock
task_monitor_mock = mock.Mock()
task_monitor_mock.task_monitor_uri = '/task/2'
update_service_mock = mock.Mock()
update_service_mock.simple_update.return_value = task_monitor_mock
firmware = redfish_fw.RedfishFirmware()
settings = [{'component': 'bmc', 'url': 'https://bmc/v1.2.3'}]
firmware._execute_firmware_update(self.node, update_service_mock,
settings)
update_service_mock.simple_update.assert_called_once_with(
'https://bmc/v1.2.3')
sleep_mock.assert_called_once_with(
CONF.redfish.firmware_update_wait_unresponsive_bmc)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
@mock.patch.object(redfish_utils, 'get_system_collection', autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
def test__execute_firmware_update_unresponsive_bmc_node_override(
self, sleep_mock, get_sys_collec_mock, system_mock):
self._generate_new_driver_internal_info(['bmc'])
# Set a specific value for firmware_update_unresponsive_bmc_wait for
# the node
with mock.patch('time.sleep', lambda x: None):
d_info = self.node.driver_info.copy()
d_info['firmware_update_unresponsive_bmc_wait'] = 1
self.node.driver_info = d_info
self.node.save()
self.assertNotEqual(
CONF.redfish.firmware_update_wait_unresponsive_bmc,
self.node.driver_info.get('firmware_update_unresponsive_bmc_wait')
)
with open(
'ironic/tests/json_samples/systems_collection_single.json'
) as f:
resp_obj = json.load(f)
system_collection_mock = mock.MagicMock()
system_collection_mock.get_members.return_value = resp_obj['Members']
get_sys_collec_mock.return_value = system_collection_mock
task_monitor_mock = mock.Mock()
task_monitor_mock.task_monitor_uri = '/task/2'
update_service_mock = mock.Mock()
update_service_mock.simple_update.return_value = task_monitor_mock
firmware = redfish_fw.RedfishFirmware()
settings = [{'component': 'bmc', 'url': 'https://bmc/v1.2.3'}]
firmware._execute_firmware_update(self.node, update_service_mock,
settings)
update_service_mock.simple_update.assert_called_once_with(
'https://bmc/v1.2.3')
sleep_mock.assert_called_once_with(
self.node.driver_info.get('firmware_update_unresponsive_bmc_wait')
)

View File

@@ -59,6 +59,14 @@ class RedfishUtilsTestCase(db_base.DbTestCase):
response = redfish_utils.parse_driver_info(self.node)
self.assertEqual(self.parsed_driver_info, response)
def test_parse_driver_info_firmware_update_unresponsive_bmc_wait_set(
self):
self.node.driver_info['firmware_update_unresponsive_bmc_wait'] = 30
self.parsed_driver_info = redfish_utils.parse_driver_info(self.node)
self.assertEqual(
self.parsed_driver_info['firmware_update_unresponsive_bmc_wait'],
30)
def test_parse_driver_info_default_scheme(self):
self.node.driver_info['redfish_address'] = 'example.com'
response = redfish_utils.parse_driver_info(self.node)
@@ -267,6 +275,7 @@ class RedfishUtilsAuthTestCase(db_base.DbTestCase):
'password': 'password',
'verify_ca': True,
'auth_type': 'auto',
'firmware_update_unresponsive_bmc_wait': 300,
'node_uuid': self.node.uuid
}
@@ -421,6 +430,7 @@ class RedfishUtilsSystemTestCase(db_base.DbTestCase):
'password': 'password',
'verify_ca': True,
'auth_type': 'auto',
'firmware_update_unresponsive_bmc_wait': 300,
'node_uuid': self.node.uuid
}

View File

@@ -0,0 +1,13 @@
---
fixes:
- |
[`Bug 2092398 <https://bugs.launchpad.net/ironic/+bug/2092398>`_]
Fixes an issue with node servicing/cleaning that caused the node to enter
into `service failed` or `clean failed` state after doing a bmc firmware
update, due to the BMC being unresponsive to requests during the update.
Now when doing a BMC update, we wait some time before proceeding with the
reboot to finish the update.
The time is configurable and can be changed via the config option
``[redfish]firmware_update_wait_unresponsive_bmc`` (default, 300 seconds)
or by setting ``firmware_update_unresponsive_bmc_wait`` in the
``driver-info``.