Handle unresponsive BMC during Firmware Updates
When doing firmware updates for BMC, we saw cases where Ironic wouldn't be able to contact the BMC, marking the node in a failed state because of it. This patch adds a configuration option that tell for how long ironic should wait before proceeding with the reboot to finish the update. We will attempt to improve the waiting time in a follow-up, trying to identify when the bmc was unresponsive and when it was back. Closes-Bug: #2092398 Change-Id: I53ffc8a06d5af8b0751553c3d4a9bb1c000027ae Signed-off-by: Iury Gregory Melo Ferreira <imelofer@redhat.com>
This commit is contained in:
@@ -90,6 +90,11 @@ opts = [
|
||||
default=60,
|
||||
help=_('Number of seconds to wait between checking for '
|
||||
'failed firmware update tasks')),
|
||||
cfg.IntOpt('firmware_update_wait_unresponsive_bmc',
|
||||
min=0,
|
||||
default=300,
|
||||
help=_('Number of seconds to wait before proceeding with the '
|
||||
'reboot to finish the BMC firmware update step')),
|
||||
cfg.StrOpt('firmware_source',
|
||||
choices=[('http', _('If firmware source URL is also HTTP, then '
|
||||
'serve from original location, otherwise '
|
||||
|
@@ -11,6 +11,7 @@
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from oslo_log import log
|
||||
@@ -232,6 +233,27 @@ class RedfishFirmware(base.FirmwareInterface):
|
||||
{'node': node.uuid, 'error': e.message})
|
||||
raise exception.RedfishError(error=e)
|
||||
|
||||
# NOTE(iurygregory): In case we are doing firmware updates we need to
|
||||
# account for unresponsive BMC, in this case we wait for a set of
|
||||
# minutes before proceeding to the power actions.
|
||||
# In case the node has firmware_update_unresponsive_bmc_wait set we
|
||||
# give priority over the configuration option.
|
||||
wait_unres_bmc = (
|
||||
node.driver_info.get('firmware_update_unresponsive_bmc_wait')
|
||||
or CONF.redfish.firmware_update_wait_unresponsive_bmc
|
||||
)
|
||||
LOG.debug('BMC firmware update in progress. Waiting %(wait_time)s '
|
||||
'seconds before proceeding to reboot the node %(node_uuid)s '
|
||||
'to complete the step', {'node_uuid': node.uuid,
|
||||
'wait_time': wait_unres_bmc})
|
||||
# TODO(iurygregory): Improve the logic here to identify if the BMC
|
||||
# is back, so we don't have to unconditionally wait.
|
||||
# The wait_unres_bmc will be the maximum time to wait.
|
||||
time.sleep(wait_unres_bmc)
|
||||
LOG.debug('Wait completed. Proceeding to reboot the node '
|
||||
'%(node_uuid)s to complete the step.',
|
||||
{'node_uuid': node.uuid})
|
||||
|
||||
fw_upd['task_monitor'] = task_monitor.task_monitor_uri
|
||||
node.set_driver_internal_info('redfish_fw_updates', settings)
|
||||
|
||||
|
@@ -71,7 +71,12 @@ OPTIONAL_PROPERTIES = {
|
||||
'redfish_auth_type': _('Redfish HTTP client authentication method. Can be '
|
||||
'"basic", "session" or "auto". If not set, the '
|
||||
'default value is taken from Ironic '
|
||||
'configuration as ``[redfish]auth_type`` option.')
|
||||
'configuration as ``[redfish]auth_type`` option.'),
|
||||
'firmware_update_unresponsive_bmc_wait': _(
|
||||
'Number of seconds to wait to avoid the BMC becoming unresponsive '
|
||||
'during firmware updates. If not set, the default value is taken from '
|
||||
'the Ironic configuration ``firmware_update_wait_unresponsive_bmc`` '
|
||||
'in the ``[redfish]`` section.')
|
||||
}
|
||||
|
||||
COMMON_PROPERTIES = REQUIRED_PROPERTIES.copy()
|
||||
@@ -193,6 +198,10 @@ def parse_driver_info(node):
|
||||
if root_prefix:
|
||||
sushy_params['root_prefix'] = root_prefix
|
||||
|
||||
unres_bmc_wait = driver_info.get('firmware_update_unresponsive_bmc_wait')
|
||||
if unres_bmc_wait:
|
||||
sushy_params['firmware_update_unresponsive_bmc_wait'] = unres_bmc_wait
|
||||
|
||||
return sushy_params
|
||||
|
||||
|
||||
|
@@ -13,8 +13,10 @@
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import time
|
||||
from unittest import mock
|
||||
|
||||
from oslo_config import cfg
|
||||
from oslo_utils import timeutils
|
||||
import sushy
|
||||
|
||||
@@ -352,6 +354,7 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
|
||||
self.node.save()
|
||||
self._test_invalid_settings_service()
|
||||
|
||||
@mock.patch.object(time, 'sleep', lambda seconds: None)
|
||||
def _generate_new_driver_internal_info(self, components=[], invalid=False,
|
||||
add_wait=False, wait=1):
|
||||
bmc_component = {'component': 'bmc', 'url': 'https://bmc/v1.0.1'}
|
||||
@@ -795,6 +798,8 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
|
||||
def test_continue_updates_more_updates(self, get_system_collection_mock,
|
||||
node_power_action_mock,
|
||||
log_mock):
|
||||
cfg.CONF.set_override('firmware_update_wait_unresponsive_bmc', 0,
|
||||
'redfish')
|
||||
self._generate_new_driver_internal_info(['bmc', 'bios'])
|
||||
|
||||
task_monitor_mock = mock.Mock()
|
||||
@@ -829,6 +834,7 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
|
||||
|
||||
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
|
||||
@mock.patch.object(redfish_utils, 'get_system_collection', autospec=True)
|
||||
@mock.patch.object(time, 'sleep', lambda seconds: None)
|
||||
def test__execute_firmware_update_no_targets(self,
|
||||
get_system_collection_mock,
|
||||
system_mock):
|
||||
@@ -855,6 +861,7 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
|
||||
|
||||
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
|
||||
@mock.patch.object(redfish_utils, 'get_system_collection', autospec=True)
|
||||
@mock.patch.object(time, 'sleep', lambda seconds: None)
|
||||
def test__execute_firmware_update_targets(self,
|
||||
get_system_collection_mock,
|
||||
system_mock):
|
||||
@@ -878,3 +885,79 @@ class RedfishFirmwareTestCase(db_base.DbTestCase):
|
||||
settings)
|
||||
update_service_mock.simple_update.assert_called_once_with(
|
||||
'https://bios/v1.0.1', targets=[mock.ANY])
|
||||
|
||||
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
|
||||
@mock.patch.object(redfish_utils, 'get_system_collection', autospec=True)
|
||||
@mock.patch.object(time, 'sleep', autospec=True)
|
||||
def test__execute_firmware_update_unresponsive_bmc(self, sleep_mock,
|
||||
get_sys_collec_mock,
|
||||
system_mock):
|
||||
cfg.CONF.set_override('firmware_update_wait_unresponsive_bmc', 1,
|
||||
'redfish')
|
||||
self._generate_new_driver_internal_info(['bmc'])
|
||||
with open(
|
||||
'ironic/tests/json_samples/systems_collection_single.json'
|
||||
) as f:
|
||||
resp_obj = json.load(f)
|
||||
system_collection_mock = mock.MagicMock()
|
||||
system_collection_mock.get_members.return_value = resp_obj['Members']
|
||||
get_sys_collec_mock.return_value = system_collection_mock
|
||||
|
||||
task_monitor_mock = mock.Mock()
|
||||
task_monitor_mock.task_monitor_uri = '/task/2'
|
||||
update_service_mock = mock.Mock()
|
||||
update_service_mock.simple_update.return_value = task_monitor_mock
|
||||
|
||||
firmware = redfish_fw.RedfishFirmware()
|
||||
|
||||
settings = [{'component': 'bmc', 'url': 'https://bmc/v1.2.3'}]
|
||||
firmware._execute_firmware_update(self.node, update_service_mock,
|
||||
settings)
|
||||
|
||||
update_service_mock.simple_update.assert_called_once_with(
|
||||
'https://bmc/v1.2.3')
|
||||
sleep_mock.assert_called_once_with(
|
||||
CONF.redfish.firmware_update_wait_unresponsive_bmc)
|
||||
|
||||
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
|
||||
@mock.patch.object(redfish_utils, 'get_system_collection', autospec=True)
|
||||
@mock.patch.object(time, 'sleep', autospec=True)
|
||||
def test__execute_firmware_update_unresponsive_bmc_node_override(
|
||||
self, sleep_mock, get_sys_collec_mock, system_mock):
|
||||
self._generate_new_driver_internal_info(['bmc'])
|
||||
# Set a specific value for firmware_update_unresponsive_bmc_wait for
|
||||
# the node
|
||||
with mock.patch('time.sleep', lambda x: None):
|
||||
d_info = self.node.driver_info.copy()
|
||||
d_info['firmware_update_unresponsive_bmc_wait'] = 1
|
||||
self.node.driver_info = d_info
|
||||
self.node.save()
|
||||
|
||||
self.assertNotEqual(
|
||||
CONF.redfish.firmware_update_wait_unresponsive_bmc,
|
||||
self.node.driver_info.get('firmware_update_unresponsive_bmc_wait')
|
||||
)
|
||||
|
||||
with open(
|
||||
'ironic/tests/json_samples/systems_collection_single.json'
|
||||
) as f:
|
||||
resp_obj = json.load(f)
|
||||
system_collection_mock = mock.MagicMock()
|
||||
system_collection_mock.get_members.return_value = resp_obj['Members']
|
||||
get_sys_collec_mock.return_value = system_collection_mock
|
||||
|
||||
task_monitor_mock = mock.Mock()
|
||||
task_monitor_mock.task_monitor_uri = '/task/2'
|
||||
update_service_mock = mock.Mock()
|
||||
update_service_mock.simple_update.return_value = task_monitor_mock
|
||||
|
||||
firmware = redfish_fw.RedfishFirmware()
|
||||
settings = [{'component': 'bmc', 'url': 'https://bmc/v1.2.3'}]
|
||||
firmware._execute_firmware_update(self.node, update_service_mock,
|
||||
settings)
|
||||
|
||||
update_service_mock.simple_update.assert_called_once_with(
|
||||
'https://bmc/v1.2.3')
|
||||
sleep_mock.assert_called_once_with(
|
||||
self.node.driver_info.get('firmware_update_unresponsive_bmc_wait')
|
||||
)
|
||||
|
@@ -59,6 +59,14 @@ class RedfishUtilsTestCase(db_base.DbTestCase):
|
||||
response = redfish_utils.parse_driver_info(self.node)
|
||||
self.assertEqual(self.parsed_driver_info, response)
|
||||
|
||||
def test_parse_driver_info_firmware_update_unresponsive_bmc_wait_set(
|
||||
self):
|
||||
self.node.driver_info['firmware_update_unresponsive_bmc_wait'] = 30
|
||||
self.parsed_driver_info = redfish_utils.parse_driver_info(self.node)
|
||||
self.assertEqual(
|
||||
self.parsed_driver_info['firmware_update_unresponsive_bmc_wait'],
|
||||
30)
|
||||
|
||||
def test_parse_driver_info_default_scheme(self):
|
||||
self.node.driver_info['redfish_address'] = 'example.com'
|
||||
response = redfish_utils.parse_driver_info(self.node)
|
||||
@@ -267,6 +275,7 @@ class RedfishUtilsAuthTestCase(db_base.DbTestCase):
|
||||
'password': 'password',
|
||||
'verify_ca': True,
|
||||
'auth_type': 'auto',
|
||||
'firmware_update_unresponsive_bmc_wait': 300,
|
||||
'node_uuid': self.node.uuid
|
||||
}
|
||||
|
||||
@@ -421,6 +430,7 @@ class RedfishUtilsSystemTestCase(db_base.DbTestCase):
|
||||
'password': 'password',
|
||||
'verify_ca': True,
|
||||
'auth_type': 'auto',
|
||||
'firmware_update_unresponsive_bmc_wait': 300,
|
||||
'node_uuid': self.node.uuid
|
||||
}
|
||||
|
||||
|
13
releasenotes/notes/bug-2092398-45f65c06a84d396a.yaml
Normal file
13
releasenotes/notes/bug-2092398-45f65c06a84d396a.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
[`Bug 2092398 <https://bugs.launchpad.net/ironic/+bug/2092398>`_]
|
||||
Fixes an issue with node servicing/cleaning that caused the node to enter
|
||||
into `service failed` or `clean failed` state after doing a bmc firmware
|
||||
update, due to the BMC being unresponsive to requests during the update.
|
||||
Now when doing a BMC update, we wait some time before proceeding with the
|
||||
reboot to finish the update.
|
||||
The time is configurable and can be changed via the config option
|
||||
``[redfish]firmware_update_wait_unresponsive_bmc`` (default, 300 seconds)
|
||||
or by setting ``firmware_update_unresponsive_bmc_wait`` in the
|
||||
``driver-info``.
|
Reference in New Issue
Block a user