From faaf6cb687feda63f3b0177348f75f277f989dcd Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Thu, 18 Sep 2025 10:06:48 -0700 Subject: [PATCH] Improve Dell Asynchronous task handling Dell hardware, specifically with iDRAC10 BMCs, have been observed in the wild returning a URL which does not yet exist for a job location reference. The result is we need to retry the interaction separately to obtain a reference location before proceeding. Assisted-By: Claude Code - Claude Sonnet 4 Change-Id: I22c198af55c5b0394d680c435e511c9d74729658 Signed-off-by: Julia Kreger --- ...c-404-race-condition-6000e47436cd4d6e.yaml | 12 ++ sushy/oem/dell/asynchronous.py | 25 +++- .../tests/oem/dell/unit/test_asynchronous.py | 109 ++++++++++++++++++ 3 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/fix-dell-bmc-404-race-condition-6000e47436cd4d6e.yaml diff --git a/releasenotes/notes/fix-dell-bmc-404-race-condition-6000e47436cd4d6e.yaml b/releasenotes/notes/fix-dell-bmc-404-race-condition-6000e47436cd4d6e.yaml new file mode 100644 index 00000000..c62be8ef --- /dev/null +++ b/releasenotes/notes/fix-dell-bmc-404-race-condition-6000e47436cd4d6e.yaml @@ -0,0 +1,12 @@ +--- +fixes: + - | + An issue was encountered on Dell hardware where the Baseboard Management + Controller could respond with a HTTP 404 error temporarily during job + creation, causing operations to fail unnecessarily. This occurred due to + a race condition where the BMC had not yet finished creating the job + when the initial request was made. The Dell OEM http_call method now + automatically retries on 404 responses with configurable retry count + (default: 3 retries) and delay (default: 10 seconds) to handle this + transient condition gracefully. The retry behavior can be customized + using the ``max_404_retries`` and ``retry_404_delay`` parameters. \ No newline at end of file diff --git a/sushy/oem/dell/asynchronous.py b/sushy/oem/dell/asynchronous.py index ca1ba7db..5c9c8c87 100644 --- a/sushy/oem/dell/asynchronous.py +++ b/sushy/oem/dell/asynchronous.py @@ -36,11 +36,30 @@ def http_call(conn, method, *args, **kwargs): handle = getattr(conn, method.lower()) sleep_for = kwargs.pop('sushy_task_poll_period', TASK_POLL_PERIOD) + max_404_retries = kwargs.pop('max_404_retries', 3) + retry_404_delay = kwargs.pop('retry_404_delay', 10) - response = handle(*args, **kwargs) + # Retry initial call on 404 to handle BMC race condition + for attempt in range(max_404_retries + 1): + response = handle(*args, **kwargs) - LOG.debug('Finished HTTP %s with args %s %s, response is ' - '%d', method, args or '', kwargs, response.status_code) + LOG.debug('Finished HTTP %s with args %s %s, response is ' + '%d', method, args or '', kwargs, response.status_code) + + # If not 404 or last attempt, break out of retry loop + if response.status_code != 404 or attempt == max_404_retries: + break + + LOG.warning('BMC returned 404 for HTTP %(method)s request to ' + '%(path)s - this may be a transient condition while ' + 'BMC creates the job. Retrying in %(delay)d seconds ' + '(attempt %(current)d of %(total)d)', + {'method': method.upper(), + 'path': args[0] if args else 'unknown', + 'delay': retry_404_delay, + 'current': attempt + 1, + 'total': max_404_retries}) + time.sleep(retry_404_delay) location = None while response.status_code == 202: diff --git a/sushy/tests/oem/dell/unit/test_asynchronous.py b/sushy/tests/oem/dell/unit/test_asynchronous.py index 2740c12d..e4e670f0 100644 --- a/sushy/tests/oem/dell/unit/test_asynchronous.py +++ b/sushy/tests/oem/dell/unit/test_asynchronous.py @@ -53,3 +53,112 @@ class AsychronousTestCase(BaseTestCase): self.assertRaises(sushy.exceptions.ExtensionError, http_call, self.conn, 'POST') + + @mock.patch('time.sleep', autospec=True) + def test_http_call_404_retry_success(self, mock_sleep): + # First call returns 404, second call succeeds + mock_404_response = mock.Mock() + mock_404_response.status_code = 404 + + mock_200_response = mock.Mock() + mock_200_response.status_code = 200 + + self.conn.post.side_effect = [mock_404_response, mock_200_response] + + resp = http_call(self.conn, 'POST', '/some/path') + + self.assertIs(resp, mock_200_response) + self.assertEqual(self.conn.post.call_count, 2) + mock_sleep.assert_called_once_with(10) # Default retry delay + + @mock.patch('time.sleep', autospec=True) + def test_http_call_404_retry_custom_params(self, mock_sleep): + # Test with custom retry parameters + mock_404_response = mock.Mock() + mock_404_response.status_code = 404 + + mock_200_response = mock.Mock() + mock_200_response.status_code = 200 + + self.conn.post.side_effect = [mock_404_response, mock_200_response] + + resp = http_call(self.conn, 'POST', '/some/path', + max_404_retries=5, retry_404_delay=2) + + self.assertIs(resp, mock_200_response) + self.assertEqual(self.conn.post.call_count, 2) + mock_sleep.assert_called_once_with(2) # Custom delay + + @mock.patch('time.sleep', autospec=True) + def test_http_call_404_retry_exhausted(self, mock_sleep): + # All retries return 404, should fail with ExtensionError + mock_404_response = mock.Mock() + mock_404_response.status_code = 404 + + self.conn.post.return_value = mock_404_response + + self.assertRaisesRegex(sushy.exceptions.ExtensionError, + 'failed with code 404', + http_call, self.conn, 'POST', '/some/path') + + # 1 initial + 3 retries + self.assertEqual(self.conn.post.call_count, 4) + self.assertEqual(mock_sleep.call_count, 3) + + @mock.patch('time.sleep', autospec=True) + def test_http_call_non_404_error_no_retry(self, mock_sleep): + # Non-404 errors should not trigger retry + mock_500_response = mock.Mock() + mock_500_response.status_code = 500 + + self.conn.post.return_value = mock_500_response + + self.assertRaisesRegex(sushy.exceptions.ExtensionError, + 'failed with code 500', + http_call, self.conn, 'POST', '/some/path') + + self.assertEqual(self.conn.post.call_count, 1) # No retries + mock_sleep.assert_not_called() + + def test_http_call_success_no_retry(self): + # Successful response should not trigger any retry logic + mock_200_response = mock.Mock() + mock_200_response.status_code = 200 + + self.conn.post.return_value = mock_200_response + + resp = http_call(self.conn, 'POST', '/some/path') + + self.assertIs(resp, mock_200_response) + self.assertEqual(self.conn.post.call_count, 1) + + @mock.patch('time.sleep', autospec=True) + def test_http_call_404_retry_with_202_polling(self, mock_sleep): + # Test 404 retry followed by 202 polling behavior + mock_404_response = mock.Mock() + mock_404_response.status_code = 404 + + mock_202_response = mock.Mock() + mock_202_response.status_code = 202 + + def mock_get_header(header_name, default=None): + if header_name == 'Location': + return '/redfish/v1/TaskService/Tasks/123' + elif header_name == 'Retry-After': + return None + return default + mock_202_response.headers.get.side_effect = mock_get_header + + mock_200_response = mock.Mock() + mock_200_response.status_code = 200 + + self.conn.post.side_effect = [mock_404_response, mock_202_response] + self.conn.get.return_value = mock_200_response + + resp = http_call(self.conn, 'POST', '/some/path') + + self.assertIs(resp, mock_200_response) + self.assertEqual(self.conn.post.call_count, 2) + self.assertEqual(self.conn.get.call_count, 1) + # Should have slept for 404 retry and 202 polling + self.assertEqual(mock_sleep.call_count, 2)