Improve Dell Asynchronous task handling

Dell hardware, specifically with iDRAC10 BMCs, have been observed
in the wild returning a URL which does not yet exist for a job location
reference.

The result is we need to retry the interaction separately to obtain
a reference location before proceeding.

Assisted-By: Claude Code - Claude Sonnet 4
Change-Id: I22c198af55c5b0394d680c435e511c9d74729658
Signed-off-by: Julia Kreger <juliaashleykreger@gmail.com>
This commit is contained in:
Julia Kreger
2025-09-18 10:06:48 -07:00
parent 2ac52040d5
commit faaf6cb687
3 changed files with 143 additions and 3 deletions

View File

@@ -0,0 +1,12 @@
---
fixes:
- |
An issue was encountered on Dell hardware where the Baseboard Management
Controller could respond with a HTTP 404 error temporarily during job
creation, causing operations to fail unnecessarily. This occurred due to
a race condition where the BMC had not yet finished creating the job
when the initial request was made. The Dell OEM http_call method now
automatically retries on 404 responses with configurable retry count
(default: 3 retries) and delay (default: 10 seconds) to handle this
transient condition gracefully. The retry behavior can be customized
using the ``max_404_retries`` and ``retry_404_delay`` parameters.

View File

@@ -36,11 +36,30 @@ def http_call(conn, method, *args, **kwargs):
handle = getattr(conn, method.lower())
sleep_for = kwargs.pop('sushy_task_poll_period', TASK_POLL_PERIOD)
max_404_retries = kwargs.pop('max_404_retries', 3)
retry_404_delay = kwargs.pop('retry_404_delay', 10)
response = handle(*args, **kwargs)
# Retry initial call on 404 to handle BMC race condition
for attempt in range(max_404_retries + 1):
response = handle(*args, **kwargs)
LOG.debug('Finished HTTP %s with args %s %s, response is '
'%d', method, args or '', kwargs, response.status_code)
LOG.debug('Finished HTTP %s with args %s %s, response is '
'%d', method, args or '', kwargs, response.status_code)
# If not 404 or last attempt, break out of retry loop
if response.status_code != 404 or attempt == max_404_retries:
break
LOG.warning('BMC returned 404 for HTTP %(method)s request to '
'%(path)s - this may be a transient condition while '
'BMC creates the job. Retrying in %(delay)d seconds '
'(attempt %(current)d of %(total)d)',
{'method': method.upper(),
'path': args[0] if args else 'unknown',
'delay': retry_404_delay,
'current': attempt + 1,
'total': max_404_retries})
time.sleep(retry_404_delay)
location = None
while response.status_code == 202:

View File

@@ -53,3 +53,112 @@ class AsychronousTestCase(BaseTestCase):
self.assertRaises(sushy.exceptions.ExtensionError,
http_call, self.conn, 'POST')
@mock.patch('time.sleep', autospec=True)
def test_http_call_404_retry_success(self, mock_sleep):
# First call returns 404, second call succeeds
mock_404_response = mock.Mock()
mock_404_response.status_code = 404
mock_200_response = mock.Mock()
mock_200_response.status_code = 200
self.conn.post.side_effect = [mock_404_response, mock_200_response]
resp = http_call(self.conn, 'POST', '/some/path')
self.assertIs(resp, mock_200_response)
self.assertEqual(self.conn.post.call_count, 2)
mock_sleep.assert_called_once_with(10) # Default retry delay
@mock.patch('time.sleep', autospec=True)
def test_http_call_404_retry_custom_params(self, mock_sleep):
# Test with custom retry parameters
mock_404_response = mock.Mock()
mock_404_response.status_code = 404
mock_200_response = mock.Mock()
mock_200_response.status_code = 200
self.conn.post.side_effect = [mock_404_response, mock_200_response]
resp = http_call(self.conn, 'POST', '/some/path',
max_404_retries=5, retry_404_delay=2)
self.assertIs(resp, mock_200_response)
self.assertEqual(self.conn.post.call_count, 2)
mock_sleep.assert_called_once_with(2) # Custom delay
@mock.patch('time.sleep', autospec=True)
def test_http_call_404_retry_exhausted(self, mock_sleep):
# All retries return 404, should fail with ExtensionError
mock_404_response = mock.Mock()
mock_404_response.status_code = 404
self.conn.post.return_value = mock_404_response
self.assertRaisesRegex(sushy.exceptions.ExtensionError,
'failed with code 404',
http_call, self.conn, 'POST', '/some/path')
# 1 initial + 3 retries
self.assertEqual(self.conn.post.call_count, 4)
self.assertEqual(mock_sleep.call_count, 3)
@mock.patch('time.sleep', autospec=True)
def test_http_call_non_404_error_no_retry(self, mock_sleep):
# Non-404 errors should not trigger retry
mock_500_response = mock.Mock()
mock_500_response.status_code = 500
self.conn.post.return_value = mock_500_response
self.assertRaisesRegex(sushy.exceptions.ExtensionError,
'failed with code 500',
http_call, self.conn, 'POST', '/some/path')
self.assertEqual(self.conn.post.call_count, 1) # No retries
mock_sleep.assert_not_called()
def test_http_call_success_no_retry(self):
# Successful response should not trigger any retry logic
mock_200_response = mock.Mock()
mock_200_response.status_code = 200
self.conn.post.return_value = mock_200_response
resp = http_call(self.conn, 'POST', '/some/path')
self.assertIs(resp, mock_200_response)
self.assertEqual(self.conn.post.call_count, 1)
@mock.patch('time.sleep', autospec=True)
def test_http_call_404_retry_with_202_polling(self, mock_sleep):
# Test 404 retry followed by 202 polling behavior
mock_404_response = mock.Mock()
mock_404_response.status_code = 404
mock_202_response = mock.Mock()
mock_202_response.status_code = 202
def mock_get_header(header_name, default=None):
if header_name == 'Location':
return '/redfish/v1/TaskService/Tasks/123'
elif header_name == 'Retry-After':
return None
return default
mock_202_response.headers.get.side_effect = mock_get_header
mock_200_response = mock.Mock()
mock_200_response.status_code = 200
self.conn.post.side_effect = [mock_404_response, mock_202_response]
self.conn.get.return_value = mock_200_response
resp = http_call(self.conn, 'POST', '/some/path')
self.assertIs(resp, mock_200_response)
self.assertEqual(self.conn.post.call_count, 2)
self.assertEqual(self.conn.get.call_count, 1)
# Should have slept for 404 retry and 202 polling
self.assertEqual(mock_sleep.call_count, 2)