Improve Dell Asynchronous task handling
Dell hardware, specifically with iDRAC10 BMCs, have been observed in the wild returning a URL which does not yet exist for a job location reference. The result is we need to retry the interaction separately to obtain a reference location before proceeding. Assisted-By: Claude Code - Claude Sonnet 4 Change-Id: I22c198af55c5b0394d680c435e511c9d74729658 Signed-off-by: Julia Kreger <juliaashleykreger@gmail.com>
This commit is contained in:
@@ -0,0 +1,12 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
An issue was encountered on Dell hardware where the Baseboard Management
|
||||
Controller could respond with a HTTP 404 error temporarily during job
|
||||
creation, causing operations to fail unnecessarily. This occurred due to
|
||||
a race condition where the BMC had not yet finished creating the job
|
||||
when the initial request was made. The Dell OEM http_call method now
|
||||
automatically retries on 404 responses with configurable retry count
|
||||
(default: 3 retries) and delay (default: 10 seconds) to handle this
|
||||
transient condition gracefully. The retry behavior can be customized
|
||||
using the ``max_404_retries`` and ``retry_404_delay`` parameters.
|
@@ -36,11 +36,30 @@ def http_call(conn, method, *args, **kwargs):
|
||||
handle = getattr(conn, method.lower())
|
||||
|
||||
sleep_for = kwargs.pop('sushy_task_poll_period', TASK_POLL_PERIOD)
|
||||
max_404_retries = kwargs.pop('max_404_retries', 3)
|
||||
retry_404_delay = kwargs.pop('retry_404_delay', 10)
|
||||
|
||||
response = handle(*args, **kwargs)
|
||||
# Retry initial call on 404 to handle BMC race condition
|
||||
for attempt in range(max_404_retries + 1):
|
||||
response = handle(*args, **kwargs)
|
||||
|
||||
LOG.debug('Finished HTTP %s with args %s %s, response is '
|
||||
'%d', method, args or '', kwargs, response.status_code)
|
||||
LOG.debug('Finished HTTP %s with args %s %s, response is '
|
||||
'%d', method, args or '', kwargs, response.status_code)
|
||||
|
||||
# If not 404 or last attempt, break out of retry loop
|
||||
if response.status_code != 404 or attempt == max_404_retries:
|
||||
break
|
||||
|
||||
LOG.warning('BMC returned 404 for HTTP %(method)s request to '
|
||||
'%(path)s - this may be a transient condition while '
|
||||
'BMC creates the job. Retrying in %(delay)d seconds '
|
||||
'(attempt %(current)d of %(total)d)',
|
||||
{'method': method.upper(),
|
||||
'path': args[0] if args else 'unknown',
|
||||
'delay': retry_404_delay,
|
||||
'current': attempt + 1,
|
||||
'total': max_404_retries})
|
||||
time.sleep(retry_404_delay)
|
||||
|
||||
location = None
|
||||
while response.status_code == 202:
|
||||
|
@@ -53,3 +53,112 @@ class AsychronousTestCase(BaseTestCase):
|
||||
|
||||
self.assertRaises(sushy.exceptions.ExtensionError,
|
||||
http_call, self.conn, 'POST')
|
||||
|
||||
@mock.patch('time.sleep', autospec=True)
|
||||
def test_http_call_404_retry_success(self, mock_sleep):
|
||||
# First call returns 404, second call succeeds
|
||||
mock_404_response = mock.Mock()
|
||||
mock_404_response.status_code = 404
|
||||
|
||||
mock_200_response = mock.Mock()
|
||||
mock_200_response.status_code = 200
|
||||
|
||||
self.conn.post.side_effect = [mock_404_response, mock_200_response]
|
||||
|
||||
resp = http_call(self.conn, 'POST', '/some/path')
|
||||
|
||||
self.assertIs(resp, mock_200_response)
|
||||
self.assertEqual(self.conn.post.call_count, 2)
|
||||
mock_sleep.assert_called_once_with(10) # Default retry delay
|
||||
|
||||
@mock.patch('time.sleep', autospec=True)
|
||||
def test_http_call_404_retry_custom_params(self, mock_sleep):
|
||||
# Test with custom retry parameters
|
||||
mock_404_response = mock.Mock()
|
||||
mock_404_response.status_code = 404
|
||||
|
||||
mock_200_response = mock.Mock()
|
||||
mock_200_response.status_code = 200
|
||||
|
||||
self.conn.post.side_effect = [mock_404_response, mock_200_response]
|
||||
|
||||
resp = http_call(self.conn, 'POST', '/some/path',
|
||||
max_404_retries=5, retry_404_delay=2)
|
||||
|
||||
self.assertIs(resp, mock_200_response)
|
||||
self.assertEqual(self.conn.post.call_count, 2)
|
||||
mock_sleep.assert_called_once_with(2) # Custom delay
|
||||
|
||||
@mock.patch('time.sleep', autospec=True)
|
||||
def test_http_call_404_retry_exhausted(self, mock_sleep):
|
||||
# All retries return 404, should fail with ExtensionError
|
||||
mock_404_response = mock.Mock()
|
||||
mock_404_response.status_code = 404
|
||||
|
||||
self.conn.post.return_value = mock_404_response
|
||||
|
||||
self.assertRaisesRegex(sushy.exceptions.ExtensionError,
|
||||
'failed with code 404',
|
||||
http_call, self.conn, 'POST', '/some/path')
|
||||
|
||||
# 1 initial + 3 retries
|
||||
self.assertEqual(self.conn.post.call_count, 4)
|
||||
self.assertEqual(mock_sleep.call_count, 3)
|
||||
|
||||
@mock.patch('time.sleep', autospec=True)
|
||||
def test_http_call_non_404_error_no_retry(self, mock_sleep):
|
||||
# Non-404 errors should not trigger retry
|
||||
mock_500_response = mock.Mock()
|
||||
mock_500_response.status_code = 500
|
||||
|
||||
self.conn.post.return_value = mock_500_response
|
||||
|
||||
self.assertRaisesRegex(sushy.exceptions.ExtensionError,
|
||||
'failed with code 500',
|
||||
http_call, self.conn, 'POST', '/some/path')
|
||||
|
||||
self.assertEqual(self.conn.post.call_count, 1) # No retries
|
||||
mock_sleep.assert_not_called()
|
||||
|
||||
def test_http_call_success_no_retry(self):
|
||||
# Successful response should not trigger any retry logic
|
||||
mock_200_response = mock.Mock()
|
||||
mock_200_response.status_code = 200
|
||||
|
||||
self.conn.post.return_value = mock_200_response
|
||||
|
||||
resp = http_call(self.conn, 'POST', '/some/path')
|
||||
|
||||
self.assertIs(resp, mock_200_response)
|
||||
self.assertEqual(self.conn.post.call_count, 1)
|
||||
|
||||
@mock.patch('time.sleep', autospec=True)
|
||||
def test_http_call_404_retry_with_202_polling(self, mock_sleep):
|
||||
# Test 404 retry followed by 202 polling behavior
|
||||
mock_404_response = mock.Mock()
|
||||
mock_404_response.status_code = 404
|
||||
|
||||
mock_202_response = mock.Mock()
|
||||
mock_202_response.status_code = 202
|
||||
|
||||
def mock_get_header(header_name, default=None):
|
||||
if header_name == 'Location':
|
||||
return '/redfish/v1/TaskService/Tasks/123'
|
||||
elif header_name == 'Retry-After':
|
||||
return None
|
||||
return default
|
||||
mock_202_response.headers.get.side_effect = mock_get_header
|
||||
|
||||
mock_200_response = mock.Mock()
|
||||
mock_200_response.status_code = 200
|
||||
|
||||
self.conn.post.side_effect = [mock_404_response, mock_202_response]
|
||||
self.conn.get.return_value = mock_200_response
|
||||
|
||||
resp = http_call(self.conn, 'POST', '/some/path')
|
||||
|
||||
self.assertIs(resp, mock_200_response)
|
||||
self.assertEqual(self.conn.post.call_count, 2)
|
||||
self.assertEqual(self.conn.get.call_count, 1)
|
||||
# Should have slept for 404 retry and 202 polling
|
||||
self.assertEqual(mock_sleep.call_count, 2)
|
||||
|
Reference in New Issue
Block a user