update retryable errors & instance fault on retry
HTTP errors can be split into a few categories: client ephemeral, server ephemeral, server permanent, and globally permanent. You could argue there is even more permutations. However, for simplicity, these errors can be viewed as ephemeral and permanent. The Glance XenAPI plugin has been updated to raise a RetryableError for ephemeral and unexpected (i.e. errors that are categorized as neither permanent or ephemeral) errors. Additionally, an instance fault will be logged every time an error occurs. This will serve as transaction history where every attempt is a transaction on the state. If an ephemeral error occurs, there is a retry, then a permanent error, the history of each error will be in the instance_faults. Deployers should configure the num_retries relative to the number of api_servers. Right now, servers are put in random order and then cycled. Trying the same server multiple times could cause unnecessary load and delays. However, trying multiple servers, is ideal when a single server is behaving badly or cannot reach another server it needs to communicate with to fulfill a request. Closes-Bug: 1380776 Change-Id: I267a5b524c3ff8a28edf1a2285b77bb09049773c
This commit is contained in:
@@ -69,7 +69,8 @@ glance_opts = [
|
||||
deprecated_name='glance_api_insecure'),
|
||||
cfg.IntOpt('num_retries',
|
||||
default=0,
|
||||
help='Number of retries when downloading an image from glance',
|
||||
help='Number of retries when uploading / downloading an image '
|
||||
'to / from glance.',
|
||||
deprecated_group='DEFAULT',
|
||||
deprecated_name='glance_num_retries'),
|
||||
cfg.ListOpt('allowed_direct_url_schemes',
|
||||
|
@@ -112,14 +112,16 @@ class CallPluginTestCase(stubs.XenAPITestBaseNoDB):
|
||||
fn = 'download_vhd'
|
||||
num_retries = 1
|
||||
callback = None
|
||||
retry_cb = mock.Mock()
|
||||
with mock.patch.object(self.session, 'call_plugin_serialized',
|
||||
autospec=True) as call_plugin_serialized:
|
||||
call_plugin_serialized.side_effect = exc
|
||||
self.assertRaises(exception.PluginRetriesExceeded,
|
||||
self.session.call_plugin_serialized_with_retry, plugin, fn,
|
||||
num_retries, callback)
|
||||
num_retries, callback, retry_cb)
|
||||
call_plugin_serialized.assert_called_with(plugin, fn)
|
||||
self.assertEqual(2, call_plugin_serialized.call_count)
|
||||
self.assertEqual(2, retry_cb.call_count)
|
||||
|
||||
def test_serialized_with_retry_socket_error_reraised(self):
|
||||
exc = socket.error
|
||||
@@ -128,10 +130,29 @@ class CallPluginTestCase(stubs.XenAPITestBaseNoDB):
|
||||
fn = 'download_vhd'
|
||||
num_retries = 1
|
||||
callback = None
|
||||
retry_cb = mock.Mock()
|
||||
with mock.patch.object(self.session, 'call_plugin_serialized',
|
||||
autospec=True) as call_plugin_serialized:
|
||||
call_plugin_serialized.side_effect = exc
|
||||
self.assertRaises(socket.error,
|
||||
self.session.call_plugin_serialized_with_retry, plugin, fn,
|
||||
num_retries, callback)
|
||||
num_retries, callback, retry_cb)
|
||||
call_plugin_serialized.assert_called_once_with(plugin, fn)
|
||||
self.assertEqual(0, retry_cb.call_count)
|
||||
|
||||
def test_serialized_with_retry_socket_reset_reraised(self):
|
||||
exc = socket.error
|
||||
exc.errno = errno.ECONNRESET
|
||||
plugin = 'glance'
|
||||
fn = 'download_vhd'
|
||||
num_retries = 1
|
||||
callback = None
|
||||
retry_cb = mock.Mock()
|
||||
with mock.patch.object(self.session, 'call_plugin_serialized',
|
||||
autospec=True) as call_plugin_serialized:
|
||||
call_plugin_serialized.side_effect = exc
|
||||
self.assertRaises(exception.PluginRetriesExceeded,
|
||||
self.session.call_plugin_serialized_with_retry, plugin, fn,
|
||||
num_retries, callback, retry_cb)
|
||||
call_plugin_serialized.assert_called_with(plugin, fn)
|
||||
self.assertEqual(2, call_plugin_serialized.call_count)
|
||||
|
@@ -17,7 +17,9 @@ import random
|
||||
import time
|
||||
|
||||
import mock
|
||||
from mox3 import mox
|
||||
|
||||
from nova.compute import utils as compute_utils
|
||||
from nova import context
|
||||
from nova import exception
|
||||
from nova.openstack.common import log as logging
|
||||
@@ -93,11 +95,11 @@ class TestGlanceStore(stubs.XenAPITestBaseNoDB):
|
||||
@mock.patch.object(vm_utils, '_make_uuid_stack', return_value=['uuid1'])
|
||||
@mock.patch.object(random, 'shuffle')
|
||||
@mock.patch.object(time, 'sleep')
|
||||
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
|
||||
@mock.patch.object(logging.getLogger('nova.virt.xenapi.client.session'),
|
||||
'debug')
|
||||
def test_download_image_retry(self, mock_log_debug,
|
||||
mock_sleep, mock_shuffle,
|
||||
mock_make_uuid_stack):
|
||||
def test_download_image_retry(self, mock_log_debug, mock_fault, mock_sleep,
|
||||
mock_shuffle, mock_make_uuid_stack):
|
||||
params = self._get_download_params()
|
||||
self.flags(num_retries=2, group='glance')
|
||||
|
||||
@@ -132,6 +134,8 @@ class TestGlanceStore(stubs.XenAPITestBaseNoDB):
|
||||
mock_call_plugin_serialized.assert_has_calls(calls)
|
||||
mock_log_debug.assert_has_calls(log_calls, any_order=True)
|
||||
|
||||
self.assertEqual(1, mock_fault.call_count)
|
||||
|
||||
def _get_upload_params(self, auto_disk_config=True,
|
||||
expected_os_type='default'):
|
||||
params = self._get_params()
|
||||
@@ -186,16 +190,29 @@ class TestGlanceStore(stubs.XenAPITestBaseNoDB):
|
||||
|
||||
self.mox.StubOutWithMock(self.session, 'call_plugin_serialized')
|
||||
self.mox.StubOutWithMock(time, 'sleep')
|
||||
self.mox.StubOutWithMock(compute_utils, 'add_instance_fault_from_exc')
|
||||
error_details = ["", "", "RetryableError", ""]
|
||||
error = self.session.XenAPI.Failure(details=error_details)
|
||||
self.session.call_plugin_serialized('glance', 'upload_vhd',
|
||||
**params).AndRaise(error)
|
||||
compute_utils.add_instance_fault_from_exc(self.context, self.instance,
|
||||
error, (fake.Failure,
|
||||
error,
|
||||
mox.IgnoreArg()))
|
||||
time.sleep(0.5)
|
||||
self.session.call_plugin_serialized('glance', 'upload_vhd',
|
||||
**params).AndRaise(error)
|
||||
compute_utils.add_instance_fault_from_exc(self.context, self.instance,
|
||||
error, (fake.Failure,
|
||||
error,
|
||||
mox.IgnoreArg()))
|
||||
time.sleep(1)
|
||||
self.session.call_plugin_serialized('glance', 'upload_vhd',
|
||||
**params).AndRaise(error)
|
||||
compute_utils.add_instance_fault_from_exc(self.context, self.instance,
|
||||
error, (fake.Failure,
|
||||
error,
|
||||
mox.IgnoreArg()))
|
||||
self.mox.ReplayAll()
|
||||
|
||||
self.assertRaises(exception.CouldNotUploadImage,
|
||||
@@ -210,16 +227,25 @@ class TestGlanceStore(stubs.XenAPITestBaseNoDB):
|
||||
|
||||
self.mox.StubOutWithMock(self.session, 'call_plugin_serialized')
|
||||
self.mox.StubOutWithMock(time, 'sleep')
|
||||
self.mox.StubOutWithMock(compute_utils, 'add_instance_fault_from_exc')
|
||||
error_details = ["", "task signaled", "", ""]
|
||||
error = self.session.XenAPI.Failure(details=error_details)
|
||||
self.session.call_plugin_serialized('glance', 'upload_vhd',
|
||||
**params).AndRaise(error)
|
||||
compute_utils.add_instance_fault_from_exc(self.context, self.instance,
|
||||
error, (fake.Failure,
|
||||
error,
|
||||
mox.IgnoreArg()))
|
||||
time.sleep(0.5)
|
||||
# Note(johngarbutt) XenServer 6.1 and later has this error
|
||||
error_details = ["", "signal: SIGTERM", "", ""]
|
||||
error = self.session.XenAPI.Failure(details=error_details)
|
||||
self.session.call_plugin_serialized('glance', 'upload_vhd',
|
||||
**params).AndRaise(error)
|
||||
compute_utils.add_instance_fault_from_exc(self.context, self.instance,
|
||||
error, (fake.Failure,
|
||||
error,
|
||||
mox.IgnoreArg()))
|
||||
time.sleep(1)
|
||||
self.session.call_plugin_serialized('glance', 'upload_vhd',
|
||||
**params)
|
||||
|
@@ -276,7 +276,7 @@ class FetchVhdImageTestCase(VMUtilsTestBase):
|
||||
self.mox.StubOutWithMock(
|
||||
self.session, 'call_plugin_serialized_with_retry')
|
||||
func = self.session.call_plugin_serialized_with_retry(
|
||||
'glance', 'download_vhd', 0, mox.IgnoreArg(),
|
||||
'glance', 'download_vhd', 0, mox.IgnoreArg(), mox.IgnoreArg(),
|
||||
extra_headers={'X-Service-Catalog': '[]',
|
||||
'X-Auth-Token': 'auth_token',
|
||||
'X-Roles': '',
|
||||
|
@@ -212,7 +212,8 @@ class XenAPISession(object):
|
||||
return pickle.loads(rv)
|
||||
|
||||
def call_plugin_serialized_with_retry(self, plugin, fn, num_retries,
|
||||
callback, *args, **kwargs):
|
||||
callback, retry_cb=None, *args,
|
||||
**kwargs):
|
||||
"""Allows a plugin to raise RetryableError so we can try again."""
|
||||
attempts = num_retries + 1
|
||||
sleep_time = 0.5
|
||||
@@ -237,6 +238,8 @@ class XenAPISession(object):
|
||||
if self._is_retryable_exception(exc, fn):
|
||||
LOG.warn(_('%(plugin)s.%(fn)s failed. Retrying call.')
|
||||
% {'plugin': plugin, 'fn': fn})
|
||||
if retry_cb:
|
||||
retry_cb(exc=exc)
|
||||
else:
|
||||
raise
|
||||
except socket.error as exc:
|
||||
@@ -244,6 +247,8 @@ class XenAPISession(object):
|
||||
LOG.warn(_('Lost connection to XenAPI during call to '
|
||||
'%(plugin)s.%(fn)s. Retrying call.') %
|
||||
{'plugin': plugin, 'fn': fn})
|
||||
if retry_cb:
|
||||
retry_cb(exc=exc)
|
||||
else:
|
||||
raise
|
||||
|
||||
|
@@ -13,19 +13,25 @@
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import functools
|
||||
import sys
|
||||
|
||||
from oslo.config import cfg
|
||||
|
||||
from nova.compute import utils as compute_utils
|
||||
from nova import exception
|
||||
from nova.image import glance
|
||||
from nova.openstack.common import log as logging
|
||||
from nova import utils
|
||||
from nova.virt.xenapi import vm_utils
|
||||
|
||||
CONF = cfg.CONF
|
||||
CONF.import_opt('num_retries', 'nova.image.glance', group='glance')
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GlanceStore(object):
|
||||
def _call_glance_plugin(self, session, fn, params):
|
||||
def _call_glance_plugin(self, context, instance, session, fn, params):
|
||||
glance_api_servers = glance.get_api_servers()
|
||||
|
||||
def pick_glance(kwargs):
|
||||
@@ -35,8 +41,17 @@ class GlanceStore(object):
|
||||
kwargs['glance_use_ssl'] = g_use_ssl
|
||||
return g_host
|
||||
|
||||
def retry_cb(context, instance, exc=None):
|
||||
if exc:
|
||||
exc_info = sys.exc_info()
|
||||
LOG.debug(exc.message, exc_info=exc_info)
|
||||
compute_utils.add_instance_fault_from_exc(
|
||||
context, instance, exc, exc_info)
|
||||
|
||||
cb = functools.partial(retry_cb, context, instance)
|
||||
|
||||
return session.call_plugin_serialized_with_retry(
|
||||
'glance', fn, CONF.glance.num_retries, pick_glance, **params)
|
||||
'glance', fn, CONF.glance.num_retries, pick_glance, cb, **params)
|
||||
|
||||
def _make_params(self, context, session, image_id):
|
||||
return {'image_id': image_id,
|
||||
@@ -48,7 +63,8 @@ class GlanceStore(object):
|
||||
params['uuid_stack'] = vm_utils._make_uuid_stack()
|
||||
|
||||
try:
|
||||
vdis = self._call_glance_plugin(session, 'download_vhd', params)
|
||||
vdis = self._call_glance_plugin(context, instance, session,
|
||||
'download_vhd', params)
|
||||
except exception.PluginRetriesExceeded:
|
||||
raise exception.CouldNotFetchImage(image_id=image_id)
|
||||
|
||||
@@ -72,6 +88,7 @@ class GlanceStore(object):
|
||||
props["auto_disk_config"] = "disabled"
|
||||
|
||||
try:
|
||||
self._call_glance_plugin(session, 'upload_vhd', params)
|
||||
self._call_glance_plugin(context, instance, session,
|
||||
'upload_vhd', params)
|
||||
except exception.PluginRetriesExceeded:
|
||||
raise exception.CouldNotUploadImage(image_id=image_id)
|
||||
|
@@ -25,9 +25,8 @@ import md5
|
||||
import socket
|
||||
import urllib2
|
||||
|
||||
import utils
|
||||
|
||||
import pluginlib_nova
|
||||
import utils
|
||||
|
||||
|
||||
pluginlib_nova.configure_logging('glance')
|
||||
@@ -205,7 +204,7 @@ def _upload_tarball(staging_path, image_id, glance_host, glance_port,
|
||||
None, staging_path, callback=send_chunked_transfer_encoded,
|
||||
compression_level=compression_level)
|
||||
finally:
|
||||
send_chunked_transfer_encoded('') # Chunked-Transfer terminator
|
||||
send_chunked_transfer_encoded('') # Chunked-Transfer terminator
|
||||
|
||||
bytes_written = callback_data['bytes_written']
|
||||
logging.info("Wrote %d bytes to %s" % (bytes_written, url))
|
||||
@@ -218,22 +217,54 @@ def _upload_tarball(staging_path, image_id, glance_host, glance_port,
|
||||
"Response Status: %i, Response body: %s"
|
||||
% (url, resp.status, resp.read()))
|
||||
|
||||
if resp.status in (httplib.UNAUTHORIZED,
|
||||
httplib.REQUEST_ENTITY_TOO_LARGE,
|
||||
httplib.PRECONDITION_FAILED,
|
||||
httplib.CONFLICT,
|
||||
httplib.FORBIDDEN,
|
||||
httplib.INTERNAL_SERVER_ERROR):
|
||||
# No point in retrying for these conditions
|
||||
raise PluginError("Got Error response [%i] while uploading "
|
||||
"image [%s] "
|
||||
"to glance host [%s:%s]"
|
||||
# Note(Jesse): This branch sorts errors into those that are permanent,
|
||||
# those that are ephemeral, and those that are unexpected.
|
||||
if resp.status in (httplib.BAD_REQUEST, # 400
|
||||
httplib.UNAUTHORIZED, # 401
|
||||
httplib.PAYMENT_REQUIRED, # 402
|
||||
httplib.FORBIDDEN, # 403
|
||||
httplib.NOT_FOUND, # 404
|
||||
httplib.METHOD_NOT_ALLOWED, # 405
|
||||
httplib.NOT_ACCEPTABLE, # 406
|
||||
httplib.PROXY_AUTHENTICATION_REQUIRED, # 407
|
||||
httplib.CONFLICT, # 409
|
||||
httplib.GONE, # 410
|
||||
httplib.LENGTH_REQUIRED, # 411
|
||||
httplib.PRECONDITION_FAILED, # 412
|
||||
httplib.REQUEST_ENTITY_TOO_LARGE, # 413
|
||||
httplib.REQUEST_URI_TOO_LONG, # 414
|
||||
httplib.UNSUPPORTED_MEDIA_TYPE, # 415
|
||||
httplib.REQUESTED_RANGE_NOT_SATISFIABLE, # 416
|
||||
httplib.EXPECTATION_FAILED, # 417
|
||||
httplib.UNPROCESSABLE_ENTITY, # 422
|
||||
httplib.LOCKED, # 423
|
||||
httplib.FAILED_DEPENDENCY, # 424
|
||||
httplib.UPGRADE_REQUIRED, # 426
|
||||
httplib.NOT_IMPLEMENTED, # 501
|
||||
httplib.HTTP_VERSION_NOT_SUPPORTED, # 505
|
||||
httplib.NOT_EXTENDED, # 510
|
||||
):
|
||||
raise PluginError("Got Permanent Error response [%i] while "
|
||||
"uploading image [%s] to glance host [%s:%s]"
|
||||
% (resp.status, image_id,
|
||||
glance_host, glance_port))
|
||||
elif resp.status in (httplib.REQUEST_TIMEOUT, # 408
|
||||
httplib.INTERNAL_SERVER_ERROR, # 500
|
||||
httplib.BAD_GATEWAY, # 502
|
||||
httplib.SERVICE_UNAVAILABLE, # 503
|
||||
httplib.GATEWAY_TIMEOUT, # 504
|
||||
httplib.INSUFFICIENT_STORAGE, # 507
|
||||
):
|
||||
raise RetryableError("Got Ephemeral Error response [%i] while "
|
||||
"uploading image [%s] to glance host [%s:%s]"
|
||||
% (resp.status, image_id,
|
||||
glance_host, glance_port))
|
||||
else:
|
||||
raise RetryableError("Unexpected response [%i] while uploading "
|
||||
"image [%s] "
|
||||
"to glance host [%s:%s]"
|
||||
# Note(Jesse): Assume unexpected errors are retryable. If you are
|
||||
# seeing this error message, the error should probably be added
|
||||
# to either the ephemeral or permanent error list.
|
||||
raise RetryableError("Got Unexpected Error response [%i] while "
|
||||
"uploading image [%s] to glance host [%s:%s]"
|
||||
% (resp.status, image_id,
|
||||
glance_host, glance_port))
|
||||
finally:
|
||||
|
Reference in New Issue
Block a user