update retryable errors & instance fault on retry

HTTP errors can be split into a few categories: client ephemeral, server
ephemeral, server permanent, and globally permanent. You could argue
there is even more permutations. However, for simplicity, these errors
can be viewed as ephemeral and permanent.

The Glance XenAPI plugin has been updated to raise a RetryableError for
ephemeral and unexpected (i.e. errors that are categorized as neither
permanent or ephemeral) errors.

Additionally, an instance fault will be logged every time an error
occurs. This will serve as transaction history where every attempt is a
transaction on the state. If an ephemeral error occurs, there is a
retry, then a permanent error, the history of each error will be in the
instance_faults.

Deployers should configure the num_retries relative to the number of
api_servers. Right now, servers are put in random order and then cycled.
Trying the same server multiple times could cause unnecessary load and
delays. However, trying multiple servers, is ideal when a single server
is behaving badly or cannot reach another server it needs to communicate
with to fulfill a request.

Closes-Bug: 1380776

Change-Id: I267a5b524c3ff8a28edf1a2285b77bb09049773c
This commit is contained in:
Jesse J. Cook
2014-10-13 14:08:52 -05:00
parent 632a034c9a
commit cc2a34ce05
7 changed files with 129 additions and 28 deletions

View File

@@ -69,7 +69,8 @@ glance_opts = [
deprecated_name='glance_api_insecure'),
cfg.IntOpt('num_retries',
default=0,
help='Number of retries when downloading an image from glance',
help='Number of retries when uploading / downloading an image '
'to / from glance.',
deprecated_group='DEFAULT',
deprecated_name='glance_num_retries'),
cfg.ListOpt('allowed_direct_url_schemes',

View File

@@ -112,14 +112,16 @@ class CallPluginTestCase(stubs.XenAPITestBaseNoDB):
fn = 'download_vhd'
num_retries = 1
callback = None
retry_cb = mock.Mock()
with mock.patch.object(self.session, 'call_plugin_serialized',
autospec=True) as call_plugin_serialized:
call_plugin_serialized.side_effect = exc
self.assertRaises(exception.PluginRetriesExceeded,
self.session.call_plugin_serialized_with_retry, plugin, fn,
num_retries, callback)
num_retries, callback, retry_cb)
call_plugin_serialized.assert_called_with(plugin, fn)
self.assertEqual(2, call_plugin_serialized.call_count)
self.assertEqual(2, retry_cb.call_count)
def test_serialized_with_retry_socket_error_reraised(self):
exc = socket.error
@@ -128,10 +130,29 @@ class CallPluginTestCase(stubs.XenAPITestBaseNoDB):
fn = 'download_vhd'
num_retries = 1
callback = None
retry_cb = mock.Mock()
with mock.patch.object(self.session, 'call_plugin_serialized',
autospec=True) as call_plugin_serialized:
call_plugin_serialized.side_effect = exc
self.assertRaises(socket.error,
self.session.call_plugin_serialized_with_retry, plugin, fn,
num_retries, callback)
num_retries, callback, retry_cb)
call_plugin_serialized.assert_called_once_with(plugin, fn)
self.assertEqual(0, retry_cb.call_count)
def test_serialized_with_retry_socket_reset_reraised(self):
exc = socket.error
exc.errno = errno.ECONNRESET
plugin = 'glance'
fn = 'download_vhd'
num_retries = 1
callback = None
retry_cb = mock.Mock()
with mock.patch.object(self.session, 'call_plugin_serialized',
autospec=True) as call_plugin_serialized:
call_plugin_serialized.side_effect = exc
self.assertRaises(exception.PluginRetriesExceeded,
self.session.call_plugin_serialized_with_retry, plugin, fn,
num_retries, callback, retry_cb)
call_plugin_serialized.assert_called_with(plugin, fn)
self.assertEqual(2, call_plugin_serialized.call_count)

View File

@@ -17,7 +17,9 @@ import random
import time
import mock
from mox3 import mox
from nova.compute import utils as compute_utils
from nova import context
from nova import exception
from nova.openstack.common import log as logging
@@ -93,11 +95,11 @@ class TestGlanceStore(stubs.XenAPITestBaseNoDB):
@mock.patch.object(vm_utils, '_make_uuid_stack', return_value=['uuid1'])
@mock.patch.object(random, 'shuffle')
@mock.patch.object(time, 'sleep')
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
@mock.patch.object(logging.getLogger('nova.virt.xenapi.client.session'),
'debug')
def test_download_image_retry(self, mock_log_debug,
mock_sleep, mock_shuffle,
mock_make_uuid_stack):
def test_download_image_retry(self, mock_log_debug, mock_fault, mock_sleep,
mock_shuffle, mock_make_uuid_stack):
params = self._get_download_params()
self.flags(num_retries=2, group='glance')
@@ -132,6 +134,8 @@ class TestGlanceStore(stubs.XenAPITestBaseNoDB):
mock_call_plugin_serialized.assert_has_calls(calls)
mock_log_debug.assert_has_calls(log_calls, any_order=True)
self.assertEqual(1, mock_fault.call_count)
def _get_upload_params(self, auto_disk_config=True,
expected_os_type='default'):
params = self._get_params()
@@ -186,16 +190,29 @@ class TestGlanceStore(stubs.XenAPITestBaseNoDB):
self.mox.StubOutWithMock(self.session, 'call_plugin_serialized')
self.mox.StubOutWithMock(time, 'sleep')
self.mox.StubOutWithMock(compute_utils, 'add_instance_fault_from_exc')
error_details = ["", "", "RetryableError", ""]
error = self.session.XenAPI.Failure(details=error_details)
self.session.call_plugin_serialized('glance', 'upload_vhd',
**params).AndRaise(error)
compute_utils.add_instance_fault_from_exc(self.context, self.instance,
error, (fake.Failure,
error,
mox.IgnoreArg()))
time.sleep(0.5)
self.session.call_plugin_serialized('glance', 'upload_vhd',
**params).AndRaise(error)
compute_utils.add_instance_fault_from_exc(self.context, self.instance,
error, (fake.Failure,
error,
mox.IgnoreArg()))
time.sleep(1)
self.session.call_plugin_serialized('glance', 'upload_vhd',
**params).AndRaise(error)
compute_utils.add_instance_fault_from_exc(self.context, self.instance,
error, (fake.Failure,
error,
mox.IgnoreArg()))
self.mox.ReplayAll()
self.assertRaises(exception.CouldNotUploadImage,
@@ -210,16 +227,25 @@ class TestGlanceStore(stubs.XenAPITestBaseNoDB):
self.mox.StubOutWithMock(self.session, 'call_plugin_serialized')
self.mox.StubOutWithMock(time, 'sleep')
self.mox.StubOutWithMock(compute_utils, 'add_instance_fault_from_exc')
error_details = ["", "task signaled", "", ""]
error = self.session.XenAPI.Failure(details=error_details)
self.session.call_plugin_serialized('glance', 'upload_vhd',
**params).AndRaise(error)
compute_utils.add_instance_fault_from_exc(self.context, self.instance,
error, (fake.Failure,
error,
mox.IgnoreArg()))
time.sleep(0.5)
# Note(johngarbutt) XenServer 6.1 and later has this error
error_details = ["", "signal: SIGTERM", "", ""]
error = self.session.XenAPI.Failure(details=error_details)
self.session.call_plugin_serialized('glance', 'upload_vhd',
**params).AndRaise(error)
compute_utils.add_instance_fault_from_exc(self.context, self.instance,
error, (fake.Failure,
error,
mox.IgnoreArg()))
time.sleep(1)
self.session.call_plugin_serialized('glance', 'upload_vhd',
**params)

View File

@@ -276,7 +276,7 @@ class FetchVhdImageTestCase(VMUtilsTestBase):
self.mox.StubOutWithMock(
self.session, 'call_plugin_serialized_with_retry')
func = self.session.call_plugin_serialized_with_retry(
'glance', 'download_vhd', 0, mox.IgnoreArg(),
'glance', 'download_vhd', 0, mox.IgnoreArg(), mox.IgnoreArg(),
extra_headers={'X-Service-Catalog': '[]',
'X-Auth-Token': 'auth_token',
'X-Roles': '',

View File

@@ -212,7 +212,8 @@ class XenAPISession(object):
return pickle.loads(rv)
def call_plugin_serialized_with_retry(self, plugin, fn, num_retries,
callback, *args, **kwargs):
callback, retry_cb=None, *args,
**kwargs):
"""Allows a plugin to raise RetryableError so we can try again."""
attempts = num_retries + 1
sleep_time = 0.5
@@ -237,6 +238,8 @@ class XenAPISession(object):
if self._is_retryable_exception(exc, fn):
LOG.warn(_('%(plugin)s.%(fn)s failed. Retrying call.')
% {'plugin': plugin, 'fn': fn})
if retry_cb:
retry_cb(exc=exc)
else:
raise
except socket.error as exc:
@@ -244,6 +247,8 @@ class XenAPISession(object):
LOG.warn(_('Lost connection to XenAPI during call to '
'%(plugin)s.%(fn)s. Retrying call.') %
{'plugin': plugin, 'fn': fn})
if retry_cb:
retry_cb(exc=exc)
else:
raise

View File

@@ -13,19 +13,25 @@
# License for the specific language governing permissions and limitations
# under the License.
import functools
import sys
from oslo.config import cfg
from nova.compute import utils as compute_utils
from nova import exception
from nova.image import glance
from nova.openstack.common import log as logging
from nova import utils
from nova.virt.xenapi import vm_utils
CONF = cfg.CONF
CONF.import_opt('num_retries', 'nova.image.glance', group='glance')
LOG = logging.getLogger(__name__)
class GlanceStore(object):
def _call_glance_plugin(self, session, fn, params):
def _call_glance_plugin(self, context, instance, session, fn, params):
glance_api_servers = glance.get_api_servers()
def pick_glance(kwargs):
@@ -35,8 +41,17 @@ class GlanceStore(object):
kwargs['glance_use_ssl'] = g_use_ssl
return g_host
def retry_cb(context, instance, exc=None):
if exc:
exc_info = sys.exc_info()
LOG.debug(exc.message, exc_info=exc_info)
compute_utils.add_instance_fault_from_exc(
context, instance, exc, exc_info)
cb = functools.partial(retry_cb, context, instance)
return session.call_plugin_serialized_with_retry(
'glance', fn, CONF.glance.num_retries, pick_glance, **params)
'glance', fn, CONF.glance.num_retries, pick_glance, cb, **params)
def _make_params(self, context, session, image_id):
return {'image_id': image_id,
@@ -48,7 +63,8 @@ class GlanceStore(object):
params['uuid_stack'] = vm_utils._make_uuid_stack()
try:
vdis = self._call_glance_plugin(session, 'download_vhd', params)
vdis = self._call_glance_plugin(context, instance, session,
'download_vhd', params)
except exception.PluginRetriesExceeded:
raise exception.CouldNotFetchImage(image_id=image_id)
@@ -72,6 +88,7 @@ class GlanceStore(object):
props["auto_disk_config"] = "disabled"
try:
self._call_glance_plugin(session, 'upload_vhd', params)
self._call_glance_plugin(context, instance, session,
'upload_vhd', params)
except exception.PluginRetriesExceeded:
raise exception.CouldNotUploadImage(image_id=image_id)

View File

@@ -25,9 +25,8 @@ import md5
import socket
import urllib2
import utils
import pluginlib_nova
import utils
pluginlib_nova.configure_logging('glance')
@@ -205,7 +204,7 @@ def _upload_tarball(staging_path, image_id, glance_host, glance_port,
None, staging_path, callback=send_chunked_transfer_encoded,
compression_level=compression_level)
finally:
send_chunked_transfer_encoded('') # Chunked-Transfer terminator
send_chunked_transfer_encoded('') # Chunked-Transfer terminator
bytes_written = callback_data['bytes_written']
logging.info("Wrote %d bytes to %s" % (bytes_written, url))
@@ -218,22 +217,54 @@ def _upload_tarball(staging_path, image_id, glance_host, glance_port,
"Response Status: %i, Response body: %s"
% (url, resp.status, resp.read()))
if resp.status in (httplib.UNAUTHORIZED,
httplib.REQUEST_ENTITY_TOO_LARGE,
httplib.PRECONDITION_FAILED,
httplib.CONFLICT,
httplib.FORBIDDEN,
httplib.INTERNAL_SERVER_ERROR):
# No point in retrying for these conditions
raise PluginError("Got Error response [%i] while uploading "
"image [%s] "
"to glance host [%s:%s]"
# Note(Jesse): This branch sorts errors into those that are permanent,
# those that are ephemeral, and those that are unexpected.
if resp.status in (httplib.BAD_REQUEST, # 400
httplib.UNAUTHORIZED, # 401
httplib.PAYMENT_REQUIRED, # 402
httplib.FORBIDDEN, # 403
httplib.NOT_FOUND, # 404
httplib.METHOD_NOT_ALLOWED, # 405
httplib.NOT_ACCEPTABLE, # 406
httplib.PROXY_AUTHENTICATION_REQUIRED, # 407
httplib.CONFLICT, # 409
httplib.GONE, # 410
httplib.LENGTH_REQUIRED, # 411
httplib.PRECONDITION_FAILED, # 412
httplib.REQUEST_ENTITY_TOO_LARGE, # 413
httplib.REQUEST_URI_TOO_LONG, # 414
httplib.UNSUPPORTED_MEDIA_TYPE, # 415
httplib.REQUESTED_RANGE_NOT_SATISFIABLE, # 416
httplib.EXPECTATION_FAILED, # 417
httplib.UNPROCESSABLE_ENTITY, # 422
httplib.LOCKED, # 423
httplib.FAILED_DEPENDENCY, # 424
httplib.UPGRADE_REQUIRED, # 426
httplib.NOT_IMPLEMENTED, # 501
httplib.HTTP_VERSION_NOT_SUPPORTED, # 505
httplib.NOT_EXTENDED, # 510
):
raise PluginError("Got Permanent Error response [%i] while "
"uploading image [%s] to glance host [%s:%s]"
% (resp.status, image_id,
glance_host, glance_port))
elif resp.status in (httplib.REQUEST_TIMEOUT, # 408
httplib.INTERNAL_SERVER_ERROR, # 500
httplib.BAD_GATEWAY, # 502
httplib.SERVICE_UNAVAILABLE, # 503
httplib.GATEWAY_TIMEOUT, # 504
httplib.INSUFFICIENT_STORAGE, # 507
):
raise RetryableError("Got Ephemeral Error response [%i] while "
"uploading image [%s] to glance host [%s:%s]"
% (resp.status, image_id,
glance_host, glance_port))
else:
raise RetryableError("Unexpected response [%i] while uploading "
"image [%s] "
"to glance host [%s:%s]"
# Note(Jesse): Assume unexpected errors are retryable. If you are
# seeing this error message, the error should probably be added
# to either the ephemeral or permanent error list.
raise RetryableError("Got Unexpected Error response [%i] while "
"uploading image [%s] to glance host [%s:%s]"
% (resp.status, image_id,
glance_host, glance_port))
finally: