libvirt: Add a rbd_connect_timeout configurable
Previously, the initial call to connect to an RBD cluster via the RADOS API could hang indefinitely if network or other environment-related issues were encountered. When this happens during a call to update_available_resource, it can result in the local n-cpu service reporting as UP while never being able to break out of a subsequent RPC timeout loop, as documented in bug #1834048. This change adds a simple timeout configurable to be used when initially connecting to the cluster [1][2][3]. The default timeout of 5 seconds is small enough to ensure that, if such a hang is encountered, the n-cpu service can be marked as DOWN before an RPC timeout is seen. [1] http://docs.ceph.com/docs/luminous/rados/api/python/#rados.Rados.connect [2] http://docs.ceph.com/docs/mimic/rados/api/python/#rados.Rados.connect [3] http://docs.ceph.com/docs/nautilus/rados/api/python/#rados.Rados.connect Closes-bug: #1834048 Change-Id: I67f341bf895d6cc5d503da274c089d443295199e
This commit is contained in:
@@ -1068,6 +1068,11 @@ the Ceph RBD server.
|
|||||||
cfg.StrOpt('rbd_secret_uuid',
|
cfg.StrOpt('rbd_secret_uuid',
|
||||||
help="""
|
help="""
|
||||||
The libvirt UUID of the secret for the rbd_user volumes.
|
The libvirt UUID of the secret for the rbd_user volumes.
|
||||||
|
"""),
|
||||||
|
cfg.IntOpt('rbd_connect_timeout',
|
||||||
|
default=5,
|
||||||
|
help="""
|
||||||
|
The RADOS client timeout in seconds when initially connecting to the cluster.
|
||||||
"""),
|
"""),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@@ -83,7 +83,9 @@ class RbdTestCase(test.NoDBTestCase):
|
|||||||
self.mock_rbd.ImageHasSnapshots = FakeException
|
self.mock_rbd.ImageHasSnapshots = FakeException
|
||||||
|
|
||||||
self.rbd_pool = 'rbd'
|
self.rbd_pool = 'rbd'
|
||||||
self.driver = rbd_utils.RBDDriver(self.rbd_pool, None, None)
|
self.rbd_connect_timeout = 5
|
||||||
|
self.driver = rbd_utils.RBDDriver(self.rbd_pool, None, None,
|
||||||
|
self.rbd_connect_timeout)
|
||||||
|
|
||||||
self.volume_name = u'volume-00000001'
|
self.volume_name = u'volume-00000001'
|
||||||
self.snap_name = u'test-snap'
|
self.snap_name = u'test-snap'
|
||||||
@@ -276,7 +278,8 @@ class RbdTestCase(test.NoDBTestCase):
|
|||||||
|
|
||||||
def test_connect_to_rados_default(self):
|
def test_connect_to_rados_default(self):
|
||||||
ret = self.driver._connect_to_rados()
|
ret = self.driver._connect_to_rados()
|
||||||
self.assertTrue(self.mock_rados.Rados.connect.called)
|
self.mock_rados.Rados.connect.assert_called_once_with(
|
||||||
|
timeout=self.rbd_connect_timeout)
|
||||||
self.assertTrue(self.mock_rados.Rados.open_ioctx.called)
|
self.assertTrue(self.mock_rados.Rados.open_ioctx.called)
|
||||||
self.assertIsInstance(ret[0], self.mock_rados.Rados)
|
self.assertIsInstance(ret[0], self.mock_rados.Rados)
|
||||||
self.assertEqual(self.mock_rados.Rados.ioctx, ret[1])
|
self.assertEqual(self.mock_rados.Rados.ioctx, ret[1])
|
||||||
@@ -284,7 +287,8 @@ class RbdTestCase(test.NoDBTestCase):
|
|||||||
|
|
||||||
def test_connect_to_rados_different_pool(self):
|
def test_connect_to_rados_different_pool(self):
|
||||||
ret = self.driver._connect_to_rados('alt_pool')
|
ret = self.driver._connect_to_rados('alt_pool')
|
||||||
self.assertTrue(self.mock_rados.Rados.connect.called)
|
self.mock_rados.Rados.connect.assert_called_once_with(
|
||||||
|
timeout=self.rbd_connect_timeout)
|
||||||
self.assertTrue(self.mock_rados.Rados.open_ioctx.called)
|
self.assertTrue(self.mock_rados.Rados.open_ioctx.called)
|
||||||
self.assertIsInstance(ret[0], self.mock_rados.Rados)
|
self.assertIsInstance(ret[0], self.mock_rados.Rados)
|
||||||
self.assertEqual(self.mock_rados.Rados.ioctx, ret[1])
|
self.assertEqual(self.mock_rados.Rados.ioctx, ret[1])
|
||||||
|
@@ -1262,7 +1262,8 @@ class LibvirtDriver(driver.ComputeDriver):
|
|||||||
return rbd_utils.RBDDriver(
|
return rbd_utils.RBDDriver(
|
||||||
pool=CONF.libvirt.images_rbd_pool,
|
pool=CONF.libvirt.images_rbd_pool,
|
||||||
ceph_conf=CONF.libvirt.images_rbd_ceph_conf,
|
ceph_conf=CONF.libvirt.images_rbd_ceph_conf,
|
||||||
rbd_user=CONF.libvirt.rbd_user)
|
rbd_user=CONF.libvirt.rbd_user,
|
||||||
|
rbd_connect_timeout=CONF.libvirt.rbd_connect_timeout)
|
||||||
|
|
||||||
def _cleanup_rbd(self, instance):
|
def _cleanup_rbd(self, instance):
|
||||||
# NOTE(nic): On revert_resize, the cleanup steps for the root
|
# NOTE(nic): On revert_resize, the cleanup steps for the root
|
||||||
|
@@ -847,6 +847,7 @@ class Rbd(Image):
|
|||||||
|
|
||||||
self.pool = CONF.libvirt.images_rbd_pool
|
self.pool = CONF.libvirt.images_rbd_pool
|
||||||
self.rbd_user = CONF.libvirt.rbd_user
|
self.rbd_user = CONF.libvirt.rbd_user
|
||||||
|
self.rbd_connect_timeout = CONF.libvirt.rbd_connect_timeout
|
||||||
self.ceph_conf = CONF.libvirt.images_rbd_ceph_conf
|
self.ceph_conf = CONF.libvirt.images_rbd_ceph_conf
|
||||||
|
|
||||||
path = 'rbd:%s/%s' % (self.pool, self.rbd_name)
|
path = 'rbd:%s/%s' % (self.pool, self.rbd_name)
|
||||||
@@ -860,7 +861,8 @@ class Rbd(Image):
|
|||||||
self.driver = rbd_utils.RBDDriver(
|
self.driver = rbd_utils.RBDDriver(
|
||||||
pool=self.pool,
|
pool=self.pool,
|
||||||
ceph_conf=self.ceph_conf,
|
ceph_conf=self.ceph_conf,
|
||||||
rbd_user=self.rbd_user)
|
rbd_user=self.rbd_user,
|
||||||
|
rbd_connect_timeout=self.rbd_connect_timeout)
|
||||||
|
|
||||||
self.discard_mode = CONF.libvirt.hw_disk_discard
|
self.discard_mode = CONF.libvirt.hw_disk_discard
|
||||||
|
|
||||||
|
@@ -118,12 +118,13 @@ class RADOSClient(object):
|
|||||||
|
|
||||||
class RBDDriver(object):
|
class RBDDriver(object):
|
||||||
|
|
||||||
def __init__(self, pool, ceph_conf, rbd_user):
|
def __init__(self, pool, ceph_conf, rbd_user, rbd_connect_timeout):
|
||||||
self.pool = pool
|
self.pool = pool
|
||||||
# NOTE(angdraug): rados.Rados fails to connect if ceph_conf is None:
|
# NOTE(angdraug): rados.Rados fails to connect if ceph_conf is None:
|
||||||
# https://github.com/ceph/ceph/pull/1787
|
# https://github.com/ceph/ceph/pull/1787
|
||||||
self.ceph_conf = ceph_conf or ''
|
self.ceph_conf = ceph_conf or ''
|
||||||
self.rbd_user = rbd_user or None
|
self.rbd_user = rbd_user or None
|
||||||
|
self.rbd_connect_timeout = rbd_connect_timeout
|
||||||
if rbd is None:
|
if rbd is None:
|
||||||
raise RuntimeError(_('rbd python libraries not found'))
|
raise RuntimeError(_('rbd python libraries not found'))
|
||||||
|
|
||||||
@@ -131,7 +132,7 @@ class RBDDriver(object):
|
|||||||
client = rados.Rados(rados_id=self.rbd_user,
|
client = rados.Rados(rados_id=self.rbd_user,
|
||||||
conffile=self.ceph_conf)
|
conffile=self.ceph_conf)
|
||||||
try:
|
try:
|
||||||
client.connect()
|
client.connect(timeout=self.rbd_connect_timeout)
|
||||||
pool_to_open = pool or self.pool
|
pool_to_open = pool or self.pool
|
||||||
# NOTE(luogangyi): open_ioctx >= 10.1.0 could handle unicode
|
# NOTE(luogangyi): open_ioctx >= 10.1.0 could handle unicode
|
||||||
# arguments perfectly as part of Python 3 support.
|
# arguments perfectly as part of Python 3 support.
|
||||||
|
12
releasenotes/notes/bug-1834048-8b19ae1c5048b801.yaml
Normal file
12
releasenotes/notes/bug-1834048-8b19ae1c5048b801.yaml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
---
|
||||||
|
other:
|
||||||
|
- |
|
||||||
|
A new ``[libvirt]/rbd_connect_timeout`` configuration option has been
|
||||||
|
introduced to limit the time spent waiting when connecting to an RBD cluster
|
||||||
|
via the RADOS API. This timeout currently defaults to 5 seconds.
|
||||||
|
|
||||||
|
This aims to address issues reported in `bug 1834048`_ where failures to
|
||||||
|
initially connect to an RBD cluster left the nova-compute service inoperable
|
||||||
|
due to constant RPC timeouts being hit.
|
||||||
|
|
||||||
|
.. _bug 1834048: https://bugs.launchpad.net/nova/+bug/1834048
|
Reference in New Issue
Block a user