hypervisors: Optimize uptime retrieval for better performance
The /os-hypervisors/detail API endpoint was experiencing significant performance issues in environments with many compute nodes when using microversion 2.88 or higher, as it made sequential RPC calls to gather uptime information from each compute node. This change optimizes uptime retrieval by: * Adding uptime to periodic resource updates sent by nova-compute to the database, eliminating synchronous RPC calls during API requests * Restricting RPC-based uptime retrieval to hypervisor types that support it (libvirt and z/VM), avoiding unnecessary calls that would always fail * Preferring cached database uptime data over RPC calls when available Closes-Bug: #2122036 Assisted-By: Claude <noreply@anthropic.com> Change-Id: I5723320f578192f7e0beead7d5df5d7e47d54d2b Co-Authored-By: Sylvain Bauza <sbauza@redhat.com> Signed-off-by: Sean Mooney <work@seanmooney.info>
This commit is contained in:
@@ -3902,8 +3902,11 @@ hypervisor_type_body:
|
||||
type: string
|
||||
hypervisor_uptime:
|
||||
description: |
|
||||
The total uptime of the hypervisor and information about average load. Only
|
||||
reported for active hosts where the virt driver supports this feature.
|
||||
The response format of this API depends on the virt driver in use on a
|
||||
given host. The libvirt driver returns the output of the `uptime` command
|
||||
directly, the z/VM driver returns the `ILP` time. All other drivers
|
||||
always return `null`. Note this value is cached and updated periodically.
|
||||
|
||||
in: body
|
||||
required: true
|
||||
type: string
|
||||
|
@@ -96,18 +96,24 @@ class HypervisorsController(wsgi.Controller):
|
||||
|
||||
# The 2.88 microversion also *added* the 'uptime' field to the response
|
||||
if detail and api_version_request.is_supported(req, '2.88'):
|
||||
try:
|
||||
hyp_dict['uptime'] = self.host_api.get_host_uptime(
|
||||
req.environ['nova.context'], hypervisor.host)
|
||||
except (
|
||||
NotImplementedError,
|
||||
exception.ComputeServiceUnavailable,
|
||||
exception.HostMappingNotFound,
|
||||
exception.HostNotFound,
|
||||
):
|
||||
# Not all virt drivers support this, and it's not generally
|
||||
# possible to get uptime for a down host
|
||||
hyp_dict['uptime'] = None
|
||||
uptime = None
|
||||
if "stats" in hypervisor and "uptime" in hypervisor.stats:
|
||||
uptime = hypervisor.stats.get("uptime")
|
||||
else:
|
||||
try:
|
||||
uptime = self.host_api.get_host_uptime(
|
||||
req.environ['nova.context'], hypervisor.host)
|
||||
except (
|
||||
NotImplementedError, # only raised in tests
|
||||
exception.ComputeServiceUnavailable,
|
||||
exception.HostMappingNotFound,
|
||||
exception.HostNotFound,
|
||||
):
|
||||
# Only libvirt and ZVM drivers support this, and it's
|
||||
# not generally possible to get uptime for a down host
|
||||
pass
|
||||
|
||||
hyp_dict['uptime'] = uptime
|
||||
|
||||
if servers:
|
||||
hyp_dict['servers'] = [
|
||||
|
@@ -1173,7 +1173,8 @@ class ResourceTracker(object):
|
||||
"used_disk=%(used_disk)sGB "
|
||||
"total_vcpus=%(total_vcpus)s "
|
||||
"used_vcpus=%(used_vcpus)s "
|
||||
"pci_stats=%(pci_stats)s",
|
||||
"pci_stats=%(pci_stats)s "
|
||||
"stats=%(stats)s",
|
||||
{'node': nodename,
|
||||
'phys_ram': cn.memory_mb,
|
||||
'used_ram': cn.memory_mb_used,
|
||||
@@ -1181,7 +1182,9 @@ class ResourceTracker(object):
|
||||
'used_disk': cn.local_gb_used,
|
||||
'total_vcpus': tcpu,
|
||||
'used_vcpus': ucpu,
|
||||
'pci_stats': pci_stats})
|
||||
'pci_stats': pci_stats,
|
||||
'stats': cn.stats or {}
|
||||
})
|
||||
|
||||
def _resource_change(self, compute_node):
|
||||
"""Check to see if any resources have changed."""
|
||||
|
@@ -37,6 +37,12 @@ class Stats(dict):
|
||||
if stats is None:
|
||||
return
|
||||
if isinstance(stats, dict):
|
||||
# use None as a sentinel to signal to the API that
|
||||
# the driver does not support uptime
|
||||
# setdefault will update the dict if and only if
|
||||
# uptime is not set, and then returns the value.
|
||||
# since we don't need it, we just discard the result
|
||||
stats.setdefault('uptime', None)
|
||||
self.update(stats)
|
||||
return
|
||||
raise ValueError(_('Unexpected type adding stats'))
|
||||
|
@@ -47,7 +47,7 @@ TEST_HYPERS = [
|
||||
vcpus_used=2,
|
||||
memory_mb_used=5 * 1024,
|
||||
local_gb_used=125,
|
||||
hypervisor_type="xen",
|
||||
hypervisor_type="qemu",
|
||||
hypervisor_version=3,
|
||||
hypervisor_hostname="hyper1",
|
||||
free_ram_mb=5 * 1024,
|
||||
@@ -67,7 +67,7 @@ TEST_HYPERS = [
|
||||
vcpus_used=2,
|
||||
memory_mb_used=5 * 1024,
|
||||
local_gb_used=125,
|
||||
hypervisor_type="xen",
|
||||
hypervisor_type="qemu",
|
||||
hypervisor_version=3,
|
||||
hypervisor_hostname="hyper2",
|
||||
free_ram_mb=5 * 1024,
|
||||
@@ -76,7 +76,8 @@ TEST_HYPERS = [
|
||||
running_vms=2,
|
||||
cpu_info=CPU_INFO,
|
||||
disk_available_least=100,
|
||||
host_ip=netaddr.IPAddress('2.2.2.2'))]
|
||||
host_ip=netaddr.IPAddress('2.2.2.2'),
|
||||
stats={'uptime': 'fake uptime'})]
|
||||
|
||||
|
||||
TEST_SERVICES = [
|
||||
@@ -203,6 +204,11 @@ class HypervisorsTestV21(test.NoDBTestCase):
|
||||
del DETAIL_HYPERS_DICTS[1]['host']
|
||||
del DETAIL_HYPERS_DICTS[0]['uuid']
|
||||
del DETAIL_HYPERS_DICTS[1]['uuid']
|
||||
# Remove stats since it's not exposed in the API response, but preserve
|
||||
# uptime for v2.88+ tests which expect it
|
||||
for hyper_dict in DETAIL_HYPERS_DICTS:
|
||||
if 'stats' in hyper_dict:
|
||||
del hyper_dict['stats']
|
||||
DETAIL_HYPERS_DICTS[0].update({'state': 'up',
|
||||
'status': 'enabled',
|
||||
'service': dict(id=1, host='compute1',
|
||||
@@ -850,7 +856,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
|
||||
'free_ram_mb': 5120,
|
||||
'host_ip': netaddr.IPAddress('2.2.2.2'),
|
||||
'hypervisor_hostname': 'hyper2',
|
||||
'hypervisor_type': 'xen',
|
||||
'hypervisor_type': 'qemu',
|
||||
'hypervisor_version': 3,
|
||||
'id': 2,
|
||||
'local_gb': 250,
|
||||
@@ -904,7 +910,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
|
||||
'free_ram_mb': 5120,
|
||||
'host_ip': netaddr.IPAddress('2.2.2.2'),
|
||||
'hypervisor_hostname': 'hyper2',
|
||||
'hypervisor_type': 'xen',
|
||||
'hypervisor_type': 'qemu',
|
||||
'hypervisor_version': 3,
|
||||
'id': 2,
|
||||
'local_gb': 250,
|
||||
@@ -951,7 +957,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
|
||||
'free_ram_mb': 5120,
|
||||
'host_ip': netaddr.IPAddress('2.2.2.2'),
|
||||
'hypervisor_hostname': 'hyper2',
|
||||
'hypervisor_type': 'xen',
|
||||
'hypervisor_type': 'qemu',
|
||||
'hypervisor_version': 3,
|
||||
'id': 2,
|
||||
'local_gb': 250,
|
||||
@@ -1448,6 +1454,21 @@ class HypervisorsTestV288(HypervisorsTestV275):
|
||||
# cpu_info is no longer included in the response, so skip this test
|
||||
pass
|
||||
|
||||
def test_show_with_uptime_provided_by_compute_node(self):
|
||||
req = self._get_request(use_admin_context=True)
|
||||
result = self.controller.show(req, self.TEST_HYPERS_OBJ[1].uuid)
|
||||
expected_dict = copy.deepcopy(self.DETAIL_HYPERS_DICTS[1])
|
||||
self.assertEqual({'hypervisor': expected_dict}, result)
|
||||
self.controller.host_api.get_host_uptime.assert_not_called()
|
||||
|
||||
def test_detail_list_uptime(self):
|
||||
_ = self._test_servers_with_no_servers(self.controller.detail)
|
||||
# we have simulated that compute 2 is upgraded to store the uptime
|
||||
# in the stats so we expect 1 call to get the result via RPC
|
||||
# for compute1
|
||||
self.controller.host_api.get_host_uptime.assert_called_with(
|
||||
mock.ANY, "compute1")
|
||||
|
||||
def test_uptime(self):
|
||||
req = self._get_request(True)
|
||||
self.assertRaises(
|
||||
|
@@ -22996,18 +22996,18 @@ class HostStateTestCase(test.NoDBTestCase):
|
||||
|
||||
drvr = HostStateTestCase.FakeConnection()
|
||||
|
||||
stats = drvr.get_available_resource("compute1")
|
||||
self.assertEqual(stats["vcpus"], 1)
|
||||
self.assertEqual(stats["memory_mb"], 497)
|
||||
self.assertEqual(stats["local_gb"], 100)
|
||||
self.assertEqual(stats["vcpus_used"], 0)
|
||||
self.assertEqual(stats["memory_mb_used"], 88)
|
||||
self.assertEqual(stats["local_gb_used"], 20)
|
||||
self.assertEqual(stats["hypervisor_type"], 'QEMU')
|
||||
self.assertEqual(stats["hypervisor_version"],
|
||||
res = drvr.get_available_resource("compute1")
|
||||
self.assertEqual(res["vcpus"], 1)
|
||||
self.assertEqual(res["memory_mb"], 497)
|
||||
self.assertEqual(res["local_gb"], 100)
|
||||
self.assertEqual(res["vcpus_used"], 0)
|
||||
self.assertEqual(res["memory_mb_used"], 88)
|
||||
self.assertEqual(res["local_gb_used"], 20)
|
||||
self.assertEqual(res["hypervisor_type"], 'QEMU')
|
||||
self.assertEqual(res["hypervisor_version"],
|
||||
fakelibvirt.FAKE_QEMU_VERSION)
|
||||
self.assertEqual(stats["hypervisor_hostname"], 'compute1')
|
||||
cpu_info = jsonutils.loads(stats["cpu_info"])
|
||||
self.assertEqual(res["hypervisor_hostname"], 'compute1')
|
||||
cpu_info = jsonutils.loads(res["cpu_info"])
|
||||
self.assertEqual(cpu_info,
|
||||
{"vendor": "Intel", "model": "pentium",
|
||||
"arch": fields.Architecture.I686,
|
||||
@@ -23017,12 +23017,13 @@ class HostStateTestCase(test.NoDBTestCase):
|
||||
"topology": {"cores": "1", "threads": "1", "sockets": "1"},
|
||||
"maxphysaddr": {"mode": "emulate", "bits": "42"}
|
||||
})
|
||||
self.assertEqual(stats["disk_available_least"], 80)
|
||||
self.assertEqual(jsonutils.loads(stats["pci_passthrough_devices"]),
|
||||
self.assertEqual(res["disk_available_least"], 80)
|
||||
self.assertEqual(jsonutils.loads(res["pci_passthrough_devices"]),
|
||||
HostStateTestCase.pci_devices)
|
||||
self.assertEqual(objects.NUMATopology.obj_from_db_obj(
|
||||
stats['numa_topology']),
|
||||
res['numa_topology']),
|
||||
HostStateTestCase.numa_topology)
|
||||
self.assertEqual(res['stats']['uptime'], drvr.get_host_uptime())
|
||||
|
||||
|
||||
class TestUpdateProviderTree(test.NoDBTestCase):
|
||||
|
@@ -128,8 +128,11 @@ class TestZVMDriver(test.NoDBTestCase):
|
||||
self.assertRaises(exception.ZVMDriverException,
|
||||
zvmdriver.ZVMDriver, 'virtapi')
|
||||
|
||||
@mock.patch(
|
||||
'nova.virt.zvm.driver.ZVMDriver.get_host_uptime',
|
||||
return_value='IPL at 11/14/17 10:47:44 EST')
|
||||
@mock.patch('nova.virt.zvm.utils.ConnectorClient.call')
|
||||
def test_get_available_resource_err_case(self, call):
|
||||
def test_get_available_resource_err_case(self, call, uptime_mock):
|
||||
res = {'overallRC': 1, 'errmsg': 'err', 'rc': 0, 'rs': 0}
|
||||
call.side_effect = exception.ZVMConnectorError(results=res)
|
||||
results = self._driver.get_available_resource()
|
||||
@@ -138,6 +141,8 @@ class TestZVMDriver(test.NoDBTestCase):
|
||||
self.assertEqual(0, results['disk_available_least'])
|
||||
self.assertEqual(0, results['hypervisor_version'])
|
||||
self.assertEqual('TESTHOST', results['hypervisor_hostname'])
|
||||
self.assertEqual(uptime_mock.return_value, results['stats']['uptime'])
|
||||
uptime_mock.assert_called_once()
|
||||
|
||||
def test_driver_template_validation(self):
|
||||
self.flags(instance_name_template='abc%6d')
|
||||
|
@@ -10376,6 +10376,7 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
else:
|
||||
data['numa_topology'] = None
|
||||
|
||||
data['stats'] = {'uptime': self.get_host_uptime()}
|
||||
return data
|
||||
|
||||
def check_instance_shared_storage_local(self, context, instance):
|
||||
|
@@ -132,6 +132,7 @@ class ZVMDriver(driver.ComputeDriver):
|
||||
obj_fields.HVType.ZVM,
|
||||
obj_fields.VMMode.HVM)],
|
||||
'numa_topology': None,
|
||||
'stats': {'uptime': self.get_host_uptime()}
|
||||
}
|
||||
|
||||
LOG.debug("Getting available resource for %(host)s:%(nodename)s",
|
||||
|
@@ -0,0 +1,23 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixed performance issue with the ``/os-hypervisors/detail`` API endpoint
|
||||
when using microversion 2.88 or higher. The API was making sequential RPC
|
||||
calls to each compute node to gather uptime information, causing significant
|
||||
delays in environments with many compute nodes (LP#2122036).
|
||||
|
||||
The fix optimizes uptime retrieval by:
|
||||
|
||||
* Adding uptime information to the periodic resource updates sent by
|
||||
nova-compute to the database, eliminating the need for synchronous RPC
|
||||
calls during API requests
|
||||
* Only attempting RPC-based uptime retrieval for hypervisor types that
|
||||
actually support it (libvirt and z/VM), avoiding unnecessary calls to
|
||||
other hypervisor types that would always return NotImplementedError
|
||||
* Preferring cached uptime data from the database over RPC calls when
|
||||
available, this updates at the cadence specified by
|
||||
``[DEFAULT]update_resources_interval``, which is the same interval the
|
||||
other hypervisor stats update.
|
||||
|
||||
This change significantly reduces response times for the hypervisor detail
|
||||
API in large deployments while maintaining backward compatibility.
|
Reference in New Issue
Block a user