hypervisors: Optimize uptime retrieval for better performance

The /os-hypervisors/detail API endpoint was experiencing significant
performance issues in environments with many compute nodes when using
microversion 2.88 or higher, as it made sequential RPC calls to gather
uptime information from each compute node.

This change optimizes uptime retrieval by:

* Adding uptime to periodic resource updates sent by nova-compute to the
  database, eliminating synchronous RPC calls during API requests
* Restricting RPC-based uptime retrieval to hypervisor types that support
  it (libvirt and z/VM), avoiding unnecessary calls that would always fail
* Preferring cached database uptime data over RPC calls when available

Closes-Bug: #2122036
Assisted-By: Claude <noreply@anthropic.com>
Change-Id: I5723320f578192f7e0beead7d5df5d7e47d54d2b
Co-Authored-By: Sylvain Bauza <sbauza@redhat.com>
Signed-off-by: Sean Mooney <work@seanmooney.info>
This commit is contained in:
Sean Mooney
2025-09-04 21:42:04 +01:00
parent 9f156aa954
commit 567dbe1867
10 changed files with 107 additions and 37 deletions

View File

@@ -3902,8 +3902,11 @@ hypervisor_type_body:
type: string
hypervisor_uptime:
description: |
The total uptime of the hypervisor and information about average load. Only
reported for active hosts where the virt driver supports this feature.
The response format of this API depends on the virt driver in use on a
given host. The libvirt driver returns the output of the `uptime` command
directly; the z/VM driver returns the `IPL` time. All other drivers
always return `null`. Note that this value is cached and updated periodically.
in: body
required: true
type: string

View File

@@ -96,18 +96,24 @@ class HypervisorsController(wsgi.Controller):
# The 2.88 microversion also *added* the 'uptime' field to the response
if detail and api_version_request.is_supported(req, '2.88'):
try:
hyp_dict['uptime'] = self.host_api.get_host_uptime(
req.environ['nova.context'], hypervisor.host)
except (
NotImplementedError,
exception.ComputeServiceUnavailable,
exception.HostMappingNotFound,
exception.HostNotFound,
):
# Not all virt drivers support this, and it's not generally
# possible to get uptime for a down host
hyp_dict['uptime'] = None
uptime = None
if "stats" in hypervisor and "uptime" in hypervisor.stats:
uptime = hypervisor.stats.get("uptime")
else:
try:
uptime = self.host_api.get_host_uptime(
req.environ['nova.context'], hypervisor.host)
except (
NotImplementedError, # only raised in tests
exception.ComputeServiceUnavailable,
exception.HostMappingNotFound,
exception.HostNotFound,
):
# Only libvirt and ZVM drivers support this, and it's
# not generally possible to get uptime for a down host
pass
hyp_dict['uptime'] = uptime
if servers:
hyp_dict['servers'] = [

View File

@@ -1173,7 +1173,8 @@ class ResourceTracker(object):
"used_disk=%(used_disk)sGB "
"total_vcpus=%(total_vcpus)s "
"used_vcpus=%(used_vcpus)s "
"pci_stats=%(pci_stats)s",
"pci_stats=%(pci_stats)s "
"stats=%(stats)s",
{'node': nodename,
'phys_ram': cn.memory_mb,
'used_ram': cn.memory_mb_used,
@@ -1181,7 +1182,9 @@ class ResourceTracker(object):
'used_disk': cn.local_gb_used,
'total_vcpus': tcpu,
'used_vcpus': ucpu,
'pci_stats': pci_stats})
'pci_stats': pci_stats,
'stats': cn.stats or {}
})
def _resource_change(self, compute_node):
"""Check to see if any resources have changed."""

View File

@@ -37,6 +37,12 @@ class Stats(dict):
if stats is None:
return
if isinstance(stats, dict):
# use None as a sentinel to the API that
# the driver does not support uptime
# setdefault will update the dict if and only if
# uptime is not set, and then returns the value.
# since we don't need it we just discard the result
stats.setdefault('uptime', None)
self.update(stats)
return
raise ValueError(_('Unexpected type adding stats'))

View File

@@ -47,7 +47,7 @@ TEST_HYPERS = [
vcpus_used=2,
memory_mb_used=5 * 1024,
local_gb_used=125,
hypervisor_type="xen",
hypervisor_type="qemu",
hypervisor_version=3,
hypervisor_hostname="hyper1",
free_ram_mb=5 * 1024,
@@ -67,7 +67,7 @@ TEST_HYPERS = [
vcpus_used=2,
memory_mb_used=5 * 1024,
local_gb_used=125,
hypervisor_type="xen",
hypervisor_type="qemu",
hypervisor_version=3,
hypervisor_hostname="hyper2",
free_ram_mb=5 * 1024,
@@ -76,7 +76,8 @@ TEST_HYPERS = [
running_vms=2,
cpu_info=CPU_INFO,
disk_available_least=100,
host_ip=netaddr.IPAddress('2.2.2.2'))]
host_ip=netaddr.IPAddress('2.2.2.2'),
stats={'uptime': 'fake uptime'})]
TEST_SERVICES = [
@@ -203,6 +204,11 @@ class HypervisorsTestV21(test.NoDBTestCase):
del DETAIL_HYPERS_DICTS[1]['host']
del DETAIL_HYPERS_DICTS[0]['uuid']
del DETAIL_HYPERS_DICTS[1]['uuid']
# Remove stats since it's not exposed in the API response, but preserve
# uptime for v2.88+ tests which expect it
for hyper_dict in DETAIL_HYPERS_DICTS:
if 'stats' in hyper_dict:
del hyper_dict['stats']
DETAIL_HYPERS_DICTS[0].update({'state': 'up',
'status': 'enabled',
'service': dict(id=1, host='compute1',
@@ -850,7 +856,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
'free_ram_mb': 5120,
'host_ip': netaddr.IPAddress('2.2.2.2'),
'hypervisor_hostname': 'hyper2',
'hypervisor_type': 'xen',
'hypervisor_type': 'qemu',
'hypervisor_version': 3,
'id': 2,
'local_gb': 250,
@@ -904,7 +910,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
'free_ram_mb': 5120,
'host_ip': netaddr.IPAddress('2.2.2.2'),
'hypervisor_hostname': 'hyper2',
'hypervisor_type': 'xen',
'hypervisor_type': 'qemu',
'hypervisor_version': 3,
'id': 2,
'local_gb': 250,
@@ -951,7 +957,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
'free_ram_mb': 5120,
'host_ip': netaddr.IPAddress('2.2.2.2'),
'hypervisor_hostname': 'hyper2',
'hypervisor_type': 'xen',
'hypervisor_type': 'qemu',
'hypervisor_version': 3,
'id': 2,
'local_gb': 250,
@@ -1448,6 +1454,21 @@ class HypervisorsTestV288(HypervisorsTestV275):
# cpu_info is no longer included in the response, so skip this test
pass
def test_show_with_uptime_provided_by_compute_node(self):
req = self._get_request(use_admin_context=True)
result = self.controller.show(req, self.TEST_HYPERS_OBJ[1].uuid)
expected_dict = copy.deepcopy(self.DETAIL_HYPERS_DICTS[1])
self.assertEqual({'hypervisor': expected_dict}, result)
self.controller.host_api.get_host_uptime.assert_not_called()
def test_detail_list_uptime(self):
_ = self._test_servers_with_no_servers(self.controller.detail)
# we have simulated that compute 2 is upgraded to store the uptime
# in the stats so we expect 1 call to get the result via RPC
# for compute1
self.controller.host_api.get_host_uptime.assert_called_with(
mock.ANY, "compute1")
def test_uptime(self):
req = self._get_request(True)
self.assertRaises(

View File

@@ -22996,18 +22996,18 @@ class HostStateTestCase(test.NoDBTestCase):
drvr = HostStateTestCase.FakeConnection()
stats = drvr.get_available_resource("compute1")
self.assertEqual(stats["vcpus"], 1)
self.assertEqual(stats["memory_mb"], 497)
self.assertEqual(stats["local_gb"], 100)
self.assertEqual(stats["vcpus_used"], 0)
self.assertEqual(stats["memory_mb_used"], 88)
self.assertEqual(stats["local_gb_used"], 20)
self.assertEqual(stats["hypervisor_type"], 'QEMU')
self.assertEqual(stats["hypervisor_version"],
res = drvr.get_available_resource("compute1")
self.assertEqual(res["vcpus"], 1)
self.assertEqual(res["memory_mb"], 497)
self.assertEqual(res["local_gb"], 100)
self.assertEqual(res["vcpus_used"], 0)
self.assertEqual(res["memory_mb_used"], 88)
self.assertEqual(res["local_gb_used"], 20)
self.assertEqual(res["hypervisor_type"], 'QEMU')
self.assertEqual(res["hypervisor_version"],
fakelibvirt.FAKE_QEMU_VERSION)
self.assertEqual(stats["hypervisor_hostname"], 'compute1')
cpu_info = jsonutils.loads(stats["cpu_info"])
self.assertEqual(res["hypervisor_hostname"], 'compute1')
cpu_info = jsonutils.loads(res["cpu_info"])
self.assertEqual(cpu_info,
{"vendor": "Intel", "model": "pentium",
"arch": fields.Architecture.I686,
@@ -23017,12 +23017,13 @@ class HostStateTestCase(test.NoDBTestCase):
"topology": {"cores": "1", "threads": "1", "sockets": "1"},
"maxphysaddr": {"mode": "emulate", "bits": "42"}
})
self.assertEqual(stats["disk_available_least"], 80)
self.assertEqual(jsonutils.loads(stats["pci_passthrough_devices"]),
self.assertEqual(res["disk_available_least"], 80)
self.assertEqual(jsonutils.loads(res["pci_passthrough_devices"]),
HostStateTestCase.pci_devices)
self.assertEqual(objects.NUMATopology.obj_from_db_obj(
stats['numa_topology']),
res['numa_topology']),
HostStateTestCase.numa_topology)
self.assertEqual(res['stats']['uptime'], drvr.get_host_uptime())
class TestUpdateProviderTree(test.NoDBTestCase):

View File

@@ -128,8 +128,11 @@ class TestZVMDriver(test.NoDBTestCase):
self.assertRaises(exception.ZVMDriverException,
zvmdriver.ZVMDriver, 'virtapi')
@mock.patch(
'nova.virt.zvm.driver.ZVMDriver.get_host_uptime',
return_value='IPL at 11/14/17 10:47:44 EST')
@mock.patch('nova.virt.zvm.utils.ConnectorClient.call')
def test_get_available_resource_err_case(self, call):
def test_get_available_resource_err_case(self, call, uptime_mock):
res = {'overallRC': 1, 'errmsg': 'err', 'rc': 0, 'rs': 0}
call.side_effect = exception.ZVMConnectorError(results=res)
results = self._driver.get_available_resource()
@@ -138,6 +141,8 @@ class TestZVMDriver(test.NoDBTestCase):
self.assertEqual(0, results['disk_available_least'])
self.assertEqual(0, results['hypervisor_version'])
self.assertEqual('TESTHOST', results['hypervisor_hostname'])
self.assertEqual(uptime_mock.return_value, results['stats']['uptime'])
uptime_mock.assert_called_once()
def test_driver_template_validation(self):
self.flags(instance_name_template='abc%6d')

View File

@@ -10376,6 +10376,7 @@ class LibvirtDriver(driver.ComputeDriver):
else:
data['numa_topology'] = None
data['stats'] = {'uptime': self.get_host_uptime()}
return data
def check_instance_shared_storage_local(self, context, instance):

View File

@@ -132,6 +132,7 @@ class ZVMDriver(driver.ComputeDriver):
obj_fields.HVType.ZVM,
obj_fields.VMMode.HVM)],
'numa_topology': None,
'stats': {'uptime': self.get_host_uptime()}
}
LOG.debug("Getting available resource for %(host)s:%(nodename)s",

View File

@@ -0,0 +1,23 @@
---
fixes:
- |
Fixed performance issue with the ``/os-hypervisors/detail`` API endpoint
when using microversion 2.88 or higher. The API was making sequential RPC
calls to each compute node to gather uptime information, causing significant
delays in environments with many compute nodes (LP#2122036).
The fix optimizes uptime retrieval by:
* Adding uptime information to the periodic resource updates sent by
nova-compute to the database, eliminating the need for synchronous RPC
calls during API requests
* Only attempting RPC-based uptime retrieval for hypervisor types that
actually support it (libvirt and z/VM), avoiding unnecessary calls to
other hypervisor types that would always raise NotImplementedError
* Preferring cached uptime data from the database over RPC calls when
available. The cached value is updated at the cadence specified by
``[DEFAULT]update_resources_interval``, which is the same interval at
which the other hypervisor stats are updated.
This change significantly reduces response times for the hypervisor detail
API in large deployments while maintaining backward compatibility.