From 567dbe1867602d544945b3584c3885ac146b6535 Mon Sep 17 00:00:00 2001 From: Sean Mooney Date: Thu, 4 Sep 2025 21:42:04 +0100 Subject: [PATCH] hypervisors: Optimize uptime retrieval for better performance The /os-hypervisors/detail API endpoint was experiencing significant performance issues in environments with many compute nodes when using microversion 2.88 or higher, as it made sequential RPC calls to gather uptime information from each compute node. This change optimizes uptime retrieval by: * Adding uptime to periodic resource updates sent by nova-compute to the database, eliminating synchronous RPC calls during API requests * Restricting RPC-based uptime retrieval to hypervisor types that support it (libvirt and z/VM), avoiding unnecessary calls that would always fail * Preferring cached database uptime data over RPC calls when available Closes-Bug: #2122036 Assisted-By: Claude Change-Id: I5723320f578192f7e0beead7d5df5d7e47d54d2b Co-Authored-By: Sylvain Bauza Signed-off-by: Sean Mooney --- api-ref/source/parameters.yaml | 7 ++-- nova/api/openstack/compute/hypervisors.py | 30 ++++++++++------- nova/compute/resource_tracker.py | 7 ++-- nova/compute/stats.py | 6 ++++ .../api/openstack/compute/test_hypervisors.py | 33 +++++++++++++++---- nova/tests/unit/virt/libvirt/test_driver.py | 29 ++++++++-------- nova/tests/unit/virt/zvm/test_driver.py | 7 +++- nova/virt/libvirt/driver.py | 1 + nova/virt/zvm/driver.py | 1 + ...ormance-optimization-6f3a2c8e5d9b1a4e.yaml | 23 +++++++++++++ 10 files changed, 107 insertions(+), 37 deletions(-) create mode 100644 releasenotes/notes/bug-2122036-hypervisor-uptime-performance-optimization-6f3a2c8e5d9b1a4e.yaml diff --git a/api-ref/source/parameters.yaml b/api-ref/source/parameters.yaml index c1cf611fa7e9..283742ef57d8 100644 --- a/api-ref/source/parameters.yaml +++ b/api-ref/source/parameters.yaml @@ -3902,8 +3902,11 @@ hypervisor_type_body: type: string hypervisor_uptime: description: | - The total uptime of the hypervisor and 
information about average load. Only - reported for active hosts where the virt driver supports this feature. + The response format of this api depends on the virt driver in use on a + given host. The libvirt driver returns the output of the `uptime` command + directly, the z/VM driver returns the `ILP` time. All other drivers + always return `null`. Note this value is cached and updated periodically. + in: body required: true type: string diff --git a/nova/api/openstack/compute/hypervisors.py b/nova/api/openstack/compute/hypervisors.py index 40ad32deabc3..1e6d6bbed8df 100644 --- a/nova/api/openstack/compute/hypervisors.py +++ b/nova/api/openstack/compute/hypervisors.py @@ -96,18 +96,24 @@ class HypervisorsController(wsgi.Controller): # The 2.88 microversion also *added* the 'uptime' field to the response if detail and api_version_request.is_supported(req, '2.88'): - try: - hyp_dict['uptime'] = self.host_api.get_host_uptime( - req.environ['nova.context'], hypervisor.host) - except ( - NotImplementedError, - exception.ComputeServiceUnavailable, - exception.HostMappingNotFound, - exception.HostNotFound, - ): - # Not all virt drivers support this, and it's not generally - # possible to get uptime for a down host - hyp_dict['uptime'] = None + uptime = None + if "stats" in hypervisor and "uptime" in hypervisor.stats: + uptime = hypervisor.stats.get("uptime") + else: + try: + uptime = self.host_api.get_host_uptime( + req.environ['nova.context'], hypervisor.host) + except ( + NotImplementedError, # only raised in tests + exception.ComputeServiceUnavailable, + exception.HostMappingNotFound, + exception.HostNotFound, + ): + # Only libvirt and ZVM drivers support this, and it's + # not generally possible to get uptime for a down host + pass + + hyp_dict['uptime'] = uptime if servers: hyp_dict['servers'] = [ diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index 1e96035cd2f7..f4fcf4da180f 100644 --- a/nova/compute/resource_tracker.py +++ 
b/nova/compute/resource_tracker.py @@ -1173,7 +1173,8 @@ class ResourceTracker(object): "used_disk=%(used_disk)sGB " "total_vcpus=%(total_vcpus)s " "used_vcpus=%(used_vcpus)s " - "pci_stats=%(pci_stats)s", + "pci_stats=%(pci_stats)s " + "stats=%(stats)s", {'node': nodename, 'phys_ram': cn.memory_mb, 'used_ram': cn.memory_mb_used, @@ -1181,7 +1182,9 @@ class ResourceTracker(object): 'used_disk': cn.local_gb_used, 'total_vcpus': tcpu, 'used_vcpus': ucpu, - 'pci_stats': pci_stats}) + 'pci_stats': pci_stats, + 'stats': cn.stats or {} + }) def _resource_change(self, compute_node): """Check to see if any resources have changed.""" diff --git a/nova/compute/stats.py b/nova/compute/stats.py index e9180ec6d6d8..7dab9cc8f875 100644 --- a/nova/compute/stats.py +++ b/nova/compute/stats.py @@ -37,6 +37,12 @@ class Stats(dict): if stats is None: return if isinstance(stats, dict): + # use None as a sentinel to the API that + # the driver does not support uptime + # setdefault will update the dict if and only if + # uptime is not set then return the value. 
+ # since we dont need it we just discard the result + stats.setdefault('uptime', None) self.update(stats) return raise ValueError(_('Unexpected type adding stats')) diff --git a/nova/tests/unit/api/openstack/compute/test_hypervisors.py b/nova/tests/unit/api/openstack/compute/test_hypervisors.py index 6f6f96b39d71..e8a9609c0c3b 100644 --- a/nova/tests/unit/api/openstack/compute/test_hypervisors.py +++ b/nova/tests/unit/api/openstack/compute/test_hypervisors.py @@ -47,7 +47,7 @@ TEST_HYPERS = [ vcpus_used=2, memory_mb_used=5 * 1024, local_gb_used=125, - hypervisor_type="xen", + hypervisor_type="qemu", hypervisor_version=3, hypervisor_hostname="hyper1", free_ram_mb=5 * 1024, @@ -67,7 +67,7 @@ TEST_HYPERS = [ vcpus_used=2, memory_mb_used=5 * 1024, local_gb_used=125, - hypervisor_type="xen", + hypervisor_type="qemu", hypervisor_version=3, hypervisor_hostname="hyper2", free_ram_mb=5 * 1024, @@ -76,7 +76,8 @@ TEST_HYPERS = [ running_vms=2, cpu_info=CPU_INFO, disk_available_least=100, - host_ip=netaddr.IPAddress('2.2.2.2'))] + host_ip=netaddr.IPAddress('2.2.2.2'), + stats={'uptime': 'fake uptime'})] TEST_SERVICES = [ @@ -203,6 +204,11 @@ class HypervisorsTestV21(test.NoDBTestCase): del DETAIL_HYPERS_DICTS[1]['host'] del DETAIL_HYPERS_DICTS[0]['uuid'] del DETAIL_HYPERS_DICTS[1]['uuid'] + # Remove stats since it's not exposed in the API response, but preserve + # uptime for v2.88+ tests which expect it + for hyper_dict in DETAIL_HYPERS_DICTS: + if 'stats' in hyper_dict: + del hyper_dict['stats'] DETAIL_HYPERS_DICTS[0].update({'state': 'up', 'status': 'enabled', 'service': dict(id=1, host='compute1', @@ -850,7 +856,7 @@ class HypervisorsTestV233(HypervisorsTestV228): 'free_ram_mb': 5120, 'host_ip': netaddr.IPAddress('2.2.2.2'), 'hypervisor_hostname': 'hyper2', - 'hypervisor_type': 'xen', + 'hypervisor_type': 'qemu', 'hypervisor_version': 3, 'id': 2, 'local_gb': 250, @@ -904,7 +910,7 @@ class HypervisorsTestV233(HypervisorsTestV228): 'free_ram_mb': 5120, 'host_ip': 
netaddr.IPAddress('2.2.2.2'), 'hypervisor_hostname': 'hyper2', - 'hypervisor_type': 'xen', + 'hypervisor_type': 'qemu', 'hypervisor_version': 3, 'id': 2, 'local_gb': 250, @@ -951,7 +957,7 @@ class HypervisorsTestV233(HypervisorsTestV228): 'free_ram_mb': 5120, 'host_ip': netaddr.IPAddress('2.2.2.2'), 'hypervisor_hostname': 'hyper2', - 'hypervisor_type': 'xen', + 'hypervisor_type': 'qemu', 'hypervisor_version': 3, 'id': 2, 'local_gb': 250, @@ -1448,6 +1454,21 @@ class HypervisorsTestV288(HypervisorsTestV275): # cpu_info is no longer included in the response, so skip this test pass + def test_show_with_uptime_provided_by_compute_node(self): + req = self._get_request(use_admin_context=True) + result = self.controller.show(req, self.TEST_HYPERS_OBJ[1].uuid) + expected_dict = copy.deepcopy(self.DETAIL_HYPERS_DICTS[1]) + self.assertEqual({'hypervisor': expected_dict}, result) + self.controller.host_api.get_host_uptime.assert_not_called() + + def test_detail_list_uptime(self): + _ = self._test_servers_with_no_servers(self.controller.detail) + # we have simulated that compute 2 is upgraded to store the uptime + # in the stats so we expect 1 call to get the result via RPC + # for compute1 + self.controller.host_api.get_host_uptime.assert_called_with( + mock.ANY, "compute1") + def test_uptime(self): req = self._get_request(True) self.assertRaises( diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index ec5fe68b6450..9110d34b88ad 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -22996,18 +22996,18 @@ class HostStateTestCase(test.NoDBTestCase): drvr = HostStateTestCase.FakeConnection() - stats = drvr.get_available_resource("compute1") - self.assertEqual(stats["vcpus"], 1) - self.assertEqual(stats["memory_mb"], 497) - self.assertEqual(stats["local_gb"], 100) - self.assertEqual(stats["vcpus_used"], 0) - self.assertEqual(stats["memory_mb_used"], 88) - 
self.assertEqual(stats["local_gb_used"], 20) - self.assertEqual(stats["hypervisor_type"], 'QEMU') - self.assertEqual(stats["hypervisor_version"], + res = drvr.get_available_resource("compute1") + self.assertEqual(res["vcpus"], 1) + self.assertEqual(res["memory_mb"], 497) + self.assertEqual(res["local_gb"], 100) + self.assertEqual(res["vcpus_used"], 0) + self.assertEqual(res["memory_mb_used"], 88) + self.assertEqual(res["local_gb_used"], 20) + self.assertEqual(res["hypervisor_type"], 'QEMU') + self.assertEqual(res["hypervisor_version"], fakelibvirt.FAKE_QEMU_VERSION) - self.assertEqual(stats["hypervisor_hostname"], 'compute1') - cpu_info = jsonutils.loads(stats["cpu_info"]) + self.assertEqual(res["hypervisor_hostname"], 'compute1') + cpu_info = jsonutils.loads(res["cpu_info"]) self.assertEqual(cpu_info, {"vendor": "Intel", "model": "pentium", "arch": fields.Architecture.I686, @@ -23017,12 +23017,13 @@ class HostStateTestCase(test.NoDBTestCase): "topology": {"cores": "1", "threads": "1", "sockets": "1"}, "maxphysaddr": {"mode": "emulate", "bits": "42"} }) - self.assertEqual(stats["disk_available_least"], 80) - self.assertEqual(jsonutils.loads(stats["pci_passthrough_devices"]), + self.assertEqual(res["disk_available_least"], 80) + self.assertEqual(jsonutils.loads(res["pci_passthrough_devices"]), HostStateTestCase.pci_devices) self.assertEqual(objects.NUMATopology.obj_from_db_obj( - stats['numa_topology']), + res['numa_topology']), HostStateTestCase.numa_topology) + self.assertEqual(res['stats']['uptime'], drvr.get_host_uptime()) class TestUpdateProviderTree(test.NoDBTestCase): diff --git a/nova/tests/unit/virt/zvm/test_driver.py b/nova/tests/unit/virt/zvm/test_driver.py index a5a129331d93..66088e455ab3 100644 --- a/nova/tests/unit/virt/zvm/test_driver.py +++ b/nova/tests/unit/virt/zvm/test_driver.py @@ -128,8 +128,11 @@ class TestZVMDriver(test.NoDBTestCase): self.assertRaises(exception.ZVMDriverException, zvmdriver.ZVMDriver, 'virtapi') + @mock.patch( + 
'nova.virt.zvm.driver.ZVMDriver.get_host_uptime', + return_value='IPL at 11/14/17 10:47:44 EST') @mock.patch('nova.virt.zvm.utils.ConnectorClient.call') - def test_get_available_resource_err_case(self, call): + def test_get_available_resource_err_case(self, call, uptime_mock): res = {'overallRC': 1, 'errmsg': 'err', 'rc': 0, 'rs': 0} call.side_effect = exception.ZVMConnectorError(results=res) results = self._driver.get_available_resource() @@ -138,6 +141,8 @@ class TestZVMDriver(test.NoDBTestCase): self.assertEqual(0, results['disk_available_least']) self.assertEqual(0, results['hypervisor_version']) self.assertEqual('TESTHOST', results['hypervisor_hostname']) + self.assertEqual(uptime_mock.return_value, results['stats']['uptime']) + uptime_mock.assert_called_once() def test_driver_template_validation(self): self.flags(instance_name_template='abc%6d') diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index ca9c3168d771..8159afc49d62 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -10376,6 +10376,7 @@ class LibvirtDriver(driver.ComputeDriver): else: data['numa_topology'] = None + data['stats'] = {'uptime': self.get_host_uptime()} return data def check_instance_shared_storage_local(self, context, instance): diff --git a/nova/virt/zvm/driver.py b/nova/virt/zvm/driver.py index 4803c18ef84e..ada358026826 100644 --- a/nova/virt/zvm/driver.py +++ b/nova/virt/zvm/driver.py @@ -132,6 +132,7 @@ class ZVMDriver(driver.ComputeDriver): obj_fields.HVType.ZVM, obj_fields.VMMode.HVM)], 'numa_topology': None, + 'stats': {'uptime': self.get_host_uptime()} } LOG.debug("Getting available resource for %(host)s:%(nodename)s", diff --git a/releasenotes/notes/bug-2122036-hypervisor-uptime-performance-optimization-6f3a2c8e5d9b1a4e.yaml b/releasenotes/notes/bug-2122036-hypervisor-uptime-performance-optimization-6f3a2c8e5d9b1a4e.yaml new file mode 100644 index 000000000000..7d4fcfe5b57e --- /dev/null +++ 
b/releasenotes/notes/bug-2122036-hypervisor-uptime-performance-optimization-6f3a2c8e5d9b1a4e.yaml @@ -0,0 +1,23 @@ +--- +fixes: + - | + Fixed performance issue with the ``/os-hypervisors/detail`` API endpoint + when using microversion 2.88 or higher. The API was making sequential RPC + calls to each compute node to gather uptime information, causing significant + delays in environments with many compute nodes (LP#2122036). + + The fix optimizes uptime retrieval by: + + * Adding uptime information to the periodic resource updates sent by + nova-compute to the database, eliminating the need for synchronous RPC + calls during API requests + * Only attempting RPC-based uptime retrieval for hypervisor types that + actually support it (libvirt and z/VM), avoiding unnecessary calls to + other hypervisor types that would always raise ``NotImplementedError`` + * Preferring cached uptime data from the database over RPC calls when + available; this updates at the cadence specified by + ``[DEFAULT]update_resources_interval``, which is the same interval at + which the other hypervisor stats update. + + This change significantly reduces response times for the hypervisor detail + API in large deployments while maintaining backward compatibility.