hypervisors: Optimize uptime retrieval for better performance

The /os-hypervisors/detail API endpoint was experiencing significant performance issues in environments with many compute nodes when using microversion 2.88 or higher, as it made sequential RPC calls to gather uptime information from each compute node. This change optimizes uptime retrieval by: * Adding uptime to periodic resource updates sent by nova-compute to the database, eliminating synchronous RPC calls during API requests * Restricting RPC-based uptime retrieval to hypervisor types that support it (libvirt and z/VM), avoiding unnecessary calls that would always fail * Preferring cached database uptime data over RPC calls when available Closes-Bug: #2122036 Assisted-By: Claude <noreply@anthropic.com> Change-Id: I5723320f578192f7e0beead7d5df5d7e47d54d2b Co-Authored-By: Sylvain Bauza <sbauza@redhat.com> Signed-off-by: Sean Mooney <work@seanmooney.info>
2025-09-04 21:42:04 +01:00
parent 9f156aa954
commit 567dbe1867
10 changed files with 107 additions and 37 deletions
--- a/api-ref/source/parameters.yaml
+++ b/api-ref/source/parameters.yaml
@@ -3902,8 +3902,11 @@ hypervisor_type_body:
  type: string
 hypervisor_uptime:
  description: |
-    The total uptime of the hypervisor and information about average load. Only
-    reported for active hosts where the virt driver supports this feature.
+    The format of this field depends on the virt driver in use on a given
+    host. The libvirt driver returns the output of the `uptime` command
+    directly; the z/VM driver returns the `IPL` time. All other drivers
+    always return `null`. Note this value is cached and updated periodically.
+
  in: body
  required: true
  type: string
--- a/nova/api/openstack/compute/hypervisors.py
+++ b/nova/api/openstack/compute/hypervisors.py
@@ -96,18 +96,24 @@ class HypervisorsController(wsgi.Controller):

        # The 2.88 microversion also *added* the 'uptime' field to the response
        if detail and api_version_request.is_supported(req, '2.88'):
-            try:
-                hyp_dict['uptime'] = self.host_api.get_host_uptime(
-                    req.environ['nova.context'], hypervisor.host)
-            except (
-                NotImplementedError,
-                exception.ComputeServiceUnavailable,
-                exception.HostMappingNotFound,
-                exception.HostNotFound,
-            ):
-                # Not all virt drivers support this, and it's not generally
-                # possible to get uptime for a down host
-                hyp_dict['uptime'] = None
+            uptime = None
+            if "stats" in hypervisor and "uptime" in hypervisor.stats:
+                uptime = hypervisor.stats.get("uptime")
+            else:
+                try:
+                    uptime = self.host_api.get_host_uptime(
+                        req.environ['nova.context'], hypervisor.host)
+                except (
+                        NotImplementedError,  # only raised in tests
+                        exception.ComputeServiceUnavailable,
+                        exception.HostMappingNotFound,
+                        exception.HostNotFound,
+                ):
+                    # Only libvirt and ZVM drivers support this, and it's
+                    # not generally possible to get uptime for a down host
+                    pass
+
+            hyp_dict['uptime'] = uptime

        if servers:
            hyp_dict['servers'] = [
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -1173,7 +1173,8 @@ class ResourceTracker(object):
                  "used_disk=%(used_disk)sGB "
                  "total_vcpus=%(total_vcpus)s "
                  "used_vcpus=%(used_vcpus)s "
-                  "pci_stats=%(pci_stats)s",
+                  "pci_stats=%(pci_stats)s "
+                  "stats=%(stats)s",
                  {'node': nodename,
                   'phys_ram': cn.memory_mb,
                   'used_ram': cn.memory_mb_used,
@@ -1181,7 +1182,9 @@ class ResourceTracker(object):
                   'used_disk': cn.local_gb_used,
                   'total_vcpus': tcpu,
                   'used_vcpus': ucpu,
-                   'pci_stats': pci_stats})
+                   'pci_stats': pci_stats,
+                   'stats': cn.stats or {}
+                   })

    def _resource_change(self, compute_node):
        """Check to see if any resources have changed."""
--- a/nova/compute/stats.py
+++ b/nova/compute/stats.py
@@ -37,6 +37,12 @@ class Stats(dict):
        if stats is None:
            return
        if isinstance(stats, dict):
+            # Use None as a sentinel telling the API that the driver
+            # does not support uptime. setdefault() only inserts the
+            # key when 'uptime' is not already set, and then returns
+            # the stored value; we don't need that return value, so
+            # it is simply discarded.
+            stats.setdefault('uptime', None)
            self.update(stats)
            return
        raise ValueError(_('Unexpected type adding stats'))
--- a/nova/tests/unit/api/openstack/compute/test_hypervisors.py
+++ b/nova/tests/unit/api/openstack/compute/test_hypervisors.py
@@ -47,7 +47,7 @@ TEST_HYPERS = [
         vcpus_used=2,
         memory_mb_used=5 * 1024,
         local_gb_used=125,
-         hypervisor_type="xen",
+         hypervisor_type="qemu",
         hypervisor_version=3,
         hypervisor_hostname="hyper1",
         free_ram_mb=5 * 1024,
@@ -67,7 +67,7 @@ TEST_HYPERS = [
         vcpus_used=2,
         memory_mb_used=5 * 1024,
         local_gb_used=125,
-         hypervisor_type="xen",
+         hypervisor_type="qemu",
         hypervisor_version=3,
         hypervisor_hostname="hyper2",
         free_ram_mb=5 * 1024,
@@ -76,7 +76,8 @@ TEST_HYPERS = [
         running_vms=2,
         cpu_info=CPU_INFO,
         disk_available_least=100,
-         host_ip=netaddr.IPAddress('2.2.2.2'))]
+         host_ip=netaddr.IPAddress('2.2.2.2'),
+         stats={'uptime': 'fake uptime'})]


 TEST_SERVICES = [
@@ -203,6 +204,11 @@ class HypervisorsTestV21(test.NoDBTestCase):
    del DETAIL_HYPERS_DICTS[1]['host']
    del DETAIL_HYPERS_DICTS[0]['uuid']
    del DETAIL_HYPERS_DICTS[1]['uuid']
+    # Remove stats since it's not exposed in the API response, but preserve
+    # uptime for v2.88+ tests which expect it
+    for hyper_dict in DETAIL_HYPERS_DICTS:
+        if 'stats' in hyper_dict:
+            del hyper_dict['stats']
    DETAIL_HYPERS_DICTS[0].update({'state': 'up',
                           'status': 'enabled',
                           'service': dict(id=1, host='compute1',
@@ -850,7 +856,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
                'free_ram_mb': 5120,
                'host_ip': netaddr.IPAddress('2.2.2.2'),
                'hypervisor_hostname': 'hyper2',
-                'hypervisor_type': 'xen',
+                'hypervisor_type': 'qemu',
                'hypervisor_version': 3,
                'id': 2,
                'local_gb': 250,
@@ -904,7 +910,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
                'free_ram_mb': 5120,
                'host_ip': netaddr.IPAddress('2.2.2.2'),
                'hypervisor_hostname': 'hyper2',
-                'hypervisor_type': 'xen',
+                'hypervisor_type': 'qemu',
                'hypervisor_version': 3,
                'id': 2,
                'local_gb': 250,
@@ -951,7 +957,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
                'free_ram_mb': 5120,
                'host_ip': netaddr.IPAddress('2.2.2.2'),
                'hypervisor_hostname': 'hyper2',
-                'hypervisor_type': 'xen',
+                'hypervisor_type': 'qemu',
                'hypervisor_version': 3,
                'id': 2,
                'local_gb': 250,
@@ -1448,6 +1454,21 @@ class HypervisorsTestV288(HypervisorsTestV275):
        # cpu_info is no longer included in the response, so skip this test
        pass

+    def test_show_with_uptime_provided_by_compute_node(self):
+        req = self._get_request(use_admin_context=True)
+        result = self.controller.show(req, self.TEST_HYPERS_OBJ[1].uuid)
+        expected_dict = copy.deepcopy(self.DETAIL_HYPERS_DICTS[1])
+        self.assertEqual({'hypervisor': expected_dict}, result)
+        self.controller.host_api.get_host_uptime.assert_not_called()
+
+    def test_detail_list_uptime(self):
+        _ = self._test_servers_with_no_servers(self.controller.detail)
+        # We have simulated that compute2 is upgraded to store the uptime
+        # in its stats, so we expect 1 call to get the result via RPC
+        # for compute1.
+        self.controller.host_api.get_host_uptime.assert_called_with(
+            mock.ANY, "compute1")
+
    def test_uptime(self):
        req = self._get_request(True)
        self.assertRaises(
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -22996,18 +22996,18 @@ class HostStateTestCase(test.NoDBTestCase):

        drvr = HostStateTestCase.FakeConnection()

-        stats = drvr.get_available_resource("compute1")
-        self.assertEqual(stats["vcpus"], 1)
-        self.assertEqual(stats["memory_mb"], 497)
-        self.assertEqual(stats["local_gb"], 100)
-        self.assertEqual(stats["vcpus_used"], 0)
-        self.assertEqual(stats["memory_mb_used"], 88)
-        self.assertEqual(stats["local_gb_used"], 20)
-        self.assertEqual(stats["hypervisor_type"], 'QEMU')
-        self.assertEqual(stats["hypervisor_version"],
+        res = drvr.get_available_resource("compute1")
+        self.assertEqual(res["vcpus"], 1)
+        self.assertEqual(res["memory_mb"], 497)
+        self.assertEqual(res["local_gb"], 100)
+        self.assertEqual(res["vcpus_used"], 0)
+        self.assertEqual(res["memory_mb_used"], 88)
+        self.assertEqual(res["local_gb_used"], 20)
+        self.assertEqual(res["hypervisor_type"], 'QEMU')
+        self.assertEqual(res["hypervisor_version"],
                         fakelibvirt.FAKE_QEMU_VERSION)
-        self.assertEqual(stats["hypervisor_hostname"], 'compute1')
-        cpu_info = jsonutils.loads(stats["cpu_info"])
+        self.assertEqual(res["hypervisor_hostname"], 'compute1')
+        cpu_info = jsonutils.loads(res["cpu_info"])
        self.assertEqual(cpu_info,
                {"vendor": "Intel", "model": "pentium",
                 "arch": fields.Architecture.I686,
@@ -23017,12 +23017,13 @@ class HostStateTestCase(test.NoDBTestCase):
                 "topology": {"cores": "1", "threads": "1", "sockets": "1"},
                 "maxphysaddr": {"mode": "emulate", "bits": "42"}
                })
-        self.assertEqual(stats["disk_available_least"], 80)
-        self.assertEqual(jsonutils.loads(stats["pci_passthrough_devices"]),
+        self.assertEqual(res["disk_available_least"], 80)
+        self.assertEqual(jsonutils.loads(res["pci_passthrough_devices"]),
                         HostStateTestCase.pci_devices)
        self.assertEqual(objects.NUMATopology.obj_from_db_obj(
-                            stats['numa_topology']),
+                            res['numa_topology']),
                         HostStateTestCase.numa_topology)
+        self.assertEqual(res['stats']['uptime'], drvr.get_host_uptime())


 class TestUpdateProviderTree(test.NoDBTestCase):
--- a/nova/tests/unit/virt/zvm/test_driver.py
+++ b/nova/tests/unit/virt/zvm/test_driver.py
@@ -128,8 +128,11 @@ class TestZVMDriver(test.NoDBTestCase):
        self.assertRaises(exception.ZVMDriverException,
                          zvmdriver.ZVMDriver, 'virtapi')

+    @mock.patch(
+        'nova.virt.zvm.driver.ZVMDriver.get_host_uptime',
+        return_value='IPL at 11/14/17 10:47:44 EST')
    @mock.patch('nova.virt.zvm.utils.ConnectorClient.call')
-    def test_get_available_resource_err_case(self, call):
+    def test_get_available_resource_err_case(self, call, uptime_mock):
        res = {'overallRC': 1, 'errmsg': 'err', 'rc': 0, 'rs': 0}
        call.side_effect = exception.ZVMConnectorError(results=res)
        results = self._driver.get_available_resource()
@@ -138,6 +141,8 @@ class TestZVMDriver(test.NoDBTestCase):
        self.assertEqual(0, results['disk_available_least'])
        self.assertEqual(0, results['hypervisor_version'])
        self.assertEqual('TESTHOST', results['hypervisor_hostname'])
+        self.assertEqual(uptime_mock.return_value, results['stats']['uptime'])
+        uptime_mock.assert_called_once()

    def test_driver_template_validation(self):
        self.flags(instance_name_template='abc%6d')
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -10376,6 +10376,7 @@ class LibvirtDriver(driver.ComputeDriver):
        else:
            data['numa_topology'] = None

+        data['stats'] = {'uptime': self.get_host_uptime()}
        return data

    def check_instance_shared_storage_local(self, context, instance):
--- a/nova/virt/zvm/driver.py
+++ b/nova/virt/zvm/driver.py
@@ -132,6 +132,7 @@ class ZVMDriver(driver.ComputeDriver):
                                     obj_fields.HVType.ZVM,
                                     obj_fields.VMMode.HVM)],
            'numa_topology': None,
+            'stats': {'uptime': self.get_host_uptime()}
        }

        LOG.debug("Getting available resource for %(host)s:%(nodename)s",
--- a/releasenotes/notes/bug-2122036-hypervisor-uptime-performance-optimization-6f3a2c8e5d9b1a4e.yaml
+++ b/releasenotes/notes/bug-2122036-hypervisor-uptime-performance-optimization-6f3a2c8e5d9b1a4e.yaml
@@ -0,0 +1,23 @@
+---
+fixes:
+  - |
+    Fixed performance issue with the ``/os-hypervisors/detail`` API endpoint
+    when using microversion 2.88 or higher. The API was making sequential RPC
+    calls to each compute node to gather uptime information, causing significant
+    delays in environments with many compute nodes (LP#2122036).
+
+    The fix optimizes uptime retrieval by:
+
+    * Adding uptime information to the periodic resource updates sent by
+      nova-compute to the database, eliminating the need for synchronous RPC
+      calls during API requests
+    * Only attempting RPC-based uptime retrieval for hypervisor types that
+      actually support it (libvirt and z/VM), avoiding unnecessary calls to
+      other hypervisor types that would always return NotImplementedError
+    * Preferring cached uptime data from the database over RPC calls when
+      available. The cached value is refreshed at the cadence specified by
+      ``[DEFAULT]update_resources_interval``, which is the same interval at
+      which the other hypervisor stats are updated.
+
+    This change significantly reduces response times for the hypervisor detail
+    API in large deployments while maintaining backward compatibility.