From c87c8bf7b850973e8d59ff042723ef314949cb23 Mon Sep 17 00:00:00 2001 From: Edward Hope-Morley Date: Thu, 10 Jul 2025 18:05:30 +0100 Subject: [PATCH] Add Nova mdev initialisation workaround Earlier versions of Nova did not support re-initialisation of domain GPU mdevs such that a node reboot rendered vms unable to boot since their local uuid mismatched with the new host uuids. This patch adds a workaround that installs a systemd service to initialise all used mdevs to match the host and update the Placement API resource providers to match these allocations. Closes-Bug: #1977933 Change-Id: I902c18895679737c4a9dc20b98affdc98af33659 Signed-off-by: Edward Hope-Morley --- charmcraft.yaml | 2 +- files/initialise_nova_mdevs.sh | 6 + src/charm.py | 9 +- src/charm_utils.py | 28 +- templates/remediate_nova_mdevs.py | 348 ++++++++++++++++++++++ templates/systemd-mdev-workaround.service | 12 + tox.ini | 2 +- unit_tests/test_charm_utils.py | 24 +- 8 files changed, 426 insertions(+), 5 deletions(-) create mode 100644 files/initialise_nova_mdevs.sh create mode 100644 templates/remediate_nova_mdevs.py create mode 100644 templates/systemd-mdev-workaround.service diff --git a/charmcraft.yaml b/charmcraft.yaml index 7217725..0fbd75b 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -19,7 +19,7 @@ parts: apt install -y ca-certificates update-ca-certificates -base: ubuntu@24.04 +base: ubuntu@22.04 platforms: amd64: build-on: amd64 diff --git a/files/initialise_nova_mdevs.sh b/files/initialise_nova_mdevs.sh new file mode 100644 index 0000000..1f8cddf --- /dev/null +++ b/files/initialise_nova_mdevs.sh @@ -0,0 +1,6 @@ +#!/bin/bash -e +# Ensure all SRIOV devices have been set up +/usr/lib/nvidia/sriov-manage -e ALL +# Now go through all domains and initialise any used mdevs +/opt/remediate-nova-mdevs + diff --git a/src/charm.py b/src/charm.py index b3fde2a..5eeb545 100755 --- a/src/charm.py +++ b/src/charm.py @@ -24,6 +24,7 @@ from charm_utils import ( 
install_nvidia_software_if_needed, is_nvidia_software_to_be_installed, set_principal_unit_relation_data, + install_mdev_init_workaround, ) from nvidia_utils import list_vgpu_types @@ -41,6 +42,7 @@ class NovaComputeNvidiaVgpuCharm(ops_openstack.core.OSBaseCharm): super().register_status_check(self._check_status) self.framework.observe(self.on.config_changed, self._on_config_changed) + self.framework.observe(self.on.upgrade_charm, self._on_upgrade) self.framework.observe(self.on.start, self._on_start) self.framework.observe(self.on.nova_vgpu_relation_joined, self._on_nova_vgpu_relation_joined_or_changed) @@ -67,6 +69,11 @@ class NovaComputeNvidiaVgpuCharm(ops_openstack.core.OSBaseCharm): self.update_status() + def _on_upgrade(self, _): + """ upgrade-charm hook.""" + install_mdev_init_workaround(self.config) + self.update_status() + def _on_start(self, _): """start hook.""" # NOTE(lourot): We install software in the `start` hook instead of @@ -78,7 +85,7 @@ class NovaComputeNvidiaVgpuCharm(ops_openstack.core.OSBaseCharm): # NOTE(lourot): this is used by OSBaseCharm.update_status(): self._stored.is_started = True - + install_mdev_init_workaround(self.config) self.update_status() def _on_nova_vgpu_relation_joined_or_changed(self, event): diff --git a/src/charm_utils.py b/src/charm_utils.py index 1a6b4f6..f0cd481 100644 --- a/src/charm_utils.py +++ b/src/charm_utils.py @@ -17,6 +17,8 @@ import logging import json +import os +import shutil from ruamel.yaml import YAML @@ -26,7 +28,8 @@ from charmhelpers.contrib.openstack.utils import ( ows_check_services_running, ) from charmhelpers.core.hookenv import cached -from charmhelpers.core.host import file_hash +from charmhelpers.core.host import file_hash, service +from charmhelpers.core.templating import render from charmhelpers.fetch import apt_install from ops.model import ( @@ -272,3 +275,26 @@ def _releases_packages_map(): def _get_current_release(): return get_os_codename_package('nova-common', fatal=False) or 'queens' + 
+ +def install_mdev_init_workaround(config): + logging.info("Installing mdev initialisation workaround.") + shutil.copy('files/initialise_nova_mdevs.sh', + '/opt/initialise_nova_mdevs.sh') + os.chmod('/opt/initialise_nova_mdevs.sh', 0o755) + + vgpu_device_mappings = YAML().load(config.get('vgpu-device-mappings') or + "") + render( + 'remediate_nova_mdevs.py', + '/opt/remediate-nova-mdevs', + {'mdev_types': vgpu_device_mappings}, + perms=0o755) + + render( + 'systemd-mdev-workaround.service', + '/etc/systemd/system/systemd-mdev-workaround.service', + {}, + perms=0o644) + service('enable', 'systemd-mdev-workaround') + # enable but not start since this needs to be done once on boot diff --git a/templates/remediate_nova_mdevs.py b/templates/remediate_nova_mdevs.py new file mode 100644 index 0000000..004c0d3 --- /dev/null +++ b/templates/remediate_nova_mdevs.py @@ -0,0 +1,348 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import logging +import os +import socket +from functools import cached_property +from time import sleep +from xml.dom import minidom + +import libvirt +import nova.conf +from nova.utils import get_sdk_adapter +from nova.pci.utils import get_pci_address + +LOG = logging.getLogger(__name__) +CONF = nova.conf.CONF + +NOVA_CONF = '/etc/nova/nova.conf' +# Dictionary of mdev types and address mappings +MDEV_TYPES = {{ mdev_types }} # noqa pylint: disable=unhashable-member,undefined-variable + + +class PlacementError(Exception): + """ Raised when an error occurs in the PlacementHelper. """ + + +class RemediationFailedError(Exception): + """ Raised when an error occurs during mdev remediation. """ + + +class PlacementHelper(): + """ + Helper for Placement operations. + """ + DRIVER_TRAIT_MAPPING = {'nvidia-610': 'CUSTOM_VGPU_PLACEMENT'} + + def __init__(self): + self.fqdn = socket.getfqdn() + self.client = self._get_sdk_adapter_helper("placement") + if self.client is None: + raise PlacementError("failed to get placement client") + + @staticmethod + def _get_sdk_adapter_helper(service_type): + count = 1 + while True: + LOG.info("fetching %s sdk adapter (attempt=%s)", service_type, + count) + try: + return get_sdk_adapter(service_type) + except Exception as e: + count += 1 + if count > 30: + LOG.error(e) + return None + + LOG.warning("failed to get %s sdk adapter - trying again", + service_type) + sleep(0.5) + + @cached_property + def local_compute_rps(self): + LOG.info("fetching resources providers for host %s", self.fqdn) + rps = self.client.get("/resource_providers") + if rps.status_code != 200: + raise PlacementError(f"failed to get rps: {rps}") + + prefix = f'{self.fqdn}_pci_' + data = rps.json() + result = [] + for rp in data['resource_providers']: + if rp['name'].startswith(prefix): + result.append(rp) + + LOG.info("found %s resources providers for host %s", len(result), + self.fqdn) + return result + + @cached_property + def traits(self): + resp = 
self.client.get("/traits", microversion='1.6') + if resp.status_code != 200: + raise PlacementError(f"failed to get traits: {resp}") + + _traits = resp.json() + if not _traits: + raise PlacementError("no traits identified from the placement api") + + for trait in self.DRIVER_TRAIT_MAPPING.values(): + if trait not in _traits['traits']: + raise PlacementError(f"trait {trait} not found in placement " + "traits") + + return _traits + + def get_traits_for_rp(self, uuid): + resp = self.client.get(f"/resource_providers/{uuid}/traits", + microversion='1.6') + if resp.status_code != 200: + raise PlacementError(f"failed to traits for rp {uuid}: {resp}") + + return resp.json() + + def update_traits_on_rp(self, uuid, generation, new_traits): + data = { + 'resource_provider_generation': generation, + 'traits': new_traits, + } + resp = self.client.put(f"/resource_providers/{uuid}/traits", + json=data, microversion='1.6') + if resp.status_code != 200: + raise PlacementError(f"failed to update traits for rp {uuid}: " + f"{resp}") + + LOG.info("updated traits on RP %s to %s", uuid, new_traits) + + def get_vgpu_rp_name(self, uuid): + allocations = self.client.get(f"/allocations/{uuid}") + if allocations.status_code != 200: + raise PlacementError(f"failed to get allocation for uuid {uuid}: " + f"{allocations} " + f"(return_code={allocations.status_code})") + + data = allocations.json() + if not data.get('allocations'): + raise PlacementError(f"no allocations found for uuid {uuid}, does " + "instance exist?") + + for rp, rpdata in data['allocations'].items(): + if rpdata.get('resources') is None: + continue + + for r, v in rpdata['resources'].items(): + if r != 'VGPU' or v != 1: + continue + + return self.get_rp_name(rp) + + raise PlacementError(f"no resource provider found for domain {uuid}") + + def get_rp_name(self, uuid): + rp = self.client.get(f"/resource_providers/{uuid}") + if rp.status_code != 200: + raise PlacementError(f"failed to get rp for uuid {uuid} " + 
f"(return_code={rp.status_code})") + + rp_name = rp.json().get('name') + if not rp_name: + raise PlacementError("failed to find resource provider name for " + f"domain {uuid}") + + return rp_name + + @staticmethod + def get_pci_addr_from_rp_name(rpname): + addr = rpname.split('_pci_')[1] + pci_id_parts = addr.split('_') + return get_pci_address(*pci_id_parts) + + def update_gpu_traits(self, rpname, rpuuid, dry_run=False): + LOG.info("updating gpu traits for resource provider %s", rpuuid) + traits = self.get_traits_for_rp(rpuuid) + if traits is None: + LOG.info("no traits found for resource provider %s " + "- skipping update", rpuuid) + return + + pci_address = self.get_pci_addr_from_rp_name(rpname) + driver = find_driver_type_from_pci_address(pci_address) + if driver is None: + if len(traits['traits']) > 0: + LOG.warning("rp %s for %s has traits %s but should be empty", + rpuuid, pci_address, traits['traits']) + + return + + if driver not in self.DRIVER_TRAIT_MAPPING: + LOG.error("failed to map driver '%s' to a trait for PCI " + "address %s", driver, pci_address) + return + + expected_traits = [self.DRIVER_TRAIT_MAPPING[driver]] + if expected_traits != traits['traits']: + if dry_run: + LOG.warning("rp %s for %s is mapped to driver %s but " + "traits is %s not %s - skipping update since " + "dry_run is True", + rpuuid, pci_address, driver, + traits['traits'], expected_traits) + return + + self.update_traits_on_rp(rpuuid, + traits['resource_provider_generation'], + expected_traits) + + +class LibvirtHelper(): + """ + Helper for Libvirt operations. 
+ """ + + @cached_property + def domains(self): + try: + conn = libvirt.openReadOnly('qemu:///system') + return conn.listAllDomains(0) + finally: + conn.close() + + return None + + @staticmethod + def get_domain_hostdevs(domain): + raw_xml = domain.XMLDesc() + xml = minidom.parseString(raw_xml) + return xml.getElementsByTagName("hostdev") + + +def _mdev_exists(uuid): + path = os.path.join("/sys/bus/mdev/devices", uuid) + return os.path.exists(path) + + +def _create_mdev(pci_addr, driver_type, uuid, dry_run=False): + path = os.path.join('/sys/bus/pci/devices', pci_addr, + 'mdev_supported_types', driver_type, 'create') + LOG.info("creating mdev entry at path: %s", path) + if dry_run: + LOG.info("skipping since dry_run is True") + return + + try: + with open(path, 'w', encoding='utf-8') as f: + f.write(uuid) + except Exception as e: + raise RemediationFailedError(f"failed to create mdev {uuid} at " # noqa pylint: disable=raise-missing-from + f"{pci_addr} with type {driver_type}: " + f"{e}") + + LOG.info("created mdev %s at %s with type %s", uuid, pci_addr, driver_type) + + +def find_driver_type_from_pci_address(pci_addr): + for driver_type, addresses in MDEV_TYPES.items(): # noqa pylint: disable=no-member,undefined-variable + if pci_addr in addresses: + return driver_type + + return None + + +def _remediate_hostdev(pm, domain_uuid, hostdev, dry_run=False): + sources = hostdev.getElementsByTagName("source") + if len(sources) <= 0: + raise RemediationFailedError("found no source elements in hostdev: " + f"{hostdev.toxml()}") + + if len(sources) > 1: + raise RemediationFailedError(f"found more than one ({len(sources)}) " + "source elements in hostdev: " + f"{hostdev.toxml()}") + + source = sources[0] + addresses = source.getElementsByTagName("address") + if len(addresses) != 1: + raise RemediationFailedError(f"expected to find only one address " + f"(({len(addresses)})) in source for " + f"hostdev: {hostdev.toxml()}") + + mdev_uuid = addresses[0].getAttribute("uuid") + 
if _mdev_exists(mdev_uuid): + LOG.info("hostdev mdev device %s already exists - no action needed", + mdev_uuid) + return + + try: + rp_name = pm.get_vgpu_rp_name(domain_uuid) + except PlacementError as exc: + raise RemediationFailedError from exc + + if '_pci_' not in rp_name: + raise RemediationFailedError("failed to find _pci_ in provider name " + f"'{rp_name}'") + + pci_address = pm.get_pci_addr_from_rp_name(rp_name) + driver_type = find_driver_type_from_pci_address(pci_address) + if not driver_type: + raise RemediationFailedError("failed to find driver type for " + f"pci_address {pci_address}") + + _create_mdev(pci_address, driver_type, mdev_uuid, dry_run) + + +def main(dry_run=False): + logging.basicConfig(level=logging.INFO) + LOG.info("starting Nova mdev remediation (dry_run=%s)", dry_run) + LOG.info("loading Nova config from %s", NOVA_CONF) + CONF(default_config_files=[NOVA_CONF]) + + lm = LibvirtHelper() + pm = PlacementHelper() + + if len(lm.domains) == 0: + LOG.info("no domains found in libvirt - exiting") + return + + LOG.info("%s domains found in libvirt", len(lm.domains)) + failed = False + for domain in lm.domains: + uuid, name = domain.UUIDString(), domain.name() + hostdevs = lm.get_domain_hostdevs(domain) + if len(hostdevs) <= 0: + LOG.info("domain %s (%s) has no hostdevs - skipping", uuid, name) + continue + + LOG.info("domain %s (%s) has %d hostdev(s) - starting remediation", + uuid, name, len(hostdevs)) + for hostdev in hostdevs: + try: + _remediate_hostdev(pm, uuid, hostdev, dry_run) + except RemediationFailedError as exc: + LOG.error(exc) + failed = True + + if not pm.local_compute_rps: + return + + for rp in pm.local_compute_rps: + pm.update_gpu_traits(rp['name'], rp['uuid'], dry_run) + + if failed: + raise PlacementError("failed to update one or more placement traits") + + +if __name__ == '__main__': + main(os.environ.get('MDEV_INIT_DRY_RUN') == 'True') diff --git a/templates/systemd-mdev-workaround.service 
b/templates/systemd-mdev-workaround.service new file mode 100644 index 0000000..0486a35 --- /dev/null +++ b/templates/systemd-mdev-workaround.service @@ -0,0 +1,12 @@ +[Unit] +Description=GPU MDev Initialisation Workaround for OpenStack Nova +Before=nova-compute.service +After=syslog.target network.target libvirtd.service nvidia-vgpu-mgr.service + +[Service] +Environment="MDEV_INIT_DRY_RUN=False" +Type=oneshot +ExecStart=/bin/bash /opt/initialise_nova_mdevs.sh + +[Install] +WantedBy=multi-user.target diff --git a/tox.ini b/tox.ini index f63f074..558f4a2 100644 --- a/tox.ini +++ b/tox.ini @@ -105,7 +105,7 @@ basepython = python3 deps = -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt -commands = flake8 {posargs} src unit_tests tests +commands = flake8 {posargs} files templates/remediate_nova_mdevs.py src unit_tests tests [testenv:cover] # Technique based heavily upon diff --git a/unit_tests/test_charm_utils.py b/unit_tests/test_charm_utils.py index afa8f01..6353eb3 100644 --- a/unit_tests/test_charm_utils.py +++ b/unit_tests/test_charm_utils.py @@ -15,7 +15,7 @@ import sys import unittest -from mock import ANY, MagicMock, patch +from mock import ANY, MagicMock, patch, call sys.path.append('src') # noqa @@ -250,3 +250,25 @@ class TestCharmUtils(unittest.TestCase): with self.assertRaises(charm_utils.UnsupportedOpenStackRelease): release_codename_mock.return_value = 'pike' charm_utils._nova_conf_sections(vgpu_device_mappings) + + @patch.object(charm_utils, 'service') + @patch.object(charm_utils, 'render') + @patch.object(charm_utils.os, 'chmod') + @patch.object(charm_utils.shutil, 'copy') + def test_install_mdev_init_workaround(self, mock_copy, mock_chmod, + mock_render, mock_service): + charm_config = { + 'vgpu-device-mappings': "{'nvidia-35': ['0000:84:00.0']}" + } + charm_utils.install_mdev_init_workaround(charm_config) + mock_copy.assert_has_calls([call('files/initialise_nova_mdevs.sh', + '/opt/initialise_nova_mdevs.sh')]) + 
mock_render.assert_has_calls([ + call('remediate_nova_mdevs.py', + '/opt/remediate-nova-mdevs', + {'mdev_types': { + 'nvidia-35': ['0000:84:00.0']}}, + perms=493), + call('systemd-mdev-workaround.service', + '/etc/systemd/system/systemd-mdev-workaround.service', {}, + perms=420)])