Install NVIDIA vGPU software
This commit is contained in:
@@ -13,3 +13,11 @@ options:
|
|||||||
and
|
and
|
||||||
https://docs.openstack.org/nova/ussuri/admin/virtual-gpu.html#how-to-discover-a-gpu-type
|
https://docs.openstack.org/nova/ussuri/admin/virtual-gpu.html#how-to-discover-a-gpu-type
|
||||||
for more details.
|
for more details.
|
||||||
|
force-install-nvidia-vgpu:
|
||||||
|
type: boolean
|
||||||
|
default: false
|
||||||
|
description: |
|
||||||
|
FOR TESTING ONLY. If true, the NVIDIA vGPU software will be installed and
|
||||||
|
set up on all units regardless of the presence of NVIDIA GPU hardware. If
|
||||||
|
false, the software will be installed and set up only on units where that
|
||||||
|
hardware is present.
|
||||||
|
@@ -8,9 +8,7 @@ description: |
|
|||||||
tags:
|
tags:
|
||||||
- openstack
|
- openstack
|
||||||
series:
|
series:
|
||||||
- bionic
|
|
||||||
- focal
|
- focal
|
||||||
- groovy
|
|
||||||
- hirsute
|
- hirsute
|
||||||
- impish
|
- impish
|
||||||
subordinate: true
|
subordinate: true
|
||||||
@@ -22,3 +20,11 @@ requires:
|
|||||||
juju-info:
|
juju-info:
|
||||||
interface: juju-info
|
interface: juju-info
|
||||||
scope: container
|
scope: container
|
||||||
|
resources:
|
||||||
|
nvidia-vgpu-software:
|
||||||
|
type: file
|
||||||
|
filename: nvidia-vgpu.deb
|
||||||
|
description: |
|
||||||
|
Proprietary NVIDIA vGPU host software (to be installed on compute nodes).
|
||||||
|
.
|
||||||
|
See https://docs.nvidia.com/grid/
|
||||||
|
@@ -1,4 +1,5 @@
|
|||||||
ops
|
ops
|
||||||
|
git+https://opendev.org/openstack/charm-ops-openstack#egg=ops_openstack
|
||||||
|
|
||||||
ruamel.yaml
|
ruamel.yaml
|
||||||
pylspci
|
pylspci
|
||||||
|
156
src/charm.py
156
src/charm.py
@@ -17,34 +17,50 @@
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from ops.charm import CharmBase
|
from charmhelpers.core.hookenv import cached
|
||||||
|
from charmhelpers.core.host import file_hash
|
||||||
|
from charmhelpers.fetch import (
|
||||||
|
apt_cache,
|
||||||
|
apt_install,
|
||||||
|
)
|
||||||
|
|
||||||
|
import ops_openstack.plugins.classes
|
||||||
|
|
||||||
from ops.main import main
|
from ops.main import main
|
||||||
from ops.model import ActiveStatus
|
from ops.model import (
|
||||||
|
ActiveStatus,
|
||||||
|
BlockedStatus,
|
||||||
|
ModelError,
|
||||||
|
)
|
||||||
|
|
||||||
from pylspci.parsers import SimpleParser
|
from pylspci.parsers import SimpleParser
|
||||||
from ruamel.yaml import YAML
|
from ruamel.yaml import YAML
|
||||||
|
|
||||||
|
|
||||||
class NovaComputeNvidiaVgpuCharm(CharmBase):
|
class NovaComputeNvidiaVgpuCharm(ops_openstack.core.OSBaseCharm):
|
||||||
|
|
||||||
|
# NOTE(lourot): as of today (2021-11-25), OSBaseCharm doesn't make use of
|
||||||
|
# this dict's keys (config files) but only uses its values (service names):
|
||||||
|
RESTART_MAP = {
|
||||||
|
'/usr/share/nvidia/vgpu/vgpuConfig.xml': ['nvidia-vgpu-mgr'],
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self.framework.observe(self.on.install, self._on_install)
|
super().register_status_check(self.__check_status)
|
||||||
self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm)
|
|
||||||
self.framework.observe(self.on.config_changed, self._on_config_changed)
|
self.framework.observe(self.on.config_changed, self._on_config_changed)
|
||||||
|
self.framework.observe(self.on.start, self._on_start)
|
||||||
|
|
||||||
def _on_install(self, _):
|
# hash of the last successfully installed NVIDIA vGPU software passed
|
||||||
"""install hook."""
|
# as resource to the charm:
|
||||||
self.__set_ready_unit_status()
|
self._stored.set_default(last_installed_resource_hash=None)
|
||||||
|
|
||||||
def _on_upgrade_charm(self, _):
|
|
||||||
"""upgrade-charm hook."""
|
|
||||||
self.__set_ready_unit_status()
|
|
||||||
|
|
||||||
def _on_config_changed(self, _):
|
def _on_config_changed(self, _):
|
||||||
"""config-changed hook."""
|
"""config-changed hook."""
|
||||||
if not self._has_nvidia_gpu_hardware():
|
# NOTE(lourot): We want to re-install the software here if a new
|
||||||
return
|
# version has just been provided as a charm resource.
|
||||||
|
self.__install_nvidia_software_if_needed()
|
||||||
|
|
||||||
vgpu_device_mappings_str = self.config.get('vgpu-device-mappings')
|
vgpu_device_mappings_str = self.config.get('vgpu-device-mappings')
|
||||||
if vgpu_device_mappings_str is not None:
|
if vgpu_device_mappings_str is not None:
|
||||||
@@ -52,15 +68,113 @@ class NovaComputeNvidiaVgpuCharm(CharmBase):
|
|||||||
logging.debug('vgpu-device-mappings={}'.format(
|
logging.debug('vgpu-device-mappings={}'.format(
|
||||||
vgpu_device_mappings))
|
vgpu_device_mappings))
|
||||||
|
|
||||||
def __set_ready_unit_status(self):
|
self.update_status()
|
||||||
"""Set the unit status to active/ready."""
|
|
||||||
unit_status_msg = (
|
def _on_start(self, _):
|
||||||
'Unit is ready: '
|
"""start hook."""
|
||||||
+ ('no ' if not self._has_nvidia_gpu_hardware() else '')
|
# NOTE(lourot): We install software in the `start` hook instead of
|
||||||
+ 'NVIDIA GPU found')
|
# the `install` hook because we want to be able to install software
|
||||||
self.unit.status = ActiveStatus(unit_status_msg)
|
# after a reboot if NVIDIA hardware has then been added for the
|
||||||
|
# first time.
|
||||||
|
self.__install_nvidia_software_if_needed()
|
||||||
|
|
||||||
|
# NOTE(lourot): this is used by OSBaseCharm.update_status():
|
||||||
|
self._stored.is_started = True
|
||||||
|
|
||||||
|
self.update_status()
|
||||||
|
|
||||||
|
def services(self):
|
||||||
|
# If no NVIDIA software is expected to be installed on this particular
|
||||||
|
# unit, then no service should be expected to run by
|
||||||
|
# OSBaseCharm.update_status(). Otherwise the services from the
|
||||||
|
# RESTART_MAP are expected to run.
|
||||||
|
if not self.__is_nvidia_software_to_be_installed():
|
||||||
|
return []
|
||||||
|
return super().services()
|
||||||
|
|
||||||
|
def __check_status(self):
|
||||||
|
"""Determine the unit status to be set.
|
||||||
|
|
||||||
|
:rtype: StatusBase
|
||||||
|
"""
|
||||||
|
unit_status_msg = ('no ' if not self._has_nvidia_gpu_hardware()
|
||||||
|
else '') + 'NVIDIA GPU found; '
|
||||||
|
|
||||||
|
installed_versions = self.__installed_nvidia_software_versions()
|
||||||
|
if len(installed_versions) > 0:
|
||||||
|
unit_status_msg += 'installed NVIDIA software: '
|
||||||
|
unit_status_msg += ', '.join(installed_versions)
|
||||||
|
else:
|
||||||
|
unit_status_msg += 'no NVIDIA software installed'
|
||||||
|
|
||||||
|
if self.__is_nvidia_software_to_be_installed() and len(
|
||||||
|
installed_versions) == 0:
|
||||||
|
return BlockedStatus(unit_status_msg)
|
||||||
|
|
||||||
|
return ActiveStatus('Unit is ready: ' + unit_status_msg)
|
||||||
|
|
||||||
|
def __install_nvidia_software_if_needed(self):
|
||||||
|
"""Install the NVIDIA software on this unit if relevant."""
|
||||||
|
if self.__is_nvidia_software_to_be_installed():
|
||||||
|
nvidia_software_path, nvidia_software_hash = (
|
||||||
|
self.__path_and_hash_nvidia_resource())
|
||||||
|
|
||||||
|
if nvidia_software_path is None:
|
||||||
|
# No software has been provided as charm resource. We can't
|
||||||
|
# install anything. OSBaseCharm.update_status() will be
|
||||||
|
# executed later and put the unit in blocked state.
|
||||||
|
return
|
||||||
|
|
||||||
|
last_installed_hash = self._stored.last_installed_resource_hash
|
||||||
|
if nvidia_software_hash == last_installed_hash:
|
||||||
|
logging.info(
|
||||||
|
'NVIDIA vGPU software with hash {} already installed, '
|
||||||
|
'skipping'.format(nvidia_software_hash))
|
||||||
|
return
|
||||||
|
|
||||||
|
logging.info(
|
||||||
|
'Installing NVIDIA vGPU software with hash {}'.format(
|
||||||
|
nvidia_software_hash))
|
||||||
|
apt_install([nvidia_software_path], fatal=True)
|
||||||
|
self._stored.last_installed_resource_hash = nvidia_software_hash
|
||||||
|
|
||||||
|
@cached
|
||||||
|
def __is_nvidia_software_to_be_installed(self):
|
||||||
|
"""Determine whether the NVIDIA vGPU software is to be installed.
|
||||||
|
|
||||||
|
:returns: True if the software is to be installed and set up on the
|
||||||
|
unit.
|
||||||
|
:rtype: bool
|
||||||
|
"""
|
||||||
|
return (self._has_nvidia_gpu_hardware() or
|
||||||
|
self.config.get('force-install-nvidia-vgpu'))
|
||||||
|
|
||||||
|
def __path_and_hash_nvidia_resource(self):
|
||||||
|
"""Get path to and hash of software provided as charm resource.
|
||||||
|
|
||||||
|
:returns: Pair of path and hash. (None, None) if no charm resource has
|
||||||
|
been provided.
|
||||||
|
:rtype: Tuple[Optional[PosixPath], Optional[str]]
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
nvidia_vgpu_software_path = (
|
||||||
|
self.framework.model.resources.fetch('nvidia-vgpu-software'))
|
||||||
|
except ModelError:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
return nvidia_vgpu_software_path, file_hash(nvidia_vgpu_software_path)
|
||||||
|
|
||||||
|
def __installed_nvidia_software_versions(self):
|
||||||
|
"""Get a list of installed NVIDIA vGPU software versions.
|
||||||
|
|
||||||
|
:returns: List of versions
|
||||||
|
:rtype: List[str]
|
||||||
|
"""
|
||||||
|
return [package['version'] for package in
|
||||||
|
apt_cache().dpkg_list(['nvidia-vgpu-ubuntu-*']).values()]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@cached
|
||||||
def _has_nvidia_gpu_hardware():
|
def _has_nvidia_gpu_hardware():
|
||||||
"""Search for NVIDIA GPU hardware.
|
"""Search for NVIDIA GPU hardware.
|
||||||
|
|
||||||
|
@@ -25,11 +25,11 @@ class TestNovaComputeNvidiaVgpuCharm(unittest.TestCase):
|
|||||||
self.addCleanup(self.harness.cleanup)
|
self.addCleanup(self.harness.cleanup)
|
||||||
self.harness.begin()
|
self.harness.begin()
|
||||||
|
|
||||||
def test_install(self):
|
def test_start(self):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
self.harness.framework.model.app.name,
|
self.harness.framework.model.app.name,
|
||||||
'nova-compute-nvidia-vgpu')
|
'nova-compute-nvidia-vgpu')
|
||||||
# Test that charm is active upon installation.
|
# Test that charm is active once the start hook has run.
|
||||||
self.harness.charm.on.install.emit()
|
self.harness.charm.on.start.emit()
|
||||||
self.assertTrue(isinstance(
|
self.assertTrue(isinstance(
|
||||||
self.harness.model.unit.status, ActiveStatus))
|
self.harness.model.unit.status, ActiveStatus))
|
||||||
|
Reference in New Issue
Block a user