diff --git a/doc/source/user/flavors.rst b/doc/source/user/flavors.rst
index 7fcbe1e745ec..c252e30a16de 100644
--- a/doc/source/user/flavors.rst
+++ b/doc/source/user/flavors.rst
@@ -495,7 +495,7 @@ PCI NUMA Affinity Policy
   PCI passthrough devices and neutron SR-IOV interfaces via the
   ``hw:pci_numa_affinity_policy`` flavor extra spec or
   ``hw_pci_numa_affinity_policy`` image property. The allowed values are
-  ``required``,``preferred`` or ``legacy`` (default).
+  ``required``, ``socket``, ``preferred`` or ``legacy`` (default).
 
   **required**
     This value will mean that nova will boot instances with PCI devices
@@ -504,6 +504,25 @@
     devices could not be determined, those PCI devices wouldn't be consumable
     by the instance. This provides maximum performance.
 
+  **socket**
+    This means that the PCI device must be affined to the same host socket as
+    at least one of the guest NUMA nodes. For example, consider a system with
+    two sockets, each with two NUMA nodes, numbered node 0 and node 1 on
+    socket 0, and node 2 and node 3 on socket 1. There is a PCI device
+    affined to node 0. An instance with two guest NUMA nodes and the
+    ``socket`` policy can be affined to either:
+
+    * node 0 and node 1
+    * node 0 and node 2
+    * node 0 and node 3
+    * node 1 and node 2
+    * node 1 and node 3
+
+    The instance cannot be affined to node 2 and node 3, as neither of those
+    is on the same socket as the PCI device. If the other nodes are consumed
+    by other instances and only nodes 2 and 3 are available, the instance
+    will not boot.
+
   **preferred**
     This value will mean that ``nova-scheduler`` will choose a compute host
     with minimal consideration for the NUMA affinity of PCI devices.
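The placement list in the documentation above reduces to a single rule: at
least one guest NUMA node must share a socket with the PCI device. A minimal
standalone sketch of that rule, using the same two-socket, four-node layout
(illustrative only, not Nova code; the node-to-socket map and the helper name
are invented for this example):

    # Host layout from the doc example: nodes 0-1 on socket 0, nodes 2-3 on
    # socket 1; the PCI device is affined to node 0.
    NODE_TO_SOCKET = {0: 0, 1: 0, 2: 1, 3: 1}
    PCI_DEVICE_NODE = 0


    def satisfies_socket_policy(guest_nodes):
        """True if any guest NUMA node shares a socket with the device."""
        device_socket = NODE_TO_SOCKET[PCI_DEVICE_NODE]
        return any(NODE_TO_SOCKET[n] == device_socket for n in guest_nodes)


    # Matches the documentation: every placement except (2, 3) is allowed.
    for placement in [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]:
        print(placement, satisfies_socket_policy(placement))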
diff --git a/nova/pci/stats.py b/nova/pci/stats.py
index cb36cbadc178..937c130dc8be 100644
--- a/nova/pci/stats.py
+++ b/nova/pci/stats.py
@@ -298,6 +298,15 @@ class PciDeviceStats(object):
                 pool['count'] for pool in filtered_pools) >= requested_count:
             return filtered_pools
 
+        # the SOCKET policy is a bit of a special case. It's less strict than
+        # REQUIRED (so REQUIRED will automatically fulfil SOCKET, at least
+        # with our assumption of never having multiple sockets per NUMA node),
+        # but not always more strict than LEGACY: a PCI device with no NUMA
+        # affinity will fulfil LEGACY but not SOCKET. If we have SOCKET,
+        # process it here and don't continue.
+        if requested_policy == fields.PCINUMAAffinityPolicy.SOCKET:
+            return self._filter_pools_for_socket_affinity(pools, numa_cells)
+
         # some systems don't report NUMA node info for PCI devices, in which
         # case None is reported in 'pci_device.numa_node'. The LEGACY policy
         # allows us to use these devices so we include None in the list of
@@ -323,6 +332,39 @@ class PciDeviceStats(object):
         return sorted(
             pools, key=lambda pool: pool.get('numa_node') not in numa_cell_ids)
 
+    def _filter_pools_for_socket_affinity(self, pools, numa_cells):
+        host_cells = self.numa_topology.cells
+        # bail early if we don't have socket information for all host_cells.
+        # This could happen if we're running on a weird older system with
+        # multiple sockets per NUMA node, which is a configuration that we
+        # explicitly chose not to support.
+        if any(cell.socket is None for cell in host_cells):
+            LOG.debug('No socket information in host NUMA cell(s).')
+            return []
+
+        # get a set of host sockets that the guest cells are in. Since guest
+        # cell IDs map to host cell IDs, we can just look up the latter's
+        # socket.
+        socket_ids = set()
+        for guest_cell in numa_cells:
+            for host_cell in host_cells:
+                if guest_cell.id == host_cell.id:
+                    socket_ids.add(host_cell.socket)
+
+        # now get a set of host NUMA nodes that are in the above sockets
+        allowed_numa_nodes = set()
+        for host_cell in host_cells:
+            if host_cell.socket in socket_ids:
+                allowed_numa_nodes.add(host_cell.id)
+
+        # filter out pools that are not in one of the correct host NUMA nodes
+        return [
+            pool for pool in pools if any(
+                utils.pci_device_prop_match(pool, [{'numa_node': numa_node}])
+                for numa_node in allowed_numa_nodes
+            )
+        ]
+
     def _filter_pools_for_unrequested_pfs(self, pools, request):
         """Filter out pools with PFs, unless these are required.
 
@@ -383,8 +425,8 @@ class PciDeviceStats(object):
             return None
 
         # Next, let's exclude all devices that aren't on the correct NUMA node
-        # *assuming* we have devices and care about that, as determined by
-        # policy
+        # or socket, *assuming* we have devices and care about that, as
+        # determined by policy
         before_count = after_count
         pools = self._filter_pools_for_numa_cells(pools, request, numa_cells)
         after_count = sum([pool['count'] for pool in pools])
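For reference, the guest-cells to sockets to allowed-host-NUMA-nodes
derivation performed by _filter_pools_for_socket_affinity above can be traced
with plain data structures. This is a simplified, standalone rendering of the
same steps (the dict-based cells and pools are stand-ins for the Nova
objects, not the real API):

    # Two sockets with two NUMA nodes each; the guest occupies host cell 0.
    host_cells = [
        {'id': 0, 'socket': 0},
        {'id': 1, 'socket': 0},
        {'id': 2, 'socket': 1},
        {'id': 3, 'socket': 1},
    ]
    guest_cell_ids = {0}  # guest NUMA cell IDs map to host cell IDs

    # step 1: sockets that the guest cells are on
    socket_ids = {
        cell['socket'] for cell in host_cells if cell['id'] in guest_cell_ids}
    # step 2: every host NUMA node on one of those sockets is acceptable
    allowed_numa_nodes = {
        cell['id'] for cell in host_cells if cell['socket'] in socket_ids}

    # step 3: keep only pools whose device NUMA node is acceptable
    pools = [{'numa_node': 1, 'count': 1}, {'numa_node': 3, 'count': 1}]
    filtered = [p for p in pools if p['numa_node'] in allowed_numa_nodes]

    print(allowed_numa_nodes)  # {0, 1}
    print(filtered)            # only the pool on NUMA node 1 remains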
diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
index 7301868ef759..883532249e23 100644
--- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py
+++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
@@ -23,6 +23,7 @@ import mock
 from oslo_config import cfg
 from oslo_log import log as logging
 from oslo_serialization import jsonutils
+from oslo_utils import units
 
 import nova
 from nova import context
@@ -1027,3 +1028,72 @@ class PCIServersWithSRIOVAffinityPoliciesTest(_PCIServersTestBase):
             group='pci')
 
         self._test_policy(pci_numa_node, status, 'required')
+
+    def test_socket_policy_pass(self):
+        # With 1 socket containing 2 NUMA nodes, make the first node's CPU
+        # available for pinning, but affine the PCI device to the second node.
+        # This should pass.
+        host_info = fakelibvirt.HostInfo(
+            cpu_nodes=2, cpu_sockets=1, cpu_cores=2, cpu_threads=2,
+            kB_mem=(16 * units.Gi) // units.Ki)
+        self.flags(cpu_dedicated_set='0-3', group='compute')
+        pci_info = fakelibvirt.HostPCIDevicesInfo(num_pci=1, numa_node=1)
+
+        self.start_compute(host_info=host_info, pci_info=pci_info)
+
+        extra_spec = {
+            'hw:cpu_policy': 'dedicated',
+            'pci_passthrough:alias': '%s:1' % self.ALIAS_NAME,
+            'hw:pci_numa_affinity_policy': 'socket'
+        }
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+        self._create_server(flavor_id=flavor_id)
+        self.assertTrue(self.mock_filter.called)
+
+    def test_socket_policy_fail(self):
+        # With 2 sockets containing 1 NUMA node each, make the first socket's
+        # CPUs available for pinning, but affine the PCI device to the second
+        # NUMA node in the second socket. This should fail.
+        host_info = fakelibvirt.HostInfo(
+            cpu_nodes=1, cpu_sockets=2, cpu_cores=2, cpu_threads=2,
+            kB_mem=(16 * units.Gi) // units.Ki)
+        self.flags(cpu_dedicated_set='0-3', group='compute')
+        pci_info = fakelibvirt.HostPCIDevicesInfo(num_pci=1, numa_node=1)
+        self.start_compute(host_info=host_info, pci_info=pci_info)
+
+        extra_spec = {
+            'hw:cpu_policy': 'dedicated',
+            'pci_passthrough:alias': '%s:1' % self.ALIAS_NAME,
+            'hw:pci_numa_affinity_policy': 'socket'
+        }
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+        server = self._create_server(
+            flavor_id=flavor_id, expected_state='ERROR')
+        self.assertIn('fault', server)
+        self.assertIn('No valid host', server['fault']['message'])
+
+    def test_socket_policy_multi_numa_pass(self):
+        # 2 sockets, 2 NUMA nodes each, with the PCI device on NUMA 0 and
+        # socket 0. If we restrict cpu_dedicated_set to NUMA 1, 2 and 3, we
+        # should still be able to boot an instance with hw:numa_nodes=3 and
+        # the `socket` policy, because one of the instance's NUMA nodes will
+        # be on the same socket as the PCI device (even if there is no
+        # direct NUMA node affinity).
+        host_info = fakelibvirt.HostInfo(
+            cpu_nodes=2, cpu_sockets=2, cpu_cores=2, cpu_threads=1,
+            kB_mem=(16 * units.Gi) // units.Ki)
+        self.flags(cpu_dedicated_set='2-7', group='compute')
+        pci_info = fakelibvirt.HostPCIDevicesInfo(num_pci=1, numa_node=0)
+
+        self.start_compute(host_info=host_info, pci_info=pci_info)
+
+        extra_spec = {
+            'hw:numa_nodes': '3',
+            'hw:cpu_policy': 'dedicated',
+            'pci_passthrough:alias': '%s:1' % self.ALIAS_NAME,
+            'hw:pci_numa_affinity_policy': 'socket'
+        }
+        flavor_id = self._create_flavor(vcpu=6, memory_mb=3144,
+                                        extra_spec=extra_spec)
+        self._create_server(flavor_id=flavor_id)
+        self.assertTrue(self.mock_filter.called)
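To make the multi-NUMA scenario above easier to follow, here is a quick
standalone check of why test_socket_policy_multi_numa_pass is expected to
succeed (illustrative only; the topology data is hand-written to mirror the
fake host used in that test):

    # Host: sockets 0 and 1 with two NUMA nodes each; the PCI device sits on
    # node 0, but cpu_dedicated_set only covers the CPUs of nodes 1-3.
    node_to_socket = {0: 0, 1: 0, 2: 1, 3: 1}
    pci_device_node = 0
    guest_nodes = {1, 2, 3}  # the three guest NUMA nodes land on host nodes 1-3

    guest_sockets = {node_to_socket[n] for n in guest_nodes}
    device_socket = node_to_socket[pci_device_node]

    # Node 1 shares socket 0 with the device, so the 'socket' policy is
    # satisfied even though no guest node is on node 0 itself.
    print(device_socket in guest_sockets)  # True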
diff --git a/nova/tests/unit/pci/test_stats.py b/nova/tests/unit/pci/test_stats.py
index 4dae4f0ce1c7..c87a171a505a 100644
--- a/nova/tests/unit/pci/test_stats.py
+++ b/nova/tests/unit/pci/test_stats.py
@@ -97,7 +97,16 @@ class PciDeviceStatsTestCase(test.NoDBTestCase):
 
     def setUp(self):
         super(PciDeviceStatsTestCase, self).setUp()
-        self.pci_stats = stats.PciDeviceStats(objects.NUMATopology())
+        self._setup_pci_stats()
+
+    def _setup_pci_stats(self, numa_topology=None):
+        """Exists for tests that need to setup pci_stats with a specific NUMA
+        topology, while still allowing tests that don't care to get the
+        default "empty" one.
+        """
+        if not numa_topology:
+            numa_topology = objects.NUMATopology()
+        self.pci_stats = stats.PciDeviceStats(numa_topology)
         # The following two calls need to be made before adding the devices.
         patcher = fakes.fake_pci_whitelist()
         self.addCleanup(patcher.stop)
@@ -229,6 +238,25 @@ class PciDeviceStatsTestCase(test.NoDBTestCase):
         self.assertFalse(self.pci_stats.support_requests(pci_requests,
                                                          cells))
 
+    def test_filter_pools_for_socket_affinity_no_socket(self):
+        self._setup_pci_stats(
+            objects.NUMATopology(
+                cells=[objects.NUMACell(socket=None)]))
+        self.assertEqual(
+            [],
+            self.pci_stats._filter_pools_for_socket_affinity(
+                self.pci_stats.pools, [objects.InstanceNUMACell()]))
+
+    def test_filter_pools_for_socket_affinity(self):
+        self._setup_pci_stats(
+            objects.NUMATopology(
+                cells=[objects.NUMACell(id=1, socket=1)]))
+        pools = self.pci_stats._filter_pools_for_socket_affinity(
+            self.pci_stats.pools, [objects.InstanceNUMACell(id=1)])
+        self.assertEqual(1, len(pools))
+        self.assertEqual('p2', pools[0]['product_id'])
+        self.assertEqual('v2', pools[0]['vendor_id'])
+
     def test_consume_requests(self):
         devs = self.pci_stats.consume_requests(pci_requests)
         self.assertEqual(2, len(devs))
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index 961637ec05b4..49914fc0bbb2 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -1210,6 +1210,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
             'COMPUTE_NET_VIF_MODEL_VIRTIO': True,
             'COMPUTE_SECURITY_TPM_1_2': False,
             'COMPUTE_SECURITY_TPM_2_0': False,
+            'COMPUTE_SOCKET_PCI_NUMA_AFFINITY': True,
         }
 
         static_traits = drvr.static_traits
@@ -1255,6 +1256,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
             'COMPUTE_NET_VIF_MODEL_VIRTIO': True,
             'COMPUTE_SECURITY_TPM_1_2': False,
             'COMPUTE_SECURITY_TPM_2_0': False,
+            'COMPUTE_SOCKET_PCI_NUMA_AFFINITY': True,
         }
 
         static_traits = drvr.static_traits
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index d48d0ecbad93..0f1739405142 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -8105,6 +8105,7 @@ class LibvirtDriver(driver.ComputeDriver):
         traits.update(self._get_video_model_traits())
         traits.update(self._get_vif_model_traits())
         traits.update(self._get_tpm_traits())
+        traits.update({ot.COMPUTE_SOCKET_PCI_NUMA_AFFINITY: True})
 
         _, invalid_traits = ot.check_traits(traits)
         for invalid_trait in invalid_traits:
diff --git a/releasenotes/notes/socket-pci-numa-affinity-policy-70b95b57b9f8f0c4.yaml b/releasenotes/notes/socket-pci-numa-affinity-policy-70b95b57b9f8f0c4.yaml
new file mode 100644
index 000000000000..16c15a0f3a8c
--- /dev/null
+++ b/releasenotes/notes/socket-pci-numa-affinity-policy-70b95b57b9f8f0c4.yaml
@@ -0,0 +1,11 @@
+---
+features:
+  - |
+    A new PCI NUMA affinity policy is available. The
+    ``hw:pci_numa_affinity_policy`` flavor extra spec and
+    ``hw_pci_numa_affinity_policy`` image metadata property now accept a
+    ``socket`` policy value. This value indicates that the PCI device must be
+    affined to the same host socket as at least one of the guest NUMA nodes.
+    For more information, see the `PCI Passthrough`__ guide.
+
+    .. __: https://docs.openstack.org/nova/latest/admin/pci-passthrough.html
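Since the libvirt driver change above reports the new capability as a
standard trait, consumers can sanity-check the trait name with os-traits in
the usual way. A short sketch (assumes an os-traits release that already
defines COMPUTE_SOCKET_PCI_NUMA_AFFINITY, as the driver change requires):

    import os_traits as ot

    # check_traits() splits an iterable of trait names into (valid, invalid)
    traits = {ot.COMPUTE_SOCKET_PCI_NUMA_AFFINITY: True}
    valid, invalid = ot.check_traits(traits)

    print(sorted(valid))    # ['COMPUTE_SOCKET_PCI_NUMA_AFFINITY']
    print(sorted(invalid))  # []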