diff --git a/nova/objects/pci_device.py b/nova/objects/pci_device.py index b675641a06db..b0d5b75826b4 100644 --- a/nova/objects/pci_device.py +++ b/nova/objects/pci_device.py @@ -346,10 +346,40 @@ class PciDevice(base.NovaPersistentObject, base.NovaObject): # Update PF status to CLAIMED if all of it dependants are free # and set their status to UNCLAIMABLE vfs_list = self.child_devices - if not all([vf.is_available() for vf in vfs_list]): - raise exception.PciDeviceVFInvalidStatus( - compute_node_id=self.compute_node_id, - address=self.address) + non_free_dependants = [ + vf for vf in vfs_list if not vf.is_available()] + if non_free_dependants: + # NOTE(gibi): There should not be any dependent devices that + # are UNCLAIMABLE or UNAVAILABLE as the parent is AVAILABLE, + # but we got reports in bug 1969496 that this inconsistency + # can happen. So check if the only non-free devices are in + # state UNCLAIMABLE or UNAVAILABLE then we log a warning but + # allow to claim the parent. + actual_statuses = { + child.status for child in non_free_dependants} + allowed_non_free_statues = { + fields.PciDeviceStatus.UNCLAIMABLE, + fields.PciDeviceStatus.UNAVAILABLE, + } + if actual_statuses - allowed_non_free_statues == set(): + LOG.warning( + "Some child device of parent %s is in an inconsistent " + "state. If you can reproduce this warning then please " + "report a bug at " + "https://bugs.launchpad.net/nova/+filebug with " + "reproduction steps. Inconsistent children with " + "state: %s", + self.address, + ",".join( + "%s - %s" % (child.address, child.status) + for child in non_free_dependants + ), + ) + + else: + raise exception.PciDeviceVFInvalidStatus( + compute_node_id=self.compute_node_id, + address=self.address) self._bulk_update_status(vfs_list, fields.PciDeviceStatus.UNCLAIMABLE) diff --git a/nova/pci/stats.py b/nova/pci/stats.py index c8dda84d4bfb..6a53c43c787f 100644 --- a/nova/pci/stats.py +++ b/nova/pci/stats.py @@ -279,8 +279,12 @@ class PciDeviceStats(object): if pci_dev.dev_type == fields.PciDeviceType.SRIOV_PF: vfs_list = pci_dev.child_devices if vfs_list: + free_devs = self.get_free_devs() for vf in vfs_list: - self.remove_device(vf) + # NOTE(gibi): do not try to remove a device that are + # already removed + if vf in free_devs: + self.remove_device(vf) elif pci_dev.dev_type in ( fields.PciDeviceType.SRIOV_VF, fields.PciDeviceType.VDPA, diff --git a/nova/tests/unit/pci/test_manager.py b/nova/tests/unit/pci/test_manager.py index a924d71ef400..6682b3e3b216 100644 --- a/nova/tests/unit/pci/test_manager.py +++ b/nova/tests/unit/pci/test_manager.py @@ -507,32 +507,134 @@ class PciDevTrackerTestCase(test.NoDBTestCase): ) # now try to claim and allocate the PF. It should work as it is # available - # This is bug 1969496 as the claim fails with exception - ex = self.assertRaises( - exception.PciDevicePoolEmpty, - self.tracker.claim_instance, - mock.sentinel.context, - pci_requests_obj, - None - ) - self.assertIn( - 'Attempt to consume PCI device 1:0000:00:02.1 from empty pool', - str(ex) - ) + self.tracker.claim_instance( + mock.sentinel.context, pci_requests_obj, None) + self.tracker.allocate_instance({'uuid': uuidsentinel.instance1}) + pf_dev = self._get_device_by_address(pf['address']) - self.assertEqual('available', pf_dev.status) + self.assertEqual('allocated', pf_dev.status) vf_dev = self._get_device_by_address(vf['address']) self.assertEqual('unavailable', vf_dev.status) - # This should work when the bug is fixed - # self.tracker.claim_instance( - # mock.sentinel.context, pci_requests_obj, None) - # self.tracker.allocate_instance({'uuid': uuidsentinel.instance1}) + self.assertIn( + 'Some child device of parent 0000:00:01.1 is in an inconsistent ' + 'state. If you can reproduce this warning then please report a ' + 'bug at https://bugs.launchpad.net/nova/+filebug with ' + 'reproduction steps. Inconsistent children with state: ' + '0000:00:02.1 - unavailable', + self.stdlog.logger.output + ) - # pf_dev = self._get_device_by_address(pf['address']) - # self.assertEqual('allocated', pf_dev.status) - # vf_dev = self._get_device_by_address(vf['address']) - # self.assertEqual('unavailable', vf_dev.status) + # Ensure that the claim actually fixes the inconsistency so when the + # parent if freed the children become available too. + self.tracker.free_instance( + mock.sentinel.context, {'uuid': uuidsentinel.instance1}) + + pf_dev = self._get_device_by_address(pf['address']) + self.assertEqual('available', pf_dev.status) + vf_dev = self._get_device_by_address(vf['address']) + self.assertEqual('available', vf_dev.status) + + def test_claim_available_pf_while_children_vfs_are_in_mixed_state(self): + # We start with a PF parent and two VF children. The PF is available + # and one of the VF is unavailable while the other is available. + pf = copy.deepcopy(fake_db_dev_3) + vf1 = copy.deepcopy(fake_db_dev_4) + vf1['status'] = fields.PciDeviceStatus.UNAVAILABLE + vf2 = copy.deepcopy(fake_db_dev_5) + vf2['status'] = fields.PciDeviceStatus.AVAILABLE + self._create_tracker([pf, vf1, vf2]) + + pf_dev = self._get_device_by_address(pf['address']) + self.assertEqual('available', pf_dev.status) + vf1_dev = self._get_device_by_address(vf1['address']) + self.assertEqual('unavailable', vf1_dev.status) + vf2_dev = self._get_device_by_address(vf2['address']) + self.assertEqual('available', vf2_dev.status) + + pci_requests_obj = self._create_pci_requests_object( + [ + { + 'count': 1, + 'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}] + } + ], + instance_uuid=uuidsentinel.instance1, + ) + # now try to claim and allocate the PF. It should work as it is + # available + self.tracker.claim_instance( + mock.sentinel.context, pci_requests_obj, None) + self.tracker.allocate_instance({'uuid': uuidsentinel.instance1}) + + pf_dev = self._get_device_by_address(pf['address']) + self.assertEqual('allocated', pf_dev.status) + vf1_dev = self._get_device_by_address(vf1['address']) + self.assertEqual('unavailable', vf1_dev.status) + vf2_dev = self._get_device_by_address(vf2['address']) + self.assertEqual('unavailable', vf2_dev.status) + + self.assertIn( + 'Some child device of parent 0000:00:01.1 is in an inconsistent ' + 'state. If you can reproduce this warning then please report a ' + 'bug at https://bugs.launchpad.net/nova/+filebug with ' + 'reproduction steps. Inconsistent children with state: ' + '0000:00:02.1 - unavailable', + self.stdlog.logger.output + ) + + # Ensure that the claim actually fixes the inconsistency so when the + # parent if freed the children become available too. + self.tracker.free_instance( + mock.sentinel.context, {'uuid': uuidsentinel.instance1}) + + pf_dev = self._get_device_by_address(pf['address']) + self.assertEqual('available', pf_dev.status) + vf1_dev = self._get_device_by_address(vf1['address']) + self.assertEqual('available', vf1_dev.status) + vf2_dev = self._get_device_by_address(vf2['address']) + self.assertEqual('available', vf2_dev.status) + + def test_claim_available_pf_while_a_child_is_used(self): + pf = copy.deepcopy(fake_db_dev_3) + vf1 = copy.deepcopy(fake_db_dev_4) + vf1['status'] = fields.PciDeviceStatus.UNAVAILABLE + vf2 = copy.deepcopy(fake_db_dev_5) + vf2['status'] = fields.PciDeviceStatus.CLAIMED + self._create_tracker([pf, vf1, vf2]) + + pf_dev = self._get_device_by_address(pf['address']) + self.assertEqual('available', pf_dev.status) + vf1_dev = self._get_device_by_address(vf1['address']) + self.assertEqual('unavailable', vf1_dev.status) + vf2_dev = self._get_device_by_address(vf2['address']) + self.assertEqual('claimed', vf2_dev.status) + + pci_requests_obj = self._create_pci_requests_object( + [ + { + 'count': 1, + 'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}] + } + ], + instance_uuid=uuidsentinel.instance1, + ) + # now try to claim and allocate the PF. The claim should fail as on of + # the child is used. + self.assertRaises( + exception.PciDeviceVFInvalidStatus, + self.tracker.claim_instance, + mock.sentinel.context, + pci_requests_obj, + None, + ) + + pf_dev = self._get_device_by_address(pf['address']) + self.assertEqual('available', pf_dev.status) + vf1_dev = self._get_device_by_address(vf1['address']) + self.assertEqual('unavailable', vf1_dev.status) + vf2_dev = self._get_device_by_address(vf2['address']) + self.assertEqual('claimed', vf2_dev.status) def test_update_pci_for_instance_active(self): pci_requests_obj = self._create_pci_requests_object(fake_pci_requests)