Invalidate provider tree when compute node disappears
There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed from the DB and not repopulated. Ultimately this prevents instances from being scheduled to these nodes. The issue being addressed here is that if a compute node is deleted by a host which thinks it is an orphan, then the resource provider for that node might also be deleted. The compute host that owns the node might not recreate the resource provider if it still exists in that host's provider tree cache. This change fixes the issue by clearing from the provider tree cache any resource providers for which a compute node entry does not exist. Then, when the available resource for the node is updated, the resource providers are not found in the cache and get recreated in placement. Change-Id: Ia53ff43e6964963cdf295604ba0fb7171389606e Related-Bug: #1853009 Related-Bug: #1841481
This commit is contained in:

committed by
Lee Yarwood

parent
32676a9f45
commit
2bb4527228
@@ -1962,3 +1962,4 @@ class ResourceTracker(object):
|
||||
# where another compute service took ownership of the node. Clean
|
||||
# up the cache.
|
||||
self.remove_node(stale_cn)
|
||||
self.reportclient.invalidate_resource_provider(stale_cn)
|
||||
|
@@ -677,11 +677,7 @@ class SchedulerReportClient(object):
|
||||
if resp:
|
||||
LOG.info("Deleted resource provider %s", rp_uuid)
|
||||
# clean the caches
|
||||
try:
|
||||
self._provider_tree.remove(rp_uuid)
|
||||
except ValueError:
|
||||
pass
|
||||
self._association_refresh_time.pop(rp_uuid, None)
|
||||
self.invalidate_resource_provider(rp_uuid)
|
||||
return
|
||||
|
||||
msg = ("[%(placement_req_id)s] Failed to delete resource provider "
|
||||
@@ -2266,6 +2262,17 @@ class SchedulerReportClient(object):
|
||||
# left a no-op for backward compatibility.
|
||||
pass
|
||||
|
||||
def invalidate_resource_provider(self, name_or_uuid):
    """Evict a resource provider from this client's local caches.

    The provider is removed from the cached provider tree and any entry
    stored under the same identifier in the association refresh-time map
    is discarded. A ``ValueError`` raised by the tree removal is swallowed
    (presumably it signals that the provider is not cached), so the call
    is best-effort.

    :param name_or_uuid: Name or UUID of the resource provider whose
        cached state should be discarded.
    """
    cached_tree = self._provider_tree
    try:
        cached_tree.remove(name_or_uuid)
    except ValueError:
        # Nothing to evict from the tree; carry on and clear the timestamp.
        pass

    # NOTE(review): the refresh-time entry is popped using the identifier
    # exactly as given. If that map is keyed by UUID only, calling this with
    # a provider *name* leaves the timestamp behind -- confirm with callers.
    self._association_refresh_time.pop(name_or_uuid, None)
|
||||
|
||||
def get_provider_by_name(self, context, name):
|
||||
"""Queries the placement API for resource provider information matching
|
||||
a supplied name.
|
||||
|
@@ -153,9 +153,8 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
|
||||
self.assertEqual(0, len(rps), rps)
|
||||
|
||||
# host_b[3]: Should recreate compute node and resource provider.
|
||||
# FIXME(mgoddard): Resource provider not recreated here, because it
|
||||
# exists in the provider tree. See
|
||||
# https://bugs.launchpad.net/nova/+bug/1841481.
|
||||
# FIXME(mgoddard): Resource provider not recreated here, due to
|
||||
# https://bugs.launchpad.net/nova/+bug/1853159.
|
||||
host_b.manager.update_available_resource(self.ctxt)
|
||||
|
||||
# Verify that the node was recreated.
|
||||
@@ -170,14 +169,11 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
|
||||
self.assertEqual(0, len(rps), rps)
|
||||
|
||||
# But the RP exists in the provider tree.
|
||||
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
|
||||
self.assertFalse(host_b.manager.rt.reportclient._provider_tree.exists(
|
||||
self.nodename))
|
||||
|
||||
# host_b[1]: Should add compute node to RT cache and recreate resource
|
||||
# provider.
|
||||
# FIXME(mgoddard): Resource provider not recreated here, because it
|
||||
# exists in the provider tree. See
|
||||
# https://bugs.launchpad.net/nova/+bug/1841481.
|
||||
host_b.manager.update_available_resource(self.ctxt)
|
||||
|
||||
# Verify that the node still exists.
|
||||
@@ -186,13 +182,10 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
|
||||
# And it is now in the RT cache.
|
||||
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
|
||||
|
||||
# There is still no RP.
|
||||
# The resource provider has now been created.
|
||||
rps = self._get_all_providers()
|
||||
self.assertEqual(0, len(rps), rps)
|
||||
|
||||
# But the RP it exists in the provider tree.
|
||||
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
|
||||
self.nodename))
|
||||
self.assertEqual(1, len(rps), rps)
|
||||
self.assertEqual(self.nodename, rps[0]['name'])
|
||||
|
||||
# This fails due to the lack of a resource provider.
|
||||
self.assertIn(
|
||||
|
@@ -4192,5 +4192,9 @@ class TestCleanComputeNodeCache(BaseTestCase):
|
||||
invalid_nodename = "invalid-node"
|
||||
self.rt.compute_nodes[_NODENAME] = self.compute
|
||||
self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
|
||||
self.rt.clean_compute_node_cache([self.compute])
|
||||
mock_remove.assert_called_once_with(invalid_nodename)
|
||||
with mock.patch.object(
|
||||
self.rt.reportclient, "invalidate_resource_provider",
|
||||
) as mock_invalidate:
|
||||
self.rt.clean_compute_node_cache([self.compute])
|
||||
mock_remove.assert_called_once_with(invalid_nodename)
|
||||
mock_invalidate.assert_called_once_with(invalid_nodename)
|
||||
|
Reference in New Issue
Block a user