Invalidate provider tree when compute node disappears

There is a race condition in nova-compute with the ironic virt driver
as nodes get rebalanced. It can lead to compute nodes being removed in
the DB and not repopulated. Ultimately this prevents these nodes from
being scheduled to.

The issue being addressed here is that if a compute node is deleted by a
host which thinks it is an orphan, then the resource provider for that
node might also be deleted. The compute host that owns the node might
not recreate the resource provider if it exists in the provider tree
cache.

This change fixes the issue by clearing resource providers from the
provider tree cache for which a compute node entry does not exist. Then,
when the available resource for the node is updated, the resource
providers are not found in the cache and get recreated in placement.

Change-Id: Ia53ff43e6964963cdf295604ba0fb7171389606e
Related-Bug: #1853009
Related-Bug: #1841481
This commit is contained in:
Mark Goddard
2019-11-19 16:51:01 +00:00
committed by Lee Yarwood
parent 32676a9f45
commit 2bb4527228
4 changed files with 25 additions and 20 deletions

View File

@@ -1962,3 +1962,4 @@ class ResourceTracker(object):
# where another compute service took ownership of the node. Clean
# up the cache.
self.remove_node(stale_cn)
self.reportclient.invalidate_resource_provider(stale_cn)

View File

@@ -677,11 +677,7 @@ class SchedulerReportClient(object):
if resp:
LOG.info("Deleted resource provider %s", rp_uuid)
# clean the caches
try:
self._provider_tree.remove(rp_uuid)
except ValueError:
pass
self._association_refresh_time.pop(rp_uuid, None)
self.invalidate_resource_provider(rp_uuid)
return
msg = ("[%(placement_req_id)s] Failed to delete resource provider "
@@ -2266,6 +2262,17 @@ class SchedulerReportClient(object):
# left a no-op for backward compatibility.
pass
def invalidate_resource_provider(self, name_or_uuid):
"""Invalidate the cache for a resource provider.
:param name_or_uuid: Name or UUID of the resource provider to look up.
"""
try:
self._provider_tree.remove(name_or_uuid)
except ValueError:
pass
self._association_refresh_time.pop(name_or_uuid, None)
def get_provider_by_name(self, context, name):
"""Queries the placement API for resource provider information matching
a supplied name.

View File

@@ -153,9 +153,8 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
self.assertEqual(0, len(rps), rps)
# host_b[3]: Should recreate compute node and resource provider.
# FIXME(mgoddard): Resource provider not recreated here, because it
# exists in the provider tree. See
# https://bugs.launchpad.net/nova/+bug/1841481.
# FIXME(mgoddard): Resource provider not recreated here, due to
# https://bugs.launchpad.net/nova/+bug/1853159.
host_b.manager.update_available_resource(self.ctxt)
# Verify that the node was recreated.
@@ -170,14 +169,11 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
self.assertEqual(0, len(rps), rps)
# But the RP exists in the provider tree.
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
self.assertFalse(host_b.manager.rt.reportclient._provider_tree.exists(
self.nodename))
# host_b[1]: Should add compute node to RT cache and recreate resource
# provider.
# FIXME(mgoddard): Resource provider not recreated here, because it
# exists in the provider tree. See
# https://bugs.launchpad.net/nova/+bug/1841481.
host_b.manager.update_available_resource(self.ctxt)
# Verify that the node still exists.
@@ -186,13 +182,10 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
# And it is now in the RT cache.
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
# There is still no RP.
# The resource provider has now been created.
rps = self._get_all_providers()
self.assertEqual(0, len(rps), rps)
# But the RP it exists in the provider tree.
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
self.nodename))
self.assertEqual(1, len(rps), rps)
self.assertEqual(self.nodename, rps[0]['name'])
# This fails due to the lack of a resource provider.
self.assertIn(

View File

@@ -4192,5 +4192,9 @@ class TestCleanComputeNodeCache(BaseTestCase):
invalid_nodename = "invalid-node"
self.rt.compute_nodes[_NODENAME] = self.compute
self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
self.rt.clean_compute_node_cache([self.compute])
mock_remove.assert_called_once_with(invalid_nodename)
with mock.patch.object(
self.rt.reportclient, "invalidate_resource_provider",
) as mock_invalidate:
self.rt.clean_compute_node_cache([self.compute])
mock_remove.assert_called_once_with(invalid_nodename)
mock_invalidate.assert_called_once_with(invalid_nodename)