diff --git a/.zuul.yaml b/.zuul.yaml
index b6a8b04fc97f..07277e065b2d 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -210,6 +210,9 @@
           $NEUTRON_CONF:
             nova:
               live_migration_events: True
+          $NOVA_CPU_CONF:
+            compute:
+              heal_instance_info_cache_interval: 60
     group-vars:
       subnode:
         devstack_localrc:
@@ -250,6 +253,9 @@
           $NEUTRON_CONF:
             nova:
               live_migration_events: True
+          $NOVA_CPU_CONF:
+            compute:
+              heal_instance_info_cache_interval: 60
     post-run: playbooks/nova-live-migration/post-run.yaml

 - job:
@@ -422,10 +428,6 @@
               # reduce the number of placement calls in steady state. Added in
               # Stein.
               resource_provider_association_refresh: 0
-              # Neutron networking backends today are expected to work without
-              # the periodic healing of the cache in Nova. Turn it off to gain
-              # additional performance.
-              heal_instance_info_cache_interval: -1
             workarounds:
               # This wa is an improvement on hard reboot that cannot be turned
               # on unconditionally. But we know that ml2/ovs sends plug time
diff --git a/nova/conf/compute.py b/nova/conf/compute.py
index 3a5b3f5a7315..c8b143b5dd44 100644
--- a/nova/conf/compute.py
+++ b/nova/conf/compute.py
@@ -1085,7 +1085,7 @@ Related options:
   to be synchronized manually.
 """),
     cfg.IntOpt('heal_instance_info_cache_interval',
-        default=60,
+        default=-1,
         help="""
 Interval between instance network information cache updates.

diff --git a/releasenotes/notes/disable_heal_instance_info_cache_interval-0d9ae7c12793bf7b.yaml b/releasenotes/notes/disable_heal_instance_info_cache_interval-0d9ae7c12793bf7b.yaml
new file mode 100644
index 000000000000..402f5e9fcada
--- /dev/null
+++ b/releasenotes/notes/disable_heal_instance_info_cache_interval-0d9ae7c12793bf7b.yaml
@@ -0,0 +1,51 @@
+---
+upgrade:
+  - |
+    ``[compute]heal_instance_info_cache_interval`` now defaults to ``-1``,
+    disabling the periodic task.
+
+    In the early days of Nova, all networking was handled internally;
+    ``quantum``, now known as ``neutron``, was introduced later. While the
+    networking subsystem was being externalized and neutron was still
+    optional, Nova needed to keep track of the ports associated with an
+    instance. To avoid expensive calls to an optional service, the instance
+    info cache was extended to include network information and a periodic
+    task was introduced to update it in
+    ``08fa534a0d28fa1be48aef927584161becb936c7`` as part of the ``Essex``
+    release.
+
+    As we have learned over the years, per-compute periodic tasks that call
+    other services do not scale well as the number of compute nodes
+    increases. The ``os-server-external-events`` API was introduced in
+    ``ce936ea5f3ae0b4d3b816a7fe42d5f0100b20fca`` as part of the Icehouse
+    release; it allows external systems such as neutron to trigger cache
+    refreshes on demand. With the introduction of this API, neutron was
+    modified to send ``network-changed`` events on a per-port basis as API
+    actions are performed on neutron ports. At that time the default value
+    of ``[compute]heal_instance_info_cache_interval`` was left unchanged to
+    avoid any upgrade impact.
+
+    In ``ba44c155ce1dcefede9741722a0525820d6da2b8``, as part of bug
+    #1751923, the ``_heal_instance_info_cache`` periodic task was modified
+    to pass ``force_refresh``, forcing Nova to look up the current state of
+    all ports for the instance from neutron and fully rebuild the
+    ``info_cache``. This has the side effect of making the already poor
+    scaling of this optional periodic task even worse.
+
+    In this release, the default behaviour of Nova has been changed to
+    disable this periodic task, optimizing for performance, scale, power
+    consumption and the typical deployment topology, in which the instance
+    network information is updated by neutron via the external event API
+    as ports are modified. This should significantly reduce the background
+    neutron API load in medium to large clouds. If you have a neutron
+    backend that does not reliably send ``network-changed`` event
+    notifications to Nova, you can re-enable the periodic task by setting
+    ``[compute]heal_instance_info_cache_interval`` to a value greater
+    than 0.
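+
+    For example, to restore the previous behaviour you could set the option
+    back to its old default of 60 seconds in ``nova.conf`` on your compute
+    nodes (any value greater than 0 re-enables the task)::
+
+        [compute]
+        heal_instance_info_cache_interval = 60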