Compute restart causes period of network 'blackout'

Fixes bug 1034401

When a compute service is restarted each instance running on the
host has its iptables rules built and applied sequentially during
the host init stage. The impact of this, especially on a host
running many instances, can be observed as a period during which some
instances are not accessible, because the existing iptables rules have
been torn down but not yet re-applied.

The work-around presented here is a configurable (flag-controlled)
deferred mode that postpones the application of the iptables rules until
all instances on the host have been initialised; the rules for all
instances are then applied at once, avoiding the 'blackout' period.

Change-Id: I0da90d07e54225fb63f3884897fb00a6027cd537
This commit is contained in:
David McNally
2012-08-08 16:20:23 +01:00
parent 043e3f5981
commit 8f1c54ce98
7 changed files with 117 additions and 32 deletions

View File

@@ -275,43 +275,55 @@ class ComputeManager(manager.SchedulerDependentManager):
self.driver.init_host(host=self.host)
context = nova.context.get_admin_context()
instances = self.db.instance_get_all_by_host(context, self.host)
for count, instance in enumerate(instances):
db_state = instance['power_state']
drv_state = self._get_power_state(context, instance)
expect_running = (db_state == power_state.RUNNING and
drv_state != db_state)
if FLAGS.defer_iptables_apply:
self.driver.filter_defer_apply_on()
LOG.debug(_('Current state is %(drv_state)s, state in DB is '
'%(db_state)s.'), locals(), instance=instance)
try:
for count, instance in enumerate(instances):
db_state = instance['power_state']
drv_state = self._get_power_state(context, instance)
net_info = compute_utils.get_nw_info_for_instance(instance)
expect_running = (db_state == power_state.RUNNING and
drv_state != db_state)
# We're calling plug_vifs to ensure bridge and iptables
# filters are present, calling it once is enough.
if count == 0:
legacy_net_info = self._legacy_nw_info(net_info)
self.driver.plug_vifs(instance, legacy_net_info)
LOG.debug(_('Current state is %(drv_state)s, state in DB is '
'%(db_state)s.'), locals(), instance=instance)
if ((expect_running and FLAGS.resume_guests_state_on_host_boot) or
FLAGS.start_guests_on_host_boot):
LOG.info(_('Rebooting instance after nova-compute restart.'),
locals(), instance=instance)
try:
self.driver.resume_state_on_host_boot(context, instance,
self._legacy_nw_info(net_info))
except NotImplementedError:
LOG.warning(_('Hypervisor driver does not support '
'resume guests'), instance=instance)
net_info = compute_utils.get_nw_info_for_instance(instance)
elif drv_state == power_state.RUNNING:
# VMWareAPI drivers will raise an exception
try:
self.driver.ensure_filtering_rules_for_instance(instance,
self._legacy_nw_info(net_info))
except NotImplementedError:
LOG.warning(_('Hypervisor driver does not support '
'firewall rules'), instance=instance)
# We're calling plug_vifs to ensure bridge and iptables
# filters are present, calling it once is enough.
if count == 0:
legacy_net_info = self._legacy_nw_info(net_info)
self.driver.plug_vifs(instance, legacy_net_info)
if ((expect_running and FLAGS.resume_guests_state_on_host_boot)
or FLAGS.start_guests_on_host_boot):
LOG.info(
_('Rebooting instance after nova-compute restart.'),
locals(), instance=instance)
try:
self.driver.resume_state_on_host_boot(context,
instance,
self._legacy_nw_info(net_info))
except NotImplementedError:
LOG.warning(_('Hypervisor driver does not support '
'resume guests'), instance=instance)
elif drv_state == power_state.RUNNING:
# VMWareAPI drivers will raise an exception
try:
self.driver.ensure_filtering_rules_for_instance(
instance,
self._legacy_nw_info(net_info))
except NotImplementedError:
LOG.warning(_('Hypervisor driver does not support '
'firewall rules'), instance=instance)
finally:
if FLAGS.defer_iptables_apply:
self.driver.filter_defer_apply_off()
def _get_power_state(self, context, instance):
"""Retrieve the power state for the given instance."""

View File

@@ -428,6 +428,11 @@ global_opts = [
'min_disk'],
help='These are image properties which a snapshot should not'
' inherit from an instance'),
cfg.BoolOpt('defer_iptables_apply',
default=False,
help='Whether to batch up the application of IPTables rules'
' during a host restart and apply all at the end of the'
' init phase'),
]
FLAGS.register_opts(global_opts)

View File

@@ -263,6 +263,8 @@ class IptablesManager(object):
'nat': IptablesTable()}
self.ipv6 = {'filter': IptablesTable()}
self.iptables_apply_deferred = False
# Add a nova-filter-top chain. It's intended to be shared
# among the various nova components. It sits at the very top
# of FORWARD and OUTPUT.
@@ -312,8 +314,21 @@ class IptablesManager(object):
self.ipv4['nat'].add_chain('float-snat')
self.ipv4['nat'].add_rule('snat', '-j $float-snat')
@utils.synchronized('iptables', external=True)
def defer_apply_on(self):
self.iptables_apply_deferred = True
def defer_apply_off(self):
self.iptables_apply_deferred = False
self._apply()
def apply(self):
    """Apply the in-memory iptables rules unless application is deferred.

    While defer_apply_on() is in effect this is a no-op; the accumulated
    rules are flushed later by defer_apply_off().
    """
    if not self.iptables_apply_deferred:
        self._apply()
@utils.synchronized('iptables', external=True)
def _apply(self):
"""Apply the current in-memory set of iptables rules.
This will blow away any rules left over from previous runs of the

View File

@@ -508,3 +508,28 @@ class LinuxNetworkTestCase(test.TestCase):
'2001:db8::/64', 'dev', 'eth0'),
]
self._test_initialize_gateway(existing, expected)
def test_apply_ran(self):
manager = linux_net.IptablesManager()
manager.iptables_apply_deferred = False
self.mox.StubOutWithMock(manager, '_apply')
manager._apply()
self.mox.ReplayAll()
empty_ret = manager.apply()
self.assertEqual(empty_ret, None)
def test_apply_not_run(self):
manager = linux_net.IptablesManager()
manager.iptables_apply_deferred = True
self.mox.StubOutWithMock(manager, '_apply')
self.mox.ReplayAll()
manager.apply()
def test_deferred_unset_apply_ran(self):
manager = linux_net.IptablesManager()
manager.iptables_apply_deferred = True
self.mox.StubOutWithMock(manager, '_apply')
manager._apply()
self.mox.ReplayAll()
manager.defer_apply_off()
self.assertFalse(manager.iptables_apply_deferred)

View File

@@ -488,6 +488,14 @@ class ComputeDriver(object):
# TODO(Vek): Need to pass context in for access to auth_token
raise NotImplementedError()
def filter_defer_apply_on(self):
"""Defer application of IPTables rules"""
pass
def filter_defer_apply_off(self):
"""Turn off deferral of IPTables rules and apply the rules now"""
pass
def unfilter_instance(self, instance, network_info):
"""Stop filtering instance"""
# TODO(Vek): Need to pass context in for access to auth_token

View File

@@ -47,6 +47,14 @@ class FirewallDriver(object):
At this point, the instance isn't running yet."""
raise NotImplementedError()
def filter_defer_apply_on(self):
"""Defer application of IPTables rules"""
pass
def filter_defer_apply_off(self):
"""Turn off deferral of IPTables rules and apply the rules now"""
pass
def unfilter_instance(self, instance, network_info):
"""Stop filtering instance"""
raise NotImplementedError()
@@ -128,6 +136,12 @@ class IptablesFirewallDriver(FirewallDriver):
"""No-op. Everything is done in prepare_instance_filter."""
pass
def filter_defer_apply_on(self):
self.iptables.defer_apply_on()
def filter_defer_apply_off(self):
self.iptables.defer_apply_off()
def unfilter_instance(self, instance, network_info):
# make sure this is legacy nw_info
network_info = self._handle_network_info_model(network_info)

View File

@@ -2435,6 +2435,12 @@ class LibvirtDriver(driver.ComputeDriver):
raise exception.NovaException(msg % instance_ref["name"])
time.sleep(1)
def filter_defer_apply_on(self):
self.firewall_driver.filter_defer_apply_on()
def filter_defer_apply_off(self):
self.firewall_driver.filter_defer_apply_off()
def live_migration(self, ctxt, instance_ref, dest,
post_method, recover_method, block_migration=False):
"""Spawning live_migration operation for distributing high-load.