diff --git a/neutron/agent/common/ovs_lib.py b/neutron/agent/common/ovs_lib.py
index 9fa098f25b9..c75ede74ce2 100644
--- a/neutron/agent/common/ovs_lib.py
+++ b/neutron/agent/common/ovs_lib.py
@@ -16,6 +16,7 @@
 import collections
 import itertools
 import operator
+import time
 import uuid
 
 from oslo_config import cfg
@@ -255,13 +256,26 @@ class OVSBridge(BaseOVS):
 
     def run_ofctl(self, cmd, args, process_input=None):
         full_args = ["ovs-ofctl", cmd, self.br_name] + args
-        try:
-            return utils.execute(full_args, run_as_root=True,
-                                 process_input=process_input)
-        except Exception as e:
-            LOG.error(_LE("Unable to execute %(cmd)s. Exception: "
-                          "%(exception)s"),
-                      {'cmd': full_args, 'exception': e})
+        # TODO(kevinbenton): This error handling is really brittle and only
+        # detects one specific type of failure. The callers of this need to
+        # be refactored to expect errors so we can re-raise and they can
+        # take appropriate action based on the type of error.
+        for i in range(1, 11):
+            try:
+                return utils.execute(full_args, run_as_root=True,
+                                     process_input=process_input)
+            except Exception as e:
+                # Only retry the socket failure while attempts remain; the
+                # final failure is logged as an error rather than dropped.
+                if "failed to connect to socket" in str(e) and i < 10:
+                    LOG.debug("Failed to connect to OVS. Retrying "
+                              "in 1 second. Attempt: %s/10", i)
+                    time.sleep(1)
+                    continue
+                LOG.error(_LE("Unable to execute %(cmd)s. Exception: "
+                              "%(exception)s"),
+                          {'cmd': full_args, 'exception': e})
+                break
 
     def count_flows(self):
         flow_list = self.run_ofctl("dump-flows", []).split("\n")[1:]
diff --git a/neutron/tests/unit/agent/common/test_ovs_lib.py b/neutron/tests/unit/agent/common/test_ovs_lib.py
index ec4189d0379..3d0c6139c5f 100644
--- a/neutron/tests/unit/agent/common/test_ovs_lib.py
+++ b/neutron/tests/unit/agent/common/test_ovs_lib.py
@@ -342,6 +342,21 @@ class OVS_Lib_Test(base.BaseTestCase):
                           self.br.mod_flow,
                           **params)
 
+    def test_run_ofctl_retry_on_socket_error(self):
+        err = RuntimeError('failed to connect to socket')
+        self.execute.side_effect = [err] * 5
+        with mock.patch('time.sleep') as sleep:
+            self.br.run_ofctl('add-flows', [])
+        self.assertEqual(5, sleep.call_count)
+        self.assertEqual(6, self.execute.call_count)
+        # a regular exception fails right away
+        self.execute.side_effect = RuntimeError('garbage')
+        self.execute.reset_mock()
+        with mock.patch('time.sleep') as sleep:
+            self.br.run_ofctl('add-flows', [])
+        self.assertEqual(0, sleep.call_count)
+        self.assertEqual(1, self.execute.call_count)
+
     def test_add_tunnel_port(self):
         pname = "tap99"
         local_ip = "1.1.1.1"