]> review.fuel-infra Code Review - openstack-build/neutron-build.git/commitdiff
L3 Agent restart causes network outage
authorStephen Ma <stephen.ma@hp.com>
Wed, 29 May 2013 01:52:27 +0000 (18:52 -0700)
committerThomas Goirand <thomas@goirand.fr>
Thu, 13 Mar 2014 07:20:12 +0000 (15:20 +0800)
When a L3 agent controlling multiple qrouter namespaces
restarts, it destroys all qrouter namespaces even if
some of them are still in use.  As a result, network
traffic could be stopped on the VMs that use the
networks associated with these namespaces.

So what is needed is for the L3 agent to preserve those
qrouter namespaces a L3 agent instance recognizes and to
destroy those it does not know about.

Closes-Bug: #1175695

Change-Id: Idae77886bd195d773878c3d212ccfd56269216fb

neutron/agent/l3_agent.py
neutron/tests/unit/test_l3_agent.py

index 1ca3b7604e8e77471c24e37eda19b789ff3534e5..cf69842ec17cd7ab3d750c92a1b792b81070bc08 100644 (file)
@@ -216,8 +216,9 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
         self.updated_routers = set()
         self.removed_routers = set()
         self.sync_progress = False
-        if self.conf.use_namespaces:
-            self._destroy_router_namespaces(self.conf.router_id)
+
+        self._delete_stale_namespaces = (self.conf.use_namespaces and
+                                         self.conf.router_delete_namespaces)
 
         self.rpc_loop = loopingcall.FixedIntervalLoopingCall(
             self._rpc_loop)
@@ -240,28 +241,46 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
             LOG.error(msg)
             raise SystemExit(msg)
 
-    def _destroy_router_namespaces(self, only_router_id=None):
-        """Destroy router namespaces on the host to eliminate all stale
-        linux devices, iptables rules, and namespaces.
+    def _cleanup_namespaces(self, routers):
+        """Destroy stale router namespaces on host when L3 agent restarts
 
-        If only_router_id is passed, only destroy single namespace, to allow
-        for multiple l3 agents on the same host, without stepping on each
-        other's toes on init.  This only makes sense if only_router_id is set.
+        This routine is called when self._delete_stale_namespaces is True.
+
+        The argument routers is the list of routers that are recorded in
+        the database as being hosted on this node.
         """
-        root_ip = ip_lib.IPWrapper(self.root_helper)
-        for ns in root_ip.get_namespaces(self.root_helper):
-            if ns.startswith(NS_PREFIX):
-                router_id = ns[len(NS_PREFIX):]
-                if only_router_id and not only_router_id == router_id:
-                    continue
+        try:
+            root_ip = ip_lib.IPWrapper(self.root_helper)
+
+            host_namespaces = root_ip.get_namespaces(self.root_helper)
+            router_namespaces = set(ns for ns in host_namespaces
+                                    if ns.startswith(NS_PREFIX))
+            ns_to_ignore = set(NS_PREFIX + r['id'] for r in routers)
+            ns_to_destroy = router_namespaces - ns_to_ignore
+        except RuntimeError:
+            LOG.exception(_('RuntimeError in obtaining router list '
+                            'for namespace cleanup.'))
+        else:
+            self._destroy_stale_router_namespaces(ns_to_destroy)
+
+    def _destroy_stale_router_namespaces(self, router_namespaces):
+        """Destroys the stale router namespaces
 
-                if self.conf.enable_metadata_proxy:
-                    self._destroy_metadata_proxy(router_id, ns)
+        The argumenet router_namespaces is a list of stale router namespaces
 
-                try:
-                    self._destroy_router_namespace(ns)
-                except Exception:
-                    LOG.exception(_("Failed deleting namespace '%s'"), ns)
+        As some stale router namespaces may not be able to be deleted, only
+        one attempt will be made to delete them.
+        """
+        for ns in router_namespaces:
+            if self.conf.enable_metadata_proxy:
+                self._destroy_metadata_proxy(ns[len(NS_PREFIX):], ns)
+
+            try:
+                self._destroy_router_namespace(ns)
+            except RuntimeError:
+                LOG.exception(_('Failed to destroy stale router namespace '
+                                '%s'), ns)
+        self._delete_stale_namespaces = False
 
     def _destroy_router_namespace(self, namespace):
         ns_ip = ip_lib.IPWrapper(self.root_helper, namespace=namespace)
@@ -759,10 +778,19 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
             self._process_routers(routers, all_routers=True)
             self.fullsync = False
             LOG.debug(_("_sync_routers_task successfully completed"))
+        except rpc_common.RPCException:
+            LOG.exception(_("Failed synchronizing routers due to RPC error"))
+            self.fullsync = True
+            return
         except Exception:
             LOG.exception(_("Failed synchronizing routers"))
             self.fullsync = True
 
+        # Resync is not necessary for the cleanup of stale
+        # namespaces.
+        if self._delete_stale_namespaces:
+            self._cleanup_namespaces(routers)
+
     def after_start(self):
         LOG.info(_("L3 agent started"))
 
index 17bd83da1b15a253183fed8a99677ac7547c4058..07db8d2c8feebd7b7f3fcabc114a77aec821fddf 100644 (file)
@@ -732,55 +732,6 @@ class TestBasicRouterOperations(base.BaseTestCase):
         agent._process_router_delete()
         self.assertFalse(list(agent.removed_routers))
 
-    def test_destroy_namespace(self):
-
-        class FakeDev(object):
-            def __init__(self, name):
-                self.name = name
-
-        self.mock_ip.get_namespaces.return_value = ['qrouter-foo',
-                                                    'qrouter-bar']
-        self.mock_ip.get_devices.return_value = [FakeDev('qr-aaaa'),
-                                                 FakeDev('qgw-aaaa')]
-
-        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
-
-        pm = self.external_process.return_value
-        pm.reset_mock()
-
-        agent._destroy_router_namespace = mock.MagicMock()
-        agent._destroy_router_namespaces()
-
-        self.assertEqual(pm.disable.call_count, 2)
-
-        self.assertEqual(agent._destroy_router_namespace.call_count, 2)
-
-    def test_destroy_namespace_with_router_id(self):
-
-        class FakeDev(object):
-            def __init__(self, name):
-                self.name = name
-
-        self.conf.router_id = _uuid()
-
-        namespaces = ['qrouter-foo', 'qrouter-' + self.conf.router_id]
-
-        self.mock_ip.get_namespaces.return_value = namespaces
-        self.mock_ip.get_devices.return_value = [FakeDev('qr-aaaa'),
-                                                 FakeDev('qgw-aaaa')]
-
-        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
-
-        pm = self.external_process.return_value
-        pm.reset_mock()
-
-        agent._destroy_router_namespace = mock.MagicMock()
-        agent._destroy_router_namespaces(self.conf.router_id)
-
-        self.assertEqual(pm.disable.call_count, 1)
-
-        self.assertEqual(agent._destroy_router_namespace.call_count, 1)
-
     def test_destroy_router_namespace_skips_ns_removal(self):
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
         agent._destroy_router_namespace("fakens")
@@ -842,6 +793,7 @@ class TestBasicRouterOperations(base.BaseTestCase):
         self.conf.set_override('router_id', '1234')
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
         self.assertEqual(['1234'], agent._router_ids())
+        self.assertFalse(agent._delete_stale_namespaces)
 
     def test_process_routers_with_no_ext_net_in_conf(self):
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
@@ -940,6 +892,69 @@ class TestBasicRouterOperations(base.BaseTestCase):
                  '-p tcp -m tcp --dport 8775 -j ACCEPT')
         self.assertEqual([rules], agent.metadata_filter_rules())
 
+    def _cleanup_namespace_test(self,
+                                stale_namespace_list,
+                                router_list,
+                                other_namespaces):
+        self.conf.set_override('router_delete_namespaces', True)
+
+        good_namespace_list = [l3_agent.NS_PREFIX + r['id']
+                               for r in router_list]
+        self.mock_ip.get_namespaces.return_value = (stale_namespace_list +
+                                                    good_namespace_list +
+                                                    other_namespaces)
+
+        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
+
+        self.assertTrue(agent._delete_stale_namespaces)
+
+        pm = self.external_process.return_value
+        pm.reset_mock()
+
+        agent._destroy_router_namespace = mock.MagicMock()
+        agent._cleanup_namespaces(router_list)
+
+        self.assertEqual(pm.disable.call_count, len(stale_namespace_list))
+        self.assertEqual(agent._destroy_router_namespace.call_count,
+                         len(stale_namespace_list))
+        expected_args = [mock.call(ns) for ns in stale_namespace_list]
+        agent._destroy_router_namespace.assert_has_calls(expected_args,
+                                                         any_order=True)
+        self.assertFalse(agent._delete_stale_namespaces)
+
+    def test_cleanup_namespace(self):
+        self.conf.set_override('router_id', None)
+        stale_namespaces = [l3_agent.NS_PREFIX + 'foo',
+                            l3_agent.NS_PREFIX + 'bar']
+        other_namespaces = ['unknown']
+
+        self._cleanup_namespace_test(stale_namespaces,
+                                     [],
+                                     other_namespaces)
+
+    def test_cleanup_namespace_with_registered_router_ids(self):
+        self.conf.set_override('router_id', None)
+        stale_namespaces = [l3_agent.NS_PREFIX + 'cccc',
+                            l3_agent.NS_PREFIX + 'eeeee']
+        router_list = [{'id': 'foo'}, {'id': 'aaaa'}]
+        other_namespaces = ['qdhcp-aabbcc', 'unknown']
+
+        self._cleanup_namespace_test(stale_namespaces,
+                                     router_list,
+                                     other_namespaces)
+
+    def test_cleanup_namespace_with_conf_router_id(self):
+        self.conf.set_override('router_id', 'bbbbb')
+        stale_namespaces = [l3_agent.NS_PREFIX + 'cccc',
+                            l3_agent.NS_PREFIX + 'eeeee',
+                            l3_agent.NS_PREFIX + self.conf.router_id]
+        router_list = [{'id': 'foo'}, {'id': 'aaaa'}]
+        other_namespaces = ['qdhcp-aabbcc', 'unknown']
+
+        self._cleanup_namespace_test(stale_namespaces,
+                                     router_list,
+                                     other_namespaces)
+
 
 class TestL3AgentEventHandler(base.BaseTestCase):