From: Stephen Ma Date: Wed, 29 May 2013 01:52:27 +0000 (-0700) Subject: L3 Agent restart causes network outage X-Git-Url: https://review.fuel-infra.org/gitweb?a=commitdiff_plain;h=125b51f6376f3ad7138100ee2d53b92eb402b7a0;p=openstack-build%2Fneutron-build.git L3 Agent restart causes network outage When a L3 agent controlling multiple qrouter namespaces restarts, it destroys all qrouter namespaces even if some of them are still in use. As a result, network traffic could be stopped on the VMs that use the networks associated with these namespaces. So what is needed is for the L3 agent to preserve those qrouter namespaces a L3 agent instance recognizes and to destroy those it does not know about. Closes-Bug: #1175695 Change-Id: Idae77886bd195d773878c3d212ccfd56269216fb --- diff --git a/neutron/agent/l3_agent.py b/neutron/agent/l3_agent.py index 1ca3b7604..cf69842ec 100644 --- a/neutron/agent/l3_agent.py +++ b/neutron/agent/l3_agent.py @@ -216,8 +216,9 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager): self.updated_routers = set() self.removed_routers = set() self.sync_progress = False - if self.conf.use_namespaces: - self._destroy_router_namespaces(self.conf.router_id) + + self._delete_stale_namespaces = (self.conf.use_namespaces and + self.conf.router_delete_namespaces) self.rpc_loop = loopingcall.FixedIntervalLoopingCall( self._rpc_loop) @@ -240,28 +241,46 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager): LOG.error(msg) raise SystemExit(msg) - def _destroy_router_namespaces(self, only_router_id=None): - """Destroy router namespaces on the host to eliminate all stale - linux devices, iptables rules, and namespaces. + def _cleanup_namespaces(self, routers): + """Destroy stale router namespaces on host when L3 agent restarts - If only_router_id is passed, only destroy single namespace, to allow - for multiple l3 agents on the same host, without stepping on each - other's toes on init. This only makes sense if only_router_id is set. + This routine is called when self._delete_stale_namespaces is True. + + The argument routers is the list of routers that are recorded in + the database as being hosted on this node. """ - root_ip = ip_lib.IPWrapper(self.root_helper) - for ns in root_ip.get_namespaces(self.root_helper): - if ns.startswith(NS_PREFIX): - router_id = ns[len(NS_PREFIX):] - if only_router_id and not only_router_id == router_id: - continue + try: + root_ip = ip_lib.IPWrapper(self.root_helper) + + host_namespaces = root_ip.get_namespaces(self.root_helper) + router_namespaces = set(ns for ns in host_namespaces + if ns.startswith(NS_PREFIX)) + ns_to_ignore = set(NS_PREFIX + r['id'] for r in routers) + ns_to_destroy = router_namespaces - ns_to_ignore + except RuntimeError: + LOG.exception(_('RuntimeError in obtaining router list ' + 'for namespace cleanup.')) + else: + self._destroy_stale_router_namespaces(ns_to_destroy) + + def _destroy_stale_router_namespaces(self, router_namespaces): + """Destroys the stale router namespaces - if self.conf.enable_metadata_proxy: - self._destroy_metadata_proxy(router_id, ns) + The argumenet router_namespaces is a list of stale router namespaces - try: - self._destroy_router_namespace(ns) - except Exception: - LOG.exception(_("Failed deleting namespace '%s'"), ns) + As some stale router namespaces may not be able to be deleted, only + one attempt will be made to delete them. + """ + for ns in router_namespaces: + if self.conf.enable_metadata_proxy: + self._destroy_metadata_proxy(ns[len(NS_PREFIX):], ns) + + try: + self._destroy_router_namespace(ns) + except RuntimeError: + LOG.exception(_('Failed to destroy stale router namespace ' + '%s'), ns) + self._delete_stale_namespaces = False def _destroy_router_namespace(self, namespace): ns_ip = ip_lib.IPWrapper(self.root_helper, namespace=namespace) @@ -759,10 +778,19 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager): self._process_routers(routers, all_routers=True) self.fullsync = False LOG.debug(_("_sync_routers_task successfully completed")) + except rpc_common.RPCException: + LOG.exception(_("Failed synchronizing routers due to RPC error")) + self.fullsync = True + return except Exception: LOG.exception(_("Failed synchronizing routers")) self.fullsync = True + # Resync is not necessary for the cleanup of stale + # namespaces. + if self._delete_stale_namespaces: + self._cleanup_namespaces(routers) + def after_start(self): LOG.info(_("L3 agent started")) diff --git a/neutron/tests/unit/test_l3_agent.py b/neutron/tests/unit/test_l3_agent.py index 17bd83da1..07db8d2c8 100644 --- a/neutron/tests/unit/test_l3_agent.py +++ b/neutron/tests/unit/test_l3_agent.py @@ -732,55 +732,6 @@ class TestBasicRouterOperations(base.BaseTestCase): agent._process_router_delete() self.assertFalse(list(agent.removed_routers)) - def test_destroy_namespace(self): - - class FakeDev(object): - def __init__(self, name): - self.name = name - - self.mock_ip.get_namespaces.return_value = ['qrouter-foo', - 'qrouter-bar'] - self.mock_ip.get_devices.return_value = [FakeDev('qr-aaaa'), - FakeDev('qgw-aaaa')] - - agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) - - pm = self.external_process.return_value - pm.reset_mock() - - agent._destroy_router_namespace = mock.MagicMock() - agent._destroy_router_namespaces() - - self.assertEqual(pm.disable.call_count, 2) - - self.assertEqual(agent._destroy_router_namespace.call_count, 2) - - def test_destroy_namespace_with_router_id(self): - - class FakeDev(object): - def __init__(self, name): - self.name = name - - self.conf.router_id = _uuid() - - namespaces = ['qrouter-foo', 'qrouter-' + self.conf.router_id] - - self.mock_ip.get_namespaces.return_value = namespaces - self.mock_ip.get_devices.return_value = [FakeDev('qr-aaaa'), - FakeDev('qgw-aaaa')] - - agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) - - pm = self.external_process.return_value - pm.reset_mock() - - agent._destroy_router_namespace = mock.MagicMock() - agent._destroy_router_namespaces(self.conf.router_id) - - self.assertEqual(pm.disable.call_count, 1) - - self.assertEqual(agent._destroy_router_namespace.call_count, 1) - def test_destroy_router_namespace_skips_ns_removal(self): agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) agent._destroy_router_namespace("fakens") @@ -842,6 +793,7 @@ class TestBasicRouterOperations(base.BaseTestCase): self.conf.set_override('router_id', '1234') agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) self.assertEqual(['1234'], agent._router_ids()) + self.assertFalse(agent._delete_stale_namespaces) def test_process_routers_with_no_ext_net_in_conf(self): agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) @@ -940,6 +892,69 @@ class TestBasicRouterOperations(base.BaseTestCase): '-p tcp -m tcp --dport 8775 -j ACCEPT') self.assertEqual([rules], agent.metadata_filter_rules()) + def _cleanup_namespace_test(self, + stale_namespace_list, + router_list, + other_namespaces): + self.conf.set_override('router_delete_namespaces', True) + + good_namespace_list = [l3_agent.NS_PREFIX + r['id'] + for r in router_list] + self.mock_ip.get_namespaces.return_value = (stale_namespace_list + + good_namespace_list + + other_namespaces) + + agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) + + self.assertTrue(agent._delete_stale_namespaces) + + pm = self.external_process.return_value + pm.reset_mock() + + agent._destroy_router_namespace = mock.MagicMock() + agent._cleanup_namespaces(router_list) + + self.assertEqual(pm.disable.call_count, len(stale_namespace_list)) + self.assertEqual(agent._destroy_router_namespace.call_count, + len(stale_namespace_list)) + expected_args = [mock.call(ns) for ns in stale_namespace_list] + agent._destroy_router_namespace.assert_has_calls(expected_args, + any_order=True) + self.assertFalse(agent._delete_stale_namespaces) + + def test_cleanup_namespace(self): + self.conf.set_override('router_id', None) + stale_namespaces = [l3_agent.NS_PREFIX + 'foo', + l3_agent.NS_PREFIX + 'bar'] + other_namespaces = ['unknown'] + + self._cleanup_namespace_test(stale_namespaces, + [], + other_namespaces) + + def test_cleanup_namespace_with_registered_router_ids(self): + self.conf.set_override('router_id', None) + stale_namespaces = [l3_agent.NS_PREFIX + 'cccc', + l3_agent.NS_PREFIX + 'eeeee'] + router_list = [{'id': 'foo'}, {'id': 'aaaa'}] + other_namespaces = ['qdhcp-aabbcc', 'unknown'] + + self._cleanup_namespace_test(stale_namespaces, + router_list, + other_namespaces) + + def test_cleanup_namespace_with_conf_router_id(self): + self.conf.set_override('router_id', 'bbbbb') + stale_namespaces = [l3_agent.NS_PREFIX + 'cccc', + l3_agent.NS_PREFIX + 'eeeee', + l3_agent.NS_PREFIX + self.conf.router_id] + router_list = [{'id': 'foo'}, {'id': 'aaaa'}] + other_namespaces = ['qdhcp-aabbcc', 'unknown'] + + self._cleanup_namespace_test(stale_namespaces, + router_list, + other_namespaces) + class TestL3AgentEventHandler(base.BaseTestCase):