From ae8c1c5f80fd4fb7b4ab116677f4cff988c67cf1 Mon Sep 17 00:00:00 2001
From: Eugene Nikanorov
Date: Tue, 26 May 2015 20:17:20 +0400
Subject: [PATCH] Catch broad exception in methods used in FixedIntervalLoopingCall

Unlike other places where it might make sense to catch specific
exceptions, methods that are used to check L3 and DHCP agent liveness
via FixedIntervalLoopingCall should never allow exceptions to leak to
the calling method and interrupt the loop.

Further improvement of FixedIntervalLoopingCall might be needed, but
for the sake of easy backporting it makes sense to fix the issue in
neutron before pushing the refactoring to the 3rd-party library.

Change-Id: I6a61e99a6f4e445e26ea4a9923b47e35559e5703
Closes-Bug: #1458119
---
 neutron/db/agentschedulers_db.py                | 73 ++++++++++---------
 neutron/db/l3_agentschedulers_db.py             |  6 +-
 .../openvswitch/test_agent_scheduler.py         | 11 +--
 .../scheduler/test_dhcp_agent_scheduler.py      |  9 +++
 4 files changed, 56 insertions(+), 43 deletions(-)

diff --git a/neutron/db/agentschedulers_db.py b/neutron/db/agentschedulers_db.py
index bac33a78f..61eff9b07 100644
--- a/neutron/db/agentschedulers_db.py
+++ b/neutron/db/agentschedulers_db.py
@@ -270,40 +270,47 @@ class DhcpAgentSchedulerDbMixin(dhcpagentscheduler
                    agents_db.Agent.admin_state_up))
         dhcp_notifier = self.agent_notifiers.get(constants.AGENT_TYPE_DHCP)
 
-        for binding in self._filter_bindings(context, down_bindings):
-            LOG.warn(_LW("Removing network %(network)s from agent %(agent)s "
-                         "because the agent did not report to the server in "
-                         "the last %(dead_time)s seconds."),
-                     {'network': binding.network_id,
-                      'agent': binding.dhcp_agent_id,
-                      'dead_time': agent_dead_limit})
-            # save binding object to avoid ObjectDeletedError
-            # in case binding is concurrently deleted from the DB
-            saved_binding = {'net': binding.network_id,
-                             'agent': binding.dhcp_agent_id}
-            try:
-                # do not notify agent if it considered dead
-                # so when it is restarted it won't see network delete
-                # notifications on its queue
-                self.remove_network_from_dhcp_agent(context,
-                                                    binding.dhcp_agent_id,
-                                                    binding.network_id,
-                                                    notify=False)
-            except dhcpagentscheduler.NetworkNotHostedByDhcpAgent:
-                # measures against concurrent operation
-                LOG.debug("Network %(net)s already removed from DHCP agent "
-                          "%(agent)s",
-                          saved_binding)
-                # still continue and allow concurrent scheduling attempt
-            except Exception:
-                LOG.exception(_LE("Unexpected exception occurred while "
-                                  "removing network %(net)s from agent "
-                                  "%(agent)s"),
+        try:
+            for binding in self._filter_bindings(context, down_bindings):
+                LOG.warn(_LW("Removing network %(network)s from agent "
+                             "%(agent)s because the agent did not report "
+                             "to the server in the last %(dead_time)s "
+                             "seconds."),
+                         {'network': binding.network_id,
+                          'agent': binding.dhcp_agent_id,
+                          'dead_time': agent_dead_limit})
+                # save binding object to avoid ObjectDeletedError
+                # in case binding is concurrently deleted from the DB
+                saved_binding = {'net': binding.network_id,
+                                 'agent': binding.dhcp_agent_id}
+                try:
+                    # do not notify agent if it considered dead
+                    # so when it is restarted it won't see network delete
+                    # notifications on its queue
+                    self.remove_network_from_dhcp_agent(context,
+                                                        binding.dhcp_agent_id,
+                                                        binding.network_id,
+                                                        notify=False)
+                except dhcpagentscheduler.NetworkNotHostedByDhcpAgent:
+                    # measures against concurrent operation
+                    LOG.debug("Network %(net)s already removed from DHCP "
+                              "agent %(agent)s",
                               saved_binding)
-
-            if cfg.CONF.network_auto_schedule:
-                self._schedule_network(
-                    context, saved_binding['net'], dhcp_notifier)
+                    # still continue and allow concurrent scheduling attempt
+                except Exception:
+                    LOG.exception(_LE("Unexpected exception occurred while "
+                                      "removing network %(net)s from agent "
+                                      "%(agent)s"),
+                                  saved_binding)
+
+                if cfg.CONF.network_auto_schedule:
+                    self._schedule_network(
+                        context, saved_binding['net'], dhcp_notifier)
+        except Exception:
+            # we want to be thorough and catch whatever is raised
+            # to avoid loop abortion
+            LOG.exception(_LE("Exception encountered during network "
+                              "rescheduling"))
 
     def get_dhcp_agents_hosting_networks(
             self, context, network_ids, active=None, admin_state_up=None):
diff --git a/neutron/db/l3_agentschedulers_db.py b/neutron/db/l3_agentschedulers_db.py
index f661dcc62..59c397718 100644
--- a/neutron/db/l3_agentschedulers_db.py
+++ b/neutron/db/l3_agentschedulers_db.py
@@ -116,9 +116,9 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
                 # so one broken one doesn't stop the iteration.
                 LOG.exception(_LE("Failed to reschedule router %s"),
                               binding.router_id)
-            except db_exc.DBError:
-                # Catch DB errors here so a transient DB connectivity issue
-                # doesn't stop the loopingcall.
+            except Exception:
+                # we want to be thorough and catch whatever is raised
+                # to avoid loop abortion
                 LOG.exception(_LE("Exception encountered during router "
                                   "rescheduling."))
 
diff --git a/neutron/tests/unit/plugins/openvswitch/test_agent_scheduler.py b/neutron/tests/unit/plugins/openvswitch/test_agent_scheduler.py
index 2682014d0..5f14037fd 100644
--- a/neutron/tests/unit/plugins/openvswitch/test_agent_scheduler.py
+++ b/neutron/tests/unit/plugins/openvswitch/test_agent_scheduler.py
@@ -674,17 +674,14 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase):
             db_exc.DBError(), oslo_messaging.RemoteError(),
             l3agentscheduler.RouterReschedulingFailed(router_id='f',
                                                       agent_id='f'),
-            ValueError('this raises')
+            ValueError('this raises'),
+            Exception()
         ]).start()
-        # these first three should not raise any errors
         self._take_down_agent_and_run_reschedule(L3_HOSTA)  # DBError
         self._take_down_agent_and_run_reschedule(L3_HOSTA)  # RemoteError
         self._take_down_agent_and_run_reschedule(L3_HOSTA)  # schedule err
-
-        # ValueError is not caught so it should raise
-        self.assertRaises(ValueError,
-                          self._take_down_agent_and_run_reschedule,
-                          L3_HOSTA)
+        self._take_down_agent_and_run_reschedule(L3_HOSTA)  # Value error
+        self._take_down_agent_and_run_reschedule(L3_HOSTA)  # Exception
 
     def test_router_rescheduler_iterates_after_reschedule_failure(self):
         plugin = manager.NeutronManager.get_service_plugins().get(
diff --git a/neutron/tests/unit/scheduler/test_dhcp_agent_scheduler.py b/neutron/tests/unit/scheduler/test_dhcp_agent_scheduler.py
index 9faa8ab5b..5ee1adb16 100644
--- a/neutron/tests/unit/scheduler/test_dhcp_agent_scheduler.py
+++ b/neutron/tests/unit/scheduler/test_dhcp_agent_scheduler.py
@@ -248,6 +248,15 @@ class TestNetworksFailover(TestDhcpSchedulerBaseTestCase,
         self.assertIn('foo3', res_ids)
         self.assertIn('foo4', res_ids)
 
+    def test_reschedule_network_from_down_agent_failed_on_unexpected(self):
+        agents = self._create_and_set_agents_down(['host-a'], 1)
+        self._test_schedule_bind_network([agents[0]], self.network_id)
+        with mock.patch.object(
+                self, '_filter_bindings',
+                side_effect=Exception()):
+            # just make sure that no exception is raised
+            self.remove_networks_from_down_agents()
+
 
 class DHCPAgentWeightSchedulerTestCase(TestDhcpSchedulerBaseTestCase):
     """Unit test scenarios for WeightScheduler.schedule."""
-- 
2.45.2
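
Editor's sketch of the pattern this patch relies on: a FixedIntervalLoopingCall-style
periodic task dies the first time its callback lets an exception escape, so the
callback catches broadly and logs instead. The SimpleLoopingCall class and the
check_agents() callback below are hypothetical stand-ins written for illustration
only; they are not Neutron or oslo code.

    import logging
    import time

    logging.basicConfig(level=logging.INFO)
    LOG = logging.getLogger(__name__)


    class SimpleLoopingCall(object):
        """Stand-in for a fixed-interval looping call: invokes func repeatedly;
        an exception escaping func ends the loop, as with the real looping call."""

        def __init__(self, func):
            self.func = func

        def start(self, interval, max_runs=3):
            for _ in range(max_runs):   # the real loop runs until stopped
                self.func()             # an uncaught exception aborts the loop here
                time.sleep(interval)


    def check_agents():
        """Periodic health check in the spirit of the patched scheduler methods."""
        try:
            # ... look up dead agents and reschedule their networks/routers ...
            raise RuntimeError("transient DB hiccup")  # simulated failure
        except Exception:
            # Broad catch, mirroring the patch: log and let the next iteration
            # retry instead of killing the periodic task.
            LOG.exception("Exception encountered during rescheduling")


    SimpleLoopingCall(check_agents).start(interval=0.1)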