From: Kevin Benton Date: Tue, 30 Sep 2014 02:33:06 +0000 (-0700) Subject: Catch exceptions in router rescheduler X-Git-Url: https://review.fuel-infra.org/gitweb?a=commitdiff_plain;h=79f1e8a9c1f308586077483d849e66dcdc83144f;p=openstack-build%2Fneutron-build.git Catch exceptions in router rescheduler Catch and log exceptions in router rescheduling loop rather than just dying which would stop all future router rescheduling attempts. This prevents transient DB connectivity issues from permanently breaking the rescheduler until the process restarts. Closes-Bug: #1375597 Change-Id: I2ab37847074fa6bbdd2b13fd03b8742996dcfc78 --- diff --git a/neutron/db/l3_agentschedulers_db.py b/neutron/db/l3_agentschedulers_db.py index da8278950..bef311976 100644 --- a/neutron/db/l3_agentschedulers_db.py +++ b/neutron/db/l3_agentschedulers_db.py @@ -26,6 +26,7 @@ from sqlalchemy.orm import joinedload from sqlalchemy import sql from neutron.common import constants +from neutron.common import rpc as n_rpc from neutron.common import utils as n_utils from neutron import context as n_ctx from neutron.db import agents_db @@ -34,7 +35,7 @@ from neutron.db import l3_attrs_db from neutron.db import model_base from neutron.extensions import l3agentscheduler from neutron import manager -from neutron.openstack.common.gettextutils import _LI, _LW +from neutron.openstack.common.gettextutils import _LE, _LI, _LW from neutron.openstack.common import log as logging from neutron.openstack.common import loopingcall from neutron.openstack.common import timeutils @@ -122,15 +123,28 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase, RouterL3AgentBinding.router_id). filter(sa.or_(l3_attrs_db.RouterExtraAttributes.ha == sql.false(), l3_attrs_db.RouterExtraAttributes.ha == sql.null()))) - - for binding in down_bindings: - LOG.warn(_LW("Rescheduling router %(router)s from agent %(agent)s " - "because the agent did not report to the server in " - "the last %(dead_time)s seconds."), - {'router': binding.router_id, - 'agent': binding.l3_agent_id, - 'dead_time': agent_dead_limit}) - self.reschedule_router(context, binding.router_id) + try: + for binding in down_bindings: + LOG.warn(_LW( + "Rescheduling router %(router)s from agent %(agent)s " + "because the agent did not report to the server in " + "the last %(dead_time)s seconds."), + {'router': binding.router_id, + 'agent': binding.l3_agent_id, + 'dead_time': agent_dead_limit}) + try: + self.reschedule_router(context, binding.router_id) + except (l3agentscheduler.RouterReschedulingFailed, + n_rpc.RemoteError): + # Catch individual router rescheduling errors here + # so one broken one doesn't stop the iteration. + LOG.exception(_LE("Failed to reschedule router %s"), + binding.router_id) + except db_exc.DBError: + # Catch DB errors here so a transient DB connectivity issue + # doesn't stop the loopingcall. + LOG.exception(_LE("Exception encountered during router " + "rescheduling.")) def validate_agent_router_combination(self, context, agent, router): """Validate if the router can be correctly assigned to the agent. diff --git a/neutron/tests/unit/openvswitch/test_agent_scheduler.py b/neutron/tests/unit/openvswitch/test_agent_scheduler.py index ed4f44023..bf5407265 100644 --- a/neutron/tests/unit/openvswitch/test_agent_scheduler.py +++ b/neutron/tests/unit/openvswitch/test_agent_scheduler.py @@ -19,6 +19,7 @@ import datetime import mock from oslo.config import cfg +from oslo.db import exception as db_exc from webob import exc from neutron.api import extensions @@ -27,6 +28,7 @@ from neutron.api.rpc.handlers import dhcp_rpc from neutron.api.rpc.handlers import l3_rpc from neutron.api.v2 import attributes from neutron.common import constants +from neutron.common import rpc as n_rpc from neutron import context from neutron.db import agents_db from neutron.db import l3_agentschedulers_db @@ -648,6 +650,53 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase): agt_db.admin_state_up = state self.adminContext.session.commit() + def test_router_rescheduler_catches_rpc_db_and_reschedule_exceptions(self): + with self.router(): + l3_rpc_cb = l3_rpc.L3RpcCallback() + self._register_agent_states() + # schedule the router to host A + l3_rpc_cb.sync_routers(self.adminContext, host=L3_HOSTA) + + plugin = manager.NeutronManager.get_service_plugins().get( + service_constants.L3_ROUTER_NAT) + mock.patch.object( + plugin, 'reschedule_router', + side_effect=[ + db_exc.DBError(), n_rpc.RemoteError(), + l3agentscheduler.RouterReschedulingFailed(router_id='f', + agent_id='f'), + ValueError('this raises') + ]).start() + # these first three should not raise any errors + self._take_down_agent_and_run_reschedule(L3_HOSTA) # DBError + self._take_down_agent_and_run_reschedule(L3_HOSTA) # RemoteError + self._take_down_agent_and_run_reschedule(L3_HOSTA) # schedule err + + # ValueError is not caught so it should raise + self.assertRaises(ValueError, + self._take_down_agent_and_run_reschedule, + L3_HOSTA) + + def test_router_rescheduler_iterates_after_reschedule_failure(self): + plugin = manager.NeutronManager.get_service_plugins().get( + service_constants.L3_ROUTER_NAT) + l3_rpc_cb = l3_rpc.L3RpcCallback() + self._register_agent_states() + with contextlib.nested(self.router(), self.router()) as (r1, r2): + # schedule the routers to host A + l3_rpc_cb.sync_routers(self.adminContext, host=L3_HOSTA) + + rs_mock = mock.patch.object( + plugin, 'reschedule_router', + side_effect=l3agentscheduler.RouterReschedulingFailed( + router_id='f', agent_id='f'), + ).start() + self._take_down_agent_and_run_reschedule(L3_HOSTA) + # make sure both had a reschedule attempt even though first failed + rs_mock.assert_has_calls([mock.call(mock.ANY, r1['router']['id']), + mock.call(mock.ANY, r2['router']['id'])], + any_order=True) + def test_router_is_not_rescheduled_from_alive_agent(self): with self.router(): l3_rpc_cb = l3_rpc.L3RpcCallback()