]> review.fuel-infra Code Review - openstack-build/neutron-build.git/commitdiff
Catch exceptions in router rescheduler
authorKevin Benton <blak111@gmail.com>
Tue, 30 Sep 2014 02:33:06 +0000 (19:33 -0700)
committerKevin Benton <blak111@gmail.com>
Tue, 14 Oct 2014 09:16:16 +0000 (02:16 -0700)
Catch and log exceptions in router rescheduling loop
rather than just dying which would stop all future
router rescheduling attempts.

This prevents transient DB connectivity issues from
permanently breaking the rescheduler until the process
restarts.

Closes-Bug: #1375597
Change-Id: I2ab37847074fa6bbdd2b13fd03b8742996dcfc78

neutron/db/l3_agentschedulers_db.py
neutron/tests/unit/openvswitch/test_agent_scheduler.py

index da827895073a3277edc4bb05c195ece327527555..bef3119765ab8bd27269397fec6f1a7f344a9e62 100644 (file)
@@ -26,6 +26,7 @@ from sqlalchemy.orm import joinedload
 from sqlalchemy import sql
 
 from neutron.common import constants
+from neutron.common import rpc as n_rpc
 from neutron.common import utils as n_utils
 from neutron import context as n_ctx
 from neutron.db import agents_db
@@ -34,7 +35,7 @@ from neutron.db import l3_attrs_db
 from neutron.db import model_base
 from neutron.extensions import l3agentscheduler
 from neutron import manager
-from neutron.openstack.common.gettextutils import _LI, _LW
+from neutron.openstack.common.gettextutils import _LE, _LI, _LW
 from neutron.openstack.common import log as logging
 from neutron.openstack.common import loopingcall
 from neutron.openstack.common import timeutils
@@ -122,15 +123,28 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
                       RouterL3AgentBinding.router_id).
             filter(sa.or_(l3_attrs_db.RouterExtraAttributes.ha == sql.false(),
                           l3_attrs_db.RouterExtraAttributes.ha == sql.null())))
-
-        for binding in down_bindings:
-            LOG.warn(_LW("Rescheduling router %(router)s from agent %(agent)s "
-                         "because the agent did not report to the server in "
-                         "the last %(dead_time)s seconds."),
-                     {'router': binding.router_id,
-                      'agent': binding.l3_agent_id,
-                      'dead_time': agent_dead_limit})
-            self.reschedule_router(context, binding.router_id)
+        try:
+            for binding in down_bindings:
+                LOG.warn(_LW(
+                    "Rescheduling router %(router)s from agent %(agent)s "
+                    "because the agent did not report to the server in "
+                    "the last %(dead_time)s seconds."),
+                    {'router': binding.router_id,
+                     'agent': binding.l3_agent_id,
+                     'dead_time': agent_dead_limit})
+                try:
+                    self.reschedule_router(context, binding.router_id)
+                except (l3agentscheduler.RouterReschedulingFailed,
+                        n_rpc.RemoteError):
+                    # Catch individual router rescheduling errors here
+                    # so one broken one doesn't stop the iteration.
+                    LOG.exception(_LE("Failed to reschedule router %s"),
+                                  binding.router_id)
+        except db_exc.DBError:
+            # Catch DB errors here so a transient DB connectivity issue
+            # doesn't stop the loopingcall.
+            LOG.exception(_LE("Exception encountered during router "
+                              "rescheduling."))
 
     def validate_agent_router_combination(self, context, agent, router):
         """Validate if the router can be correctly assigned to the agent.
index ed4f44023b9156dbf9f54213a6fcad84375e2900..bf5407265882698d26bfe07e0ddd85e2182b6b41 100644 (file)
@@ -19,6 +19,7 @@ import datetime
 
 import mock
 from oslo.config import cfg
+from oslo.db import exception as db_exc
 from webob import exc
 
 from neutron.api import extensions
@@ -27,6 +28,7 @@ from neutron.api.rpc.handlers import dhcp_rpc
 from neutron.api.rpc.handlers import l3_rpc
 from neutron.api.v2 import attributes
 from neutron.common import constants
+from neutron.common import rpc as n_rpc
 from neutron import context
 from neutron.db import agents_db
 from neutron.db import l3_agentschedulers_db
@@ -648,6 +650,53 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase):
         agt_db.admin_state_up = state
         self.adminContext.session.commit()
 
+    def test_router_rescheduler_catches_rpc_db_and_reschedule_exceptions(self):
+        with self.router():
+            l3_rpc_cb = l3_rpc.L3RpcCallback()
+            self._register_agent_states()
+            # schedule the router to host A
+            l3_rpc_cb.sync_routers(self.adminContext, host=L3_HOSTA)
+
+            plugin = manager.NeutronManager.get_service_plugins().get(
+                service_constants.L3_ROUTER_NAT)
+            mock.patch.object(
+                plugin, 'reschedule_router',
+                side_effect=[
+                    db_exc.DBError(), n_rpc.RemoteError(),
+                    l3agentscheduler.RouterReschedulingFailed(router_id='f',
+                                                              agent_id='f'),
+                    ValueError('this raises')
+                ]).start()
+            # these first three should not raise any errors
+            self._take_down_agent_and_run_reschedule(L3_HOSTA)  # DBError
+            self._take_down_agent_and_run_reschedule(L3_HOSTA)  # RemoteError
+            self._take_down_agent_and_run_reschedule(L3_HOSTA)  # schedule err
+
+            # ValueError is not caught so it should raise
+            self.assertRaises(ValueError,
+                              self._take_down_agent_and_run_reschedule,
+                              L3_HOSTA)
+
+    def test_router_rescheduler_iterates_after_reschedule_failure(self):
+        plugin = manager.NeutronManager.get_service_plugins().get(
+            service_constants.L3_ROUTER_NAT)
+        l3_rpc_cb = l3_rpc.L3RpcCallback()
+        self._register_agent_states()
+        with contextlib.nested(self.router(), self.router()) as (r1, r2):
+            # schedule the routers to host A
+            l3_rpc_cb.sync_routers(self.adminContext, host=L3_HOSTA)
+
+            rs_mock = mock.patch.object(
+                plugin, 'reschedule_router',
+                side_effect=l3agentscheduler.RouterReschedulingFailed(
+                    router_id='f', agent_id='f'),
+            ).start()
+            self._take_down_agent_and_run_reschedule(L3_HOSTA)
+            # make sure both had a reschedule attempt even though first failed
+            rs_mock.assert_has_calls([mock.call(mock.ANY, r1['router']['id']),
+                                      mock.call(mock.ANY, r2['router']['id'])],
+                                     any_order=True)
+
     def test_router_is_not_rescheduled_from_alive_agent(self):
         with self.router():
             l3_rpc_cb = l3_rpc.L3RpcCallback()