Option to remove routers from dead l3 agents
author     Kevin Benton <blak111@gmail.com>
           Wed, 30 Jul 2014 22:49:59 +0000 (15:49 -0700)
committer  Kevin Benton <blak111@gmail.com>
           Wed, 13 Aug 2014 17:26:22 +0000 (10:26 -0700)
Add a configuration-enabled periodic check that examines the
status of all L3 agents with routers scheduled to them and
admin_state_up set to True. If an agent is dead, its routers
are rescheduled to a live agent.

Neutron considers an agent 'dead' when the server does not
receive any heartbeat messages from the agent over the
RPC channel within a given number of seconds (agent_down_time).
There are various false-positive scenarios in which the agent
fails to report even though its node is still forwarding traffic.
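
To make the liveness rule concrete, here is a minimal sketch of that
check (is_agent_dead is a hypothetical name for illustration; the real
logic lives in neutron's agents_db module):

    import datetime

    def is_agent_dead(heartbeat_timestamp, agent_down_time, now=None):
        # dead if the last heartbeat is older than agent_down_time seconds
        now = now or datetime.datetime.utcnow()
        age = (now - heartbeat_timestamp).total_seconds()
        return age > agent_down_time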

This is configuration-driven because a dead L3 agent with active
namespaces still forwarding traffic and responding to ARP requests
can cause problems. If the network backend does not block the dead
agent's node from using the router's IP addresses, there will be
a conflict between the old and new namespaces.

This conflict should not break east-west traffic, because both
namespaces will be attached to the appropriate networks and
either can forward the traffic without state. However,
north-south traffic that is overloaded onto the router's
external network interface IP will be impacted, because the
matching port address translation entry will only be present
on one router. Additionally, floating IPs associated with ports
after the rescheduling will not work when traversing the old
namespace because the mapping will not be present there.

DocImpact

Partial-Bug: #1174591
Change-Id: Id7d487f54ca54fdd46b7616c0969319afc0bb589

etc/neutron.conf
neutron/db/l3_agentschedulers_db.py
neutron/services/l3_router/l3_router_plugin.py
neutron/tests/unit/openvswitch/test_agent_scheduler.py

diff --git a/etc/neutron.conf b/etc/neutron.conf
index eadcf5e1f93cebe2d26bd52902d71d1c6dbad777..f552c4325d0ccdf46796d730149c0cae62a028b9 100644
@@ -160,6 +160,10 @@ lock_path = $state_path/lock
 # routers to first L3 agent which sends sync_routers message to neutron server
 # router_auto_schedule = True
 
+# Allow automatic rescheduling of routers from dead L3 agents with
+# admin_state_up set to True to alive agents.
+# allow_automatic_l3agent_failover = False
+
 # Number of DHCP agents scheduled to host a network. This enables redundant
 # DHCP agents for configured networks.
 # dhcp_agents_per_network = 1
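
For operators, enabling the new behavior would look like this in
neutron.conf (an illustrative sketch; the option defaults to False, and
agent_down_time is shown only because the check's timing derives from it):

    [DEFAULT]
    # Reschedule routers away from L3 agents that stop heartbeating
    allow_automatic_l3agent_failover = True
    # Seconds without a heartbeat before an agent is considered down
    agent_down_time = 75
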
diff --git a/neutron/db/l3_agentschedulers_db.py b/neutron/db/l3_agentschedulers_db.py
index 3d6927218ee3fd4c7dbda3754b5e9f89528e84a1..fe9f6734c9c6b6e7bea7e14ee97d18384ebb483e 100644
@@ -12,6 +12,9 @@
 #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 #    License for the specific language governing permissions and limitations
 #    under the License.
+import datetime
+import random
+import time
 
 from oslo.config import cfg
 import sqlalchemy as sa
@@ -21,11 +24,19 @@ from sqlalchemy.orm import exc
 from sqlalchemy.orm import joinedload
 
 from neutron.common import constants
+from neutron import context as n_ctx
 from neutron.db import agents_db
 from neutron.db import agentschedulers_db
 from neutron.db import model_base
 from neutron.extensions import l3agentscheduler
 from neutron import manager
+from neutron.openstack.common.gettextutils import _LI, _LW
+from neutron.openstack.common import log as logging
+from neutron.openstack.common import loopingcall
+from neutron.openstack.common import timeutils
+
+
+LOG = logging.getLogger(__name__)
 
 L3_AGENTS_SCHEDULER_OPTS = [
     cfg.StrOpt('router_scheduler_driver',
@@ -34,6 +45,9 @@ L3_AGENTS_SCHEDULER_OPTS = [
                       'router to a default L3 agent')),
     cfg.BoolOpt('router_auto_schedule', default=True,
                 help=_('Allow auto scheduling of routers to L3 agent.')),
+    cfg.BoolOpt('allow_automatic_l3agent_failover', default=False,
+                help=_('Automatically reschedule routers from offline L3 '
+                       'agents to online L3 agents.')),
 ]
 
 cfg.CONF.register_opts(L3_AGENTS_SCHEDULER_OPTS)
@@ -59,6 +73,55 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
 
     router_scheduler = None
 
+    def start_periodic_agent_status_check(self):
+        if not cfg.CONF.allow_automatic_l3agent_failover:
+            LOG.info(_LI("Skipping periodic L3 agent status check because "
+                         "automatic router rescheduling is disabled."))
+            return
+
+        self.periodic_agent_loop = loopingcall.FixedIntervalLoopingCall(
+            self.reschedule_routers_from_down_agents)
+        interval = max(cfg.CONF.agent_down_time / 2, 1)
+        # add random initial delay to allow agents to check in after the
+        # neutron server first starts. random to offset multiple servers
+        self.periodic_agent_loop.start(interval=interval,
+            initial_delay=random.randint(interval, interval * 2))
+
+    def reschedule_routers_from_down_agents(self):
+        """Reschedule routers from down l3 agents if admin state is up."""
+
+        # give agents extra time to handle transient failures
+        agent_dead_limit = cfg.CONF.agent_down_time * 2
+
+        # check for an abrupt clock change since last check. if a change is
+        # detected, sleep for a while to let the agents check in.
+        tdelta = timeutils.utcnow() - getattr(self, '_clock_jump_canary',
+                                              timeutils.utcnow())
+        if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time:
+            LOG.warn(_LW("Time since last L3 agent reschedule check has "
+                         "exceeded the interval between checks. Waiting "
+                         "before check to allow agents to send a heartbeat "
+                         "in case there was a clock adjustment."))
+            time.sleep(agent_dead_limit)
+        self._clock_jump_canary = timeutils.utcnow()
+
+        context = n_ctx.get_admin_context()
+        cutoff = timeutils.utcnow() - datetime.timedelta(
+            seconds=agent_dead_limit)
+        down_bindings = (
+            context.session.query(RouterL3AgentBinding).
+            join(agents_db.Agent).
+            filter(agents_db.Agent.heartbeat_timestamp < cutoff,
+                   agents_db.Agent.admin_state_up))
+        for binding in down_bindings:
+            LOG.warn(_LW("Rescheduling router %(router)s from agent %(agent)s "
+                         "because the agent did not report to the server in "
+                         "the last %(dead_time)s seconds."),
+                     {'router': binding.router_id,
+                      'agent': binding.l3_agent_id,
+                      'dead_time': agent_dead_limit})
+            self.reschedule_router(context, binding.router_id)
+
     def add_router_to_l3_agent(self, context, agent_id, router_id):
         """Add a l3 agent to host a router."""
         router = self.get_router(context, router_id)
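
To make the timing concrete, here is a small standalone sketch of the
arithmetic the new method uses, assuming the default agent_down_time of
75 seconds (the value is configurable):

    import random

    agent_down_time = 75                     # cfg.CONF.agent_down_time
    # run the check twice per down window, but at least every second
    interval = max(agent_down_time // 2, 1)  # 37 seconds
    # random start so multiple servers do not all check at once
    initial_delay = random.randint(interval, interval * 2)  # 37-74 seconds
    # double the down time before declaring an agent dead
    agent_dead_limit = agent_down_time * 2   # 150 seconds

Both the doubled dead limit and the clock-jump canary err on the side of
not rescheduling: moving a router away from a live agent is more
disruptive than rescheduling a dead agent's router a little late.
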
diff --git a/neutron/services/l3_router/l3_router_plugin.py b/neutron/services/l3_router/l3_router_plugin.py
index 61614d684a06d1f8445575effabafc6fb1c15354..f4f4b779a26cac884bc400b8bad21a165aed6521 100644
@@ -64,6 +64,7 @@ class L3RouterPlugin(common_db_mixin.CommonDbMixin,
         self.setup_rpc()
         self.router_scheduler = importutils.import_object(
             cfg.CONF.router_scheduler_driver)
+        self.start_periodic_agent_status_check()
 
     def setup_rpc(self):
         # RPC support
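
The plugin wiring is a single constructor call; the mixin builds the loop
itself. For reference, a minimal sketch of the oslo-incubator looping-call
API it relies on (names as copied under neutron.openstack.common):

    from neutron.openstack.common import loopingcall

    def check():
        pass  # body of the periodic agent status check

    # run check() every 37 seconds after a 50 second initial delay
    loop = loopingcall.FixedIntervalLoopingCall(check)
    loop.start(interval=37, initial_delay=50)
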
diff --git a/neutron/tests/unit/openvswitch/test_agent_scheduler.py b/neutron/tests/unit/openvswitch/test_agent_scheduler.py
index c9057ed33639923e3ec9897bef3219f00ebd10ff..ab855b61c932279c3bf8602c77d7738fc056a1cb 100644
@@ -15,6 +15,7 @@
 
 import contextlib
 import copy
+import datetime
 
 import mock
 from oslo.config import cfg
@@ -27,6 +28,7 @@ from neutron.common import constants
 from neutron import context
 from neutron.db import agents_db
 from neutron.db import dhcp_rpc_base
+from neutron.db import l3_agentschedulers_db
 from neutron.db import l3_rpc_base
 from neutron.extensions import agent
 from neutron.extensions import dhcpagentscheduler
@@ -231,6 +233,9 @@ class OvsAgentSchedulerTestCaseBase(test_l3_plugin.L3NatTestCaseMixin,
         self.l3_notify_p = mock.patch(
             'neutron.extensions.l3agentscheduler.notify')
         self.patched_l3_notify = self.l3_notify_p.start()
+        self.l3_periodic_p = mock.patch('neutron.db.l3_agentschedulers_db'
+            '.L3AgentSchedulerDbMixin.start_periodic_agent_status_check')
+        self.patched_l3_periodic = self.l3_periodic_p.start()
         self.dhcp_notify_p = mock.patch(
             'neutron.extensions.dhcpagentscheduler.notify')
         self.patched_dhcp_notify = self.dhcp_notify_p.start()
@@ -617,6 +622,61 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase):
             self.assertEqual(port_list['ports'][0]['device_id'],
                              constants.DEVICE_ID_RESERVED_DHCP_PORT)
 
+    def _take_down_agent_and_run_reschedule(self, host):
+        # take down the agent on the given host and run the reschedule check
+        self.adminContext.session.begin(subtransactions=True)
+        query = self.adminContext.session.query(agents_db.Agent)
+        agt = query.filter_by(host=host).first()
+        agt.heartbeat_timestamp = (
+            agt.heartbeat_timestamp - datetime.timedelta(hours=1))
+        self.adminContext.session.commit()
+
+        plugin = manager.NeutronManager.get_service_plugins().get(
+            service_constants.L3_ROUTER_NAT)
+
+        plugin.reschedule_routers_from_down_agents()
+
+    def _set_agent_admin_state_up(self, host, state):
+        self.adminContext.session.begin(subtransactions=True)
+        query = self.adminContext.session.query(agents_db.Agent)
+        agt_db = query.filter_by(host=host).first()
+        agt_db.admin_state_up = state
+        self.adminContext.session.commit()
+
+    def test_router_reschedule_from_dead_agent(self):
+        with self.router():
+            l3_rpc = l3_rpc_base.L3RpcCallbackMixin()
+            self._register_agent_states()
+
+            # schedule the router to host A
+            ret_a = l3_rpc.sync_routers(self.adminContext, host=L3_HOSTA)
+            self._take_down_agent_and_run_reschedule(L3_HOSTA)
+
+            # B should now pick up the router
+            ret_b = l3_rpc.sync_routers(self.adminContext, host=L3_HOSTB)
+        self.assertEqual(ret_b, ret_a)
+
+    def test_router_no_reschedule_from_dead_admin_down_agent(self):
+        with self.router() as r:
+            l3_rpc = l3_rpc_base.L3RpcCallbackMixin()
+            self._register_agent_states()
+
+            # schedule the router to host A
+            l3_rpc.sync_routers(self.adminContext, host=L3_HOSTA)
+            self._set_agent_admin_state_up(L3_HOSTA, False)
+            self._take_down_agent_and_run_reschedule(L3_HOSTA)
+
+            # A should still have it even though it was inactive due to the
+            # admin_state being down
+            rab = l3_agentschedulers_db.RouterL3AgentBinding
+            binding = (self.adminContext.session.query(rab).
+                       filter(rab.router_id == r['router']['id']).first())
+            self.assertEqual(binding.l3_agent.host, L3_HOSTA)
+
+            # B should not pick up the router
+            ret_b = l3_rpc.sync_routers(self.adminContext, host=L3_HOSTB)
+            self.assertFalse(ret_b)
+
     def test_router_auto_schedule_with_invalid_router(self):
         with self.router() as router:
             l3_rpc = l3_rpc_base.L3RpcCallbackMixin()