review.fuel-infra Code Review - openstack-build/neutron-build.git/commitdiff
Add periodic agents health check.
author: Eugene Nikanorov <enikanorov@mirantis.com>
Wed, 6 May 2015 21:06:09 +0000 (01:06 +0400)
committer: enikanorov <enikanorov@mirantis.com>
Thu, 1 Oct 2015 09:21:54 +0000 (09:21 +0000)
In addition to the periodic checks of L3 and DHCP agents,
add a periodic check of the overall health of registered agents.
Log the total count of agents at debug level so it can be
seen in the neutron-server logs.
If some agents are found dead, log detailed info about them:
agent type, last heartbeat, and host.

Change-Id: I5db81dad4e9e8325ad3fa3a3e6d5d2d0deb297dd
Closes-Bug: #1453320

neutron/db/agents_db.py
neutron/db/agentschedulers_db.py
neutron/db/l3_agentschedulers_db.py
neutron/plugins/ml2/plugin.py
neutron/tests/base.py
neutron/tests/unit/db/test_agents_db.py

index 9417d5e3c37d407f35cb406dea9c1d7105b140ec..453a858feed97bae7abd6961b824eecfeefb0ff0 100644 (file)
@@ -26,6 +26,7 @@ from sqlalchemy import sql
 
 from neutron.api.v2 import attributes
 from neutron.common import constants
+from neutron import context
 from neutron.db import model_base
 from neutron.db import models_v2
 from neutron.extensions import agent as ext_agent
@@ -191,6 +192,26 @@ class AgentDbMixin(ext_agent.AgentPluginBase):
             agents = [agent for agent in agents if agent['alive'] == alive]
         return agents
 
+    def agent_health_check(self):
+        """Scan agents and log if some are considered dead."""
+        agents = self.get_agents(context.get_admin_context(),
+                                 filters={'admin_state_up': [True]})
+        dead_agents = [agent for agent in agents if not agent['alive']]
+        if dead_agents:
+            data = '%20s %20s %s\n' % ('Type', 'Last heartbeat', "host")
+            data += '\n'.join(['%20s %20s %s' %
+                               (agent['agent_type'],
+                                agent['heartbeat_timestamp'],
+                                agent['host']) for agent in dead_agents])
+            LOG.warn(_LW("Agent healthcheck: found %(count)s dead agents "
+                         "out of %(total)s:\n%(data)s"),
+                     {'count': len(dead_agents),
+                      'total': len(agents),
+                      'data': data})
+        else:
+            LOG.debug("Agent healthcheck: found %s active agents",
+                      len(agents))
+
     def _get_agent_by_type_and_host(self, context, agent_type, host):
         query = self._model_query(context, Agent)
         try:
index 591db0d29725dae0dd4b26f7b758701020b2288b..924cdb41699ba5f846ec3330b369640ab0cdc317 100644 (file)
@@ -118,16 +118,19 @@ class AgentSchedulerDbMixin(agents_db.AgentDbMixin):
                                          original_agent['host'])
         return result
 
-    def setup_agent_status_check(self, function):
-        self.periodic_agent_loop = loopingcall.FixedIntervalLoopingCall(
-            function)
+    def add_agent_status_check(self, function):
+        loop = loopingcall.FixedIntervalLoopingCall(function)
         # TODO(enikanorov): make interval configurable rather than computed
         interval = max(cfg.CONF.agent_down_time // 2, 1)
         # add random initial delay to allow agents to check in after the
         # neutron server first starts. random to offset multiple servers
         initial_delay = random.randint(interval, interval * 2)
-        self.periodic_agent_loop.start(interval=interval,
-            initial_delay=initial_delay)
+        loop.start(interval=interval, initial_delay=initial_delay)
+
+        if hasattr(self, 'periodic_agent_loops'):
+            self.periodic_agent_loops.append(loop)
+        else:
+            self.periodic_agent_loops = [loop]
 
     def agent_dead_limit_seconds(self):
         return cfg.CONF.agent_down_time * 2
@@ -166,7 +169,7 @@ class DhcpAgentSchedulerDbMixin(dhcpagentscheduler
                          "automatic network rescheduling is disabled."))
             return
 
-        self.setup_agent_status_check(self.remove_networks_from_down_agents)
+        self.add_agent_status_check(self.remove_networks_from_down_agents)
 
     def is_eligible_agent(self, context, active, agent):
         # eligible agent is active or starting up
index 0accdd7db7f815be4d8ffd1c0a54e162988c3cb0..4ccde0bdaf5f35d4fe0fdaf84029055b7df151af 100644 (file)
@@ -82,7 +82,7 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
                          "automatic router rescheduling is disabled."))
             return
 
-        self.setup_agent_status_check(
+        self.add_agent_status_check(
             self.reschedule_routers_from_down_agents)
 
     def reschedule_routers_from_down_agents(self):
index a8a406b05d381ade979e5d6239421d008e25c6d9..3a1e64fe544c03cc7e644fe7e6c83e3c3224e4f5 100644 (file)
@@ -148,6 +148,7 @@ class Ml2Plugin(db_base_plugin_v2.NeutronDbPluginV2,
         self.mechanism_manager.initialize()
         self._setup_dhcp()
         self._start_rpc_notifiers()
+        self.add_agent_status_check(self.agent_health_check)
         LOG.info(_LI("Modular L2 Plugin initialization complete"))
 
     def _setup_rpc(self):
index cd79f3eebbf6bf67e275ca5abb551dca405651dd..d7dd976b6dba4c1919858e7f28685a4aaa5f7829 100644 (file)
@@ -409,6 +409,10 @@ class PluginFixture(fixtures.Fixture):
             'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
             'start_periodic_dhcp_agent_status_check')
         self.patched_dhcp_periodic = self.dhcp_periodic_p.start()
+        self.agent_health_check_p = mock.patch(
+            'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
+            'add_agent_status_check')
+        self.agent_health_check = self.agent_health_check_p.start()
         # Plugin cleanup should be triggered last so that
         # test-specific cleanup has a chance to release references.
         self.addCleanup(self.cleanup_core_plugin)
index 3aeea2b3ab4ef8971709639971ace5d40f128509..cabae43159ec3df0ff41229e42393db35cd3c7d9 100644 (file)
@@ -161,6 +161,27 @@ class TestAgentsDbMixin(TestAgentsDbBase):
         agent = self.plugin.get_agents(self.context)[0]
         self.assertFalse(agent['admin_state_up'])
 
+    def test_agent_health_check(self):
+        agents = [{'agent_type': "DHCP Agent",
+                   'heartbeat_timestamp': '2015-05-06 22:40:40.432295',
+                   'host': 'some.node',
+                   'alive': True}]
+        with mock.patch.object(self.plugin, 'get_agents',
+                               return_value=agents),\
+                mock.patch.object(agents_db.LOG, 'warn') as warn,\
+                mock.patch.object(agents_db.LOG, 'debug') as debug:
+            self.plugin.agent_health_check()
+            self.assertTrue(debug.called)
+            self.assertFalse(warn.called)
+            agents[0]['alive'] = False
+            self.plugin.agent_health_check()
+            warn.assert_called_once_with(
+                mock.ANY,
+                {'count': 1, 'total': 1,
+                 'data': "                Type       Last heartbeat host\n"
+                 "          DHCP Agent 2015-05-06 22:40:40.432295 some.node"}
+            )
+
 
 class TestAgentsDbGetAgents(TestAgentsDbBase):
     scenarios = [