From: Eugene Nikanorov Date: Wed, 6 May 2015 21:06:09 +0000 (+0400) Subject: Add periodic agents health check. X-Git-Url: https://review.fuel-infra.org/gitweb?a=commitdiff_plain;h=8ee51f253cac950787273ca117611b4660ed4523;p=openstack-build%2Fneutron-build.git Add periodic agents health check. In addition to periodic checks of L3 and DHCP agents, add periodic checks of the overall health of registered agents. Log the total count of agents at debug level so it can be seen in the neutron-server logs. If some agents are found dead, log detailed info about them: type of agent, last heartbeat, host. Change-Id: I5db81dad4e9e8325ad3fa3a3e6d5d2d0deb297dd Closes-Bug: #1453320 --- diff --git a/neutron/db/agents_db.py b/neutron/db/agents_db.py index 9417d5e3c..453a858fe 100644 --- a/neutron/db/agents_db.py +++ b/neutron/db/agents_db.py @@ -26,6 +26,7 @@ from sqlalchemy import sql from neutron.api.v2 import attributes from neutron.common import constants +from neutron import context from neutron.db import model_base from neutron.db import models_v2 from neutron.extensions import agent as ext_agent @@ -191,6 +192,26 @@ class AgentDbMixin(ext_agent.AgentPluginBase): agents = [agent for agent in agents if agent['alive'] == alive] return agents + def agent_health_check(self): + """Scan agents and log if some are considered dead.""" + agents = self.get_agents(context.get_admin_context(), + filters={'admin_state_up': [True]}) + dead_agents = [agent for agent in agents if not agent['alive']] + if dead_agents: + data = '%20s %20s %s\n' % ('Type', 'Last heartbeat', "host") + data += '\n'.join(['%20s %20s %s' % + (agent['agent_type'], + agent['heartbeat_timestamp'], + agent['host']) for agent in dead_agents]) + LOG.warn(_LW("Agent healthcheck: found %(count)s dead agents " + "out of %(total)s:\n%(data)s"), + {'count': len(dead_agents), + 'total': len(agents), + 'data': data}) + else: + LOG.debug("Agent healthcheck: found %s active agents", + len(agents)) + def _get_agent_by_type_and_host(self, 
context, agent_type, host): query = self._model_query(context, Agent) try: diff --git a/neutron/db/agentschedulers_db.py b/neutron/db/agentschedulers_db.py index 591db0d29..924cdb416 100644 --- a/neutron/db/agentschedulers_db.py +++ b/neutron/db/agentschedulers_db.py @@ -118,16 +118,19 @@ class AgentSchedulerDbMixin(agents_db.AgentDbMixin): original_agent['host']) return result - def setup_agent_status_check(self, function): - self.periodic_agent_loop = loopingcall.FixedIntervalLoopingCall( - function) + def add_agent_status_check(self, function): + loop = loopingcall.FixedIntervalLoopingCall(function) # TODO(enikanorov): make interval configurable rather than computed interval = max(cfg.CONF.agent_down_time // 2, 1) # add random initial delay to allow agents to check in after the # neutron server first starts. random to offset multiple servers initial_delay = random.randint(interval, interval * 2) - self.periodic_agent_loop.start(interval=interval, - initial_delay=initial_delay) + loop.start(interval=interval, initial_delay=initial_delay) + + if hasattr(self, 'periodic_agent_loops'): + self.periodic_agent_loops.append(loop) + else: + self.periodic_agent_loops = [loop] def agent_dead_limit_seconds(self): return cfg.CONF.agent_down_time * 2 @@ -166,7 +169,7 @@ class DhcpAgentSchedulerDbMixin(dhcpagentscheduler "automatic network rescheduling is disabled.")) return - self.setup_agent_status_check(self.remove_networks_from_down_agents) + self.add_agent_status_check(self.remove_networks_from_down_agents) def is_eligible_agent(self, context, active, agent): # eligible agent is active or starting up diff --git a/neutron/db/l3_agentschedulers_db.py b/neutron/db/l3_agentschedulers_db.py index 0accdd7db..4ccde0bda 100644 --- a/neutron/db/l3_agentschedulers_db.py +++ b/neutron/db/l3_agentschedulers_db.py @@ -82,7 +82,7 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase, "automatic router rescheduling is disabled.")) return - 
self.setup_agent_status_check( + self.add_agent_status_check( self.reschedule_routers_from_down_agents) def reschedule_routers_from_down_agents(self): diff --git a/neutron/plugins/ml2/plugin.py b/neutron/plugins/ml2/plugin.py index a8a406b05..3a1e64fe5 100644 --- a/neutron/plugins/ml2/plugin.py +++ b/neutron/plugins/ml2/plugin.py @@ -148,6 +148,7 @@ class Ml2Plugin(db_base_plugin_v2.NeutronDbPluginV2, self.mechanism_manager.initialize() self._setup_dhcp() self._start_rpc_notifiers() + self.add_agent_status_check(self.agent_health_check) LOG.info(_LI("Modular L2 Plugin initialization complete")) def _setup_rpc(self): diff --git a/neutron/tests/base.py b/neutron/tests/base.py index cd79f3eeb..d7dd976b6 100644 --- a/neutron/tests/base.py +++ b/neutron/tests/base.py @@ -409,6 +409,10 @@ class PluginFixture(fixtures.Fixture): 'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.' 'start_periodic_dhcp_agent_status_check') self.patched_dhcp_periodic = self.dhcp_periodic_p.start() + self.agent_health_check_p = mock.patch( + 'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.' + 'add_agent_status_check') + self.agent_health_check = self.agent_health_check_p.start() # Plugin cleanup should be triggered last so that # test-specific cleanup has a chance to release references. 
self.addCleanup(self.cleanup_core_plugin) diff --git a/neutron/tests/unit/db/test_agents_db.py b/neutron/tests/unit/db/test_agents_db.py index 3aeea2b3a..cabae4315 100644 --- a/neutron/tests/unit/db/test_agents_db.py +++ b/neutron/tests/unit/db/test_agents_db.py @@ -161,6 +161,27 @@ class TestAgentsDbMixin(TestAgentsDbBase): agent = self.plugin.get_agents(self.context)[0] self.assertFalse(agent['admin_state_up']) + def test_agent_health_check(self): + agents = [{'agent_type': "DHCP Agent", + 'heartbeat_timestamp': '2015-05-06 22:40:40.432295', + 'host': 'some.node', + 'alive': True}] + with mock.patch.object(self.plugin, 'get_agents', + return_value=agents),\ + mock.patch.object(agents_db.LOG, 'warn') as warn,\ + mock.patch.object(agents_db.LOG, 'debug') as debug: + self.plugin.agent_health_check() + self.assertTrue(debug.called) + self.assertFalse(warn.called) + agents[0]['alive'] = False + self.plugin.agent_health_check() + warn.assert_called_once_with( + mock.ANY, + {'count': 1, 'total': 1, + 'data': " Type Last heartbeat host\n" + " DHCP Agent 2015-05-06 22:40:40.432295 some.node"} + ) + class TestAgentsDbGetAgents(TestAgentsDbBase): scenarios = [