In addition to the periodic checks of L3 and DHCP agents,
add a periodic check of the overall health of registered agents.
Log the total count of agents at debug level so it can be
seen in the neutron-server logs.
If some agents are found dead, log detailed info about them:
agent type, last heartbeat, and host.
Change-Id: I5db81dad4e9e8325ad3fa3a3e6d5d2d0deb297dd
Closes-Bug: #1453320
from neutron.api.v2 import attributes
from neutron.common import constants
+from neutron import context
from neutron.db import model_base
from neutron.db import models_v2
from neutron.extensions import agent as ext_agent
agents = [agent for agent in agents if agent['alive'] == alive]
return agents
+ def agent_health_check(self):
+ """Scan agents and log if some are considered dead.
+
+ Fetches the agents whose admin_state_up is True using an admin
+ context. If any of them is not alive, a warning is logged with the
+ dead and total counts plus one row per dead agent (type, last
+ heartbeat, host). Otherwise the number of agents found is logged
+ at debug level.
+ """
+ agents = self.get_agents(context.get_admin_context(),
+ filters={'admin_state_up': [True]})
+ dead_agents = [agent for agent in agents if not agent['alive']]
+ if dead_agents:
+ # Header row; it uses the same column widths as the per-agent
+ # rows appended below so the columns line up in the log.
+ data = '%20s %20s %s\n' % ('Type', 'Last heartbeat', "host")
+ data += '\n'.join(['%20s %20s %s' %
+ (agent['agent_type'],
+ agent['heartbeat_timestamp'],
+ agent['host']) for agent in dead_agents])
+ LOG.warn(_LW("Agent healthcheck: found %(count)s dead agents "
+ "out of %(total)s:\n%(data)s"),
+ {'count': len(dead_agents),
+ 'total': len(agents),
+ 'data': data})
+ else:
+ # No dead agents: only report how many agents were checked.
+ LOG.debug("Agent healthcheck: found %s active agents",
+ len(agents))
+
def _get_agent_by_type_and_host(self, context, agent_type, host):
query = self._model_query(context, Agent)
try:
original_agent['host'])
return result
- def setup_agent_status_check(self, function):
- self.periodic_agent_loop = loopingcall.FixedIntervalLoopingCall(
- function)
+ def add_agent_status_check(self, function):
+ """Start a fixed-interval looping call that runs ``function``.
+
+ May be called several times; every started loop is kept in
+ ``self.periodic_agent_loops``.
+ """
+ loop = loopingcall.FixedIntervalLoopingCall(function)
# TODO(enikanorov): make interval configurable rather than computed
interval = max(cfg.CONF.agent_down_time // 2, 1)
# add random initial delay to allow agents to check in after the
# neutron server first starts. random to offset multiple servers
initial_delay = random.randint(interval, interval * 2)
- self.periodic_agent_loop.start(interval=interval,
- initial_delay=initial_delay)
+ loop.start(interval=interval, initial_delay=initial_delay)
+
+ # Keep a reference to each started loop; the list attribute is
+ # created on first use rather than assumed to exist.
+ if hasattr(self, 'periodic_agent_loops'):
+ self.periodic_agent_loops.append(loop)
+ else:
+ self.periodic_agent_loops = [loop]
def agent_dead_limit_seconds(self):
return cfg.CONF.agent_down_time * 2
"automatic network rescheduling is disabled."))
return
- self.setup_agent_status_check(self.remove_networks_from_down_agents)
+ self.add_agent_status_check(self.remove_networks_from_down_agents)
def is_eligible_agent(self, context, active, agent):
# eligible agent is active or starting up
"automatic router rescheduling is disabled."))
return
- self.setup_agent_status_check(
+ self.add_agent_status_check(
self.reschedule_routers_from_down_agents)
def reschedule_routers_from_down_agents(self):
self.mechanism_manager.initialize()
self._setup_dhcp()
self._start_rpc_notifiers()
+ self.add_agent_status_check(self.agent_health_check)
LOG.info(_LI("Modular L2 Plugin initialization complete"))
def _setup_rpc(self):
'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
'start_periodic_dhcp_agent_status_check')
self.patched_dhcp_periodic = self.dhcp_periodic_p.start()
+ self.agent_health_check_p = mock.patch(
+ 'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
+ 'add_agent_status_check')
+ self.agent_health_check = self.agent_health_check_p.start()
# Plugin cleanup should be triggered last so that
# test-specific cleanup has a chance to release references.
self.addCleanup(self.cleanup_core_plugin)
agent = self.plugin.get_agents(self.context)[0]
self.assertFalse(agent['admin_state_up'])
+ def test_agent_health_check(self):
+ """Check the log output of agent_health_check.
+
+ With every agent alive only a debug message is expected; once the
+ agent is marked dead a single warning carrying the counts and the
+ formatted agent table is expected.
+ """
+ agents = [{'agent_type': "DHCP Agent",
+ 'heartbeat_timestamp': '2015-05-06 22:40:40.432295',
+ 'host': 'some.node',
+ 'alive': True}]
+ with mock.patch.object(self.plugin, 'get_agents',
+ return_value=agents),\
+ mock.patch.object(agents_db.LOG, 'warn') as warn,\
+ mock.patch.object(agents_db.LOG, 'debug') as debug:
+ # First pass: the only agent is alive, so no warning is expected.
+ self.plugin.agent_health_check()
+ self.assertTrue(debug.called)
+ self.assertFalse(warn.called)
+ # Second pass: mutate the same dict returned by the mocked
+ # get_agents so the agent is now reported as dead.
+ agents[0]['alive'] = False
+ self.plugin.agent_health_check()
+ warn.assert_called_once_with(
+ mock.ANY,
+ {'count': 1, 'total': 1,
+ 'data': " Type Last heartbeat host\n"
+ " DHCP Agent 2015-05-06 22:40:40.432295 some.node"}
+ )
+
class TestAgentsDbGetAgents(TestAgentsDbBase):
scenarios = [