From e50e1a236983e0a59b9667bc546c92555c3d0e34 Mon Sep 17 00:00:00 2001 From: Eugene Nikanorov Date: Tue, 5 May 2015 18:18:28 +0400 Subject: [PATCH] Add logging of agent heartbeats When troubleshooting problems with cluster it would be very convenient to have information about agent heartbeats logged with some searchable identifier which could create 1-to-1 mapping between events in agent's logs and server's logs. Currently agent's heartbeats are not logged at all on server side. Since on a large cluster that could create too much logging (even for troubleshooting cases), it might make sense to make this configurable both on neutron-server side and on agent-side. DocImpact Change-Id: I0a127ef274a84bba5de47395d47b62f48bd4be16 Closes-Bug: #1452582 --- etc/dhcp_agent.ini | 4 +++ etc/l3_agent.ini | 4 +++ etc/metadata_agent.ini | 4 +++ etc/neutron/plugins/ml2/openvswitch_agent.ini | 3 +++ neutron/agent/common/config.py | 2 ++ neutron/agent/dhcp/agent.py | 3 ++- neutron/agent/l3/agent.py | 3 ++- neutron/agent/metadata/agent.py | 1 + neutron/agent/rpc.py | 6 +++++ neutron/db/agents_db.py | 26 ++++++++++++++----- .../openvswitch/agent/ovs_neutron_agent.py | 4 ++- neutron/tests/unit/db/test_agents_db.py | 12 +++++++++ 12 files changed, 62 insertions(+), 10 deletions(-) diff --git a/etc/dhcp_agent.ini b/etc/dhcp_agent.ini index 0b3721b95..0f56260f4 100644 --- a/etc/dhcp_agent.ini +++ b/etc/dhcp_agent.ini @@ -90,3 +90,7 @@ # Timeout for ovs-vsctl commands. # If the timeout expires, ovs commands will fail with ALARMCLOCK error. # ovs_vsctl_timeout = 10 + +[AGENT] +# Log agent heartbeats from this DHCP agent +# log_agent_heartbeats = False diff --git a/etc/l3_agent.ini b/etc/l3_agent.ini index b49797d02..310b6b59e 100644 --- a/etc/l3_agent.ini +++ b/etc/l3_agent.ini @@ -122,3 +122,7 @@ # The advertisement interval in seconds # ha_vrrp_advert_int = 2 + +[AGENT] +# Log agent heartbeats from this L3 agent +# log_agent_heartbeats = False diff --git a/etc/metadata_agent.ini b/etc/metadata_agent.ini index 4a0331ee1..ca31c7fe9 100644 --- a/etc/metadata_agent.ini +++ b/etc/metadata_agent.ini @@ -66,3 +66,7 @@ admin_password = %SERVICE_PASSWORD% # Otherwise default_ttl specifies time in seconds a cache entry is valid for. # No cache is used in case no value is passed. # cache_url = memory://?default_ttl=5 + +[AGENT] +# Log agent heartbeats from this Metadata agent +# log_agent_heartbeats = False diff --git a/etc/neutron/plugins/ml2/openvswitch_agent.ini b/etc/neutron/plugins/ml2/openvswitch_agent.ini index 85586c596..5dd11a8ce 100644 --- a/etc/neutron/plugins/ml2/openvswitch_agent.ini +++ b/etc/neutron/plugins/ml2/openvswitch_agent.ini @@ -58,6 +58,9 @@ # of_interface = ovs-ofctl [agent] +# Log agent heartbeats from this OVS agent +# log_agent_heartbeats = False + # Agent's polling interval in seconds # polling_interval = 2 diff --git a/neutron/agent/common/config.py b/neutron/agent/common/config.py index 7e63ea387..b563c969e 100644 --- a/neutron/agent/common/config.py +++ b/neutron/agent/common/config.py @@ -44,6 +44,8 @@ AGENT_STATE_OPTS = [ help=_('Seconds between nodes reporting state to server; ' 'should be less than agent_down_time, best if it ' 'is half or less than agent_down_time.')), + cfg.BoolOpt('log_agent_heartbeats', default=False, + help=_('Log agent heartbeats')), ] INTERFACE_DRIVER_OPTS = [ diff --git a/neutron/agent/dhcp/agent.py b/neutron/agent/dhcp/agent.py index 6b5ac5ac7..2ca6da540 100644 --- a/neutron/agent/dhcp/agent.py +++ b/neutron/agent/dhcp/agent.py @@ -548,7 +548,8 @@ class DhcpAgentWithStateReport(DhcpAgent): 'configurations': { 'dhcp_driver': cfg.CONF.dhcp_driver, 'use_namespaces': cfg.CONF.use_namespaces, - 'dhcp_lease_duration': cfg.CONF.dhcp_lease_duration}, + 'dhcp_lease_duration': cfg.CONF.dhcp_lease_duration, + 'log_agent_heartbeats': cfg.CONF.AGENT.log_agent_heartbeats}, 'start_flag': True, 'agent_type': constants.AGENT_TYPE_DHCP} report_interval = cfg.CONF.AGENT.report_interval diff --git a/neutron/agent/l3/agent.py b/neutron/agent/l3/agent.py index a1aec1488..03a04e7b9 100644 --- a/neutron/agent/l3/agent.py +++ b/neutron/agent/l3/agent.py @@ -595,7 +595,8 @@ class L3NATAgentWithStateReport(L3NATAgent): 'external_network_bridge': self.conf.external_network_bridge, 'gateway_external_network_id': self.conf.gateway_external_network_id, - 'interface_driver': self.conf.interface_driver}, + 'interface_driver': self.conf.interface_driver, + 'log_agent_heartbeats': self.conf.AGENT.log_agent_heartbeats}, 'start_flag': True, 'agent_type': l3_constants.AGENT_TYPE_L3} report_interval = self.conf.AGENT.report_interval diff --git a/neutron/agent/metadata/agent.py b/neutron/agent/metadata/agent.py index 769d8039b..c26626c9a 100644 --- a/neutron/agent/metadata/agent.py +++ b/neutron/agent/metadata/agent.py @@ -289,6 +289,7 @@ class UnixDomainMetadataProxy(object): 'metadata_proxy_socket': cfg.CONF.metadata_proxy_socket, 'nova_metadata_ip': cfg.CONF.nova_metadata_ip, 'nova_metadata_port': cfg.CONF.nova_metadata_port, + 'log_agent_heartbeats': cfg.CONF.AGENT.log_agent_heartbeats, }, 'start_flag': True, 'agent_type': n_const.AGENT_TYPE_METADATA} diff --git a/neutron/agent/rpc.py b/neutron/agent/rpc.py index f71d36032..11bf79784 100644 --- a/neutron/agent/rpc.py +++ b/neutron/agent/rpc.py @@ -18,6 +18,7 @@ import itertools from oslo_log import log as logging import oslo_messaging from oslo_utils import timeutils +from oslo_utils import uuidutils from neutron.common import constants from neutron.common import rpc as n_rpc @@ -72,6 +73,11 @@ class PluginReportStateAPI(object): def report_state(self, context, agent_state, use_call=False): cctxt = self.client.prepare() + # add unique identifier to a report + # that can be logged on server side. + # This create visible correspondence between events on + # the agent and on the server + agent_state['uuid'] = uuidutils.generate_uuid() kwargs = { 'agent_state': {'agent_state': agent_state}, 'time': timeutils.strtime(), diff --git a/neutron/db/agents_db.py b/neutron/db/agents_db.py index 52dccf5c4..702f2e497 100644 --- a/neutron/db/agents_db.py +++ b/neutron/db/agents_db.py @@ -29,7 +29,7 @@ from neutron.common import constants from neutron.db import model_base from neutron.db import models_v2 from neutron.extensions import agent as ext_agent -from neutron.i18n import _LE, _LW +from neutron.i18n import _LE, _LI, _LW from neutron import manager LOG = logging.getLogger(__name__) @@ -203,22 +203,33 @@ class AgentDbMixin(ext_agent.AgentPluginBase): agent = self._get_agent(context, id) return self._make_agent_dict(agent, fields) - def _create_or_update_agent(self, context, agent): + def _log_heartbeat(self, state, agent_db, agent_conf): + if agent_conf.get('log_agent_heartbeats'): + delta = timeutils.utcnow() - agent_db.heartbeat_timestamp + LOG.info(_LI("Heartbeat received from %(type)s agent on " + "host %(host)s, uuid %(uuid)s after %(delta)s"), + {'type': agent_db.agent_type, + 'host': agent_db.host, + 'uuid': state.get('uuid'), + 'delta': delta}) + + def _create_or_update_agent(self, context, agent_state): with context.session.begin(subtransactions=True): res_keys = ['agent_type', 'binary', 'host', 'topic'] - res = dict((k, agent[k]) for k in res_keys) + res = dict((k, agent_state[k]) for k in res_keys) - configurations_dict = agent.get('configurations', {}) + configurations_dict = agent_state.get('configurations', {}) res['configurations'] = jsonutils.dumps(configurations_dict) - res['load'] = self._get_agent_load(agent) + res['load'] = self._get_agent_load(agent_state) current_time = timeutils.utcnow() try: agent_db = self._get_agent_by_type_and_host( - context, agent['agent_type'], agent['host']) + context, agent_state['agent_type'], agent_state['host']) res['heartbeat_timestamp'] = current_time - if agent.get('start_flag'): + if agent_state.get('start_flag'): res['started_at'] = current_time greenthread.sleep(0) + self._log_heartbeat(agent_state, agent_db, configurations_dict) agent_db.update(res) except ext_agent.AgentNotFoundByTypeHost: greenthread.sleep(0) @@ -229,6 +240,7 @@ class AgentDbMixin(ext_agent.AgentPluginBase): agent_db = Agent(**res) greenthread.sleep(0) context.session.add(agent_db) + self._log_heartbeat(agent_state, agent_db, configurations_dict) greenthread.sleep(0) def create_or_update_agent(self, context, agent): diff --git a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py index ca52b2162..b9ad0dd35 100644 --- a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py +++ b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py @@ -200,7 +200,9 @@ class OVSNeutronAgent(sg_rpc.SecurityGroupAgentRpcCallbackMixin, 'arp_responder_enabled': self.arp_responder_enabled, 'enable_distributed_routing': - self.enable_distributed_routing}, + self.enable_distributed_routing, + 'log_agent_heartbeats': + self.conf.AGENT.log_agent_heartbeats}, 'agent_type': q_const.AGENT_TYPE_OVS, 'start_flag': True} diff --git a/neutron/tests/unit/db/test_agents_db.py b/neutron/tests/unit/db/test_agents_db.py index cda8cd33b..a47266314 100644 --- a/neutron/tests/unit/db/test_agents_db.py +++ b/neutron/tests/unit/db/test_agents_db.py @@ -127,6 +127,18 @@ class TestAgentsDbMixin(TestAgentsDbBase): agent = agents[0] self._assert_ref_fields_are_equal(self.agent_status, agent) + def test_create_or_update_agent_logs_heartbeat(self): + status = self.agent_status.copy() + status['configurations'] = {'log_agent_heartbeats': True} + + with mock.patch.object(agents_db.LOG, 'info') as info: + self.plugin.create_or_update_agent(self.context, status) + self.assertTrue(info.called) + status['configurations'] = {'log_agent_heartbeats': False} + info.reset_mock() + self.plugin.create_or_update_agent(self.context, status) + self.assertFalse(info.called) + def test_create_or_update_agent_concurrent_insert(self): # NOTE(rpodolyaka): emulate violation of the unique constraint caused # by a concurrent insert. Ensure we make another -- 2.45.2