]> review.fuel-infra Code Review - openstack-build/neutron-build.git/commitdiff
Add logging of agent heartbeats
authorEugene Nikanorov <enikanorov@mirantis.com>
Tue, 5 May 2015 14:18:28 +0000 (18:18 +0400)
committerEugene Nikanorov <enikanorov@mirantis.com>
Mon, 29 Jun 2015 01:40:26 +0000 (05:40 +0400)
When troubleshooting problems with cluster it would be
very convenient to have information about agent heartbeats
logged with some searchable identifier which could create
1-to-1 mapping between events in agent's logs and server's logs.

Currently agent's heartbeats are not logged at all on server side.
Since on a large cluster that could create too much logging
(even for troubleshooting cases), it might make sense to make
this configurable both on neutron-server side and on agent-side.

DocImpact

Change-Id: I0a127ef274a84bba5de47395d47b62f48bd4be16
Closes-Bug: #1452582

12 files changed:
etc/dhcp_agent.ini
etc/l3_agent.ini
etc/metadata_agent.ini
etc/neutron/plugins/ml2/openvswitch_agent.ini
neutron/agent/common/config.py
neutron/agent/dhcp/agent.py
neutron/agent/l3/agent.py
neutron/agent/metadata/agent.py
neutron/agent/rpc.py
neutron/db/agents_db.py
neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
neutron/tests/unit/db/test_agents_db.py

index 0b3721b959e59789aa284abb54b5f0aa7cebdc8b..0f56260f4c27b9fdf31e8bef38d28ac7d98f1e35 100644 (file)
@@ -90,3 +90,7 @@
 # Timeout for ovs-vsctl commands.
 # If the timeout expires, ovs commands will fail with ALARMCLOCK error.
 # ovs_vsctl_timeout = 10
+
+[AGENT]
+# Log agent heartbeats from this DHCP agent
+# log_agent_heartbeats = False
index b49797d02fafbe7faa4a1a47ac2374ea9be44658..310b6b59e02518d613b60250ed294cd18d075910 100644 (file)
 
 # The advertisement interval in seconds
 # ha_vrrp_advert_int = 2
+
+[AGENT]
+# Log agent heartbeats from this L3 agent
+# log_agent_heartbeats = False
index 4a0331ee12501e00b6747d6c6590a2358079d2d8..ca31c7fe9763b6c634ddd728da415b891a5890a0 100644 (file)
@@ -66,3 +66,7 @@ admin_password = %SERVICE_PASSWORD%
 # Otherwise default_ttl specifies time in seconds a cache entry is valid for.
 # No cache is used in case no value is passed.
 # cache_url = memory://?default_ttl=5
+
+[AGENT]
+# Log agent heartbeats from this Metadata agent
+# log_agent_heartbeats = False
index 85586c5969b7c7794a739eaebd35eb2801311b3d..5dd11a8ce88859797172ca25131fcb2d32d690ef 100644 (file)
@@ -58,6 +58,9 @@
 # of_interface = ovs-ofctl
 
 [agent]
+# Log agent heartbeats from this OVS agent
+# log_agent_heartbeats = False
+
 # Agent's polling interval in seconds
 # polling_interval = 2
 
index 7e63ea3878982d5d5e1d006d8a6d68ccd47e50fd..b563c969ed5e7efdf5c351aa78ab1b9d2cb7a2a5 100644 (file)
@@ -44,6 +44,8 @@ AGENT_STATE_OPTS = [
                  help=_('Seconds between nodes reporting state to server; '
                         'should be less than agent_down_time, best if it '
                         'is half or less than agent_down_time.')),
+    cfg.BoolOpt('log_agent_heartbeats', default=False,
+                help=_('Log agent heartbeats')),
 ]
 
 INTERFACE_DRIVER_OPTS = [
index 6b5ac5ac7159a0fe6c24a99eab3dc535972bb1c5..2ca6da5405db78b00350bb42005da7d32051f8d6 100644 (file)
@@ -548,7 +548,8 @@ class DhcpAgentWithStateReport(DhcpAgent):
             'configurations': {
                 'dhcp_driver': cfg.CONF.dhcp_driver,
                 'use_namespaces': cfg.CONF.use_namespaces,
-                'dhcp_lease_duration': cfg.CONF.dhcp_lease_duration},
+                'dhcp_lease_duration': cfg.CONF.dhcp_lease_duration,
+                'log_agent_heartbeats': cfg.CONF.AGENT.log_agent_heartbeats},
             'start_flag': True,
             'agent_type': constants.AGENT_TYPE_DHCP}
         report_interval = cfg.CONF.AGENT.report_interval
index a1aec14884361b92a0f7ea45d026ec3e7043dacf..03a04e7b9d2e40f0ca81118714cd0694f9f30440 100644 (file)
@@ -595,7 +595,8 @@ class L3NATAgentWithStateReport(L3NATAgent):
                 'external_network_bridge': self.conf.external_network_bridge,
                 'gateway_external_network_id':
                 self.conf.gateway_external_network_id,
-                'interface_driver': self.conf.interface_driver},
+                'interface_driver': self.conf.interface_driver,
+                'log_agent_heartbeats': self.conf.AGENT.log_agent_heartbeats},
             'start_flag': True,
             'agent_type': l3_constants.AGENT_TYPE_L3}
         report_interval = self.conf.AGENT.report_interval
index 769d8039bc043229eb2b730fb79a222f4071af08..c26626c9a819cde8a3a6f8fd10271c44650a5984 100644 (file)
@@ -289,6 +289,7 @@ class UnixDomainMetadataProxy(object):
                 'metadata_proxy_socket': cfg.CONF.metadata_proxy_socket,
                 'nova_metadata_ip': cfg.CONF.nova_metadata_ip,
                 'nova_metadata_port': cfg.CONF.nova_metadata_port,
+                'log_agent_heartbeats': cfg.CONF.AGENT.log_agent_heartbeats,
             },
             'start_flag': True,
             'agent_type': n_const.AGENT_TYPE_METADATA}
index f71d36032e8429f6c06ef136c6f50460f7f5346e..11bf79784c519417d505c0413159375135f045a9 100644 (file)
@@ -18,6 +18,7 @@ import itertools
 from oslo_log import log as logging
 import oslo_messaging
 from oslo_utils import timeutils
+from oslo_utils import uuidutils
 
 from neutron.common import constants
 from neutron.common import rpc as n_rpc
@@ -72,6 +73,11 @@ class PluginReportStateAPI(object):
 
     def report_state(self, context, agent_state, use_call=False):
         cctxt = self.client.prepare()
+        # add unique identifier to a report
+        # that can be logged on server side.
+        # This create visible correspondence between events on
+        # the agent and on the server
+        agent_state['uuid'] = uuidutils.generate_uuid()
         kwargs = {
             'agent_state': {'agent_state': agent_state},
             'time': timeutils.strtime(),
index 52dccf5c411a5884a34f5c92776e28f3c52fbe6a..702f2e497d174ca5eafa960524fccaa6cb2bda09 100644 (file)
@@ -29,7 +29,7 @@ from neutron.common import constants
 from neutron.db import model_base
 from neutron.db import models_v2
 from neutron.extensions import agent as ext_agent
-from neutron.i18n import _LE, _LW
+from neutron.i18n import _LE, _LI, _LW
 from neutron import manager
 
 LOG = logging.getLogger(__name__)
@@ -203,22 +203,33 @@ class AgentDbMixin(ext_agent.AgentPluginBase):
         agent = self._get_agent(context, id)
         return self._make_agent_dict(agent, fields)
 
-    def _create_or_update_agent(self, context, agent):
+    def _log_heartbeat(self, state, agent_db, agent_conf):
+        if agent_conf.get('log_agent_heartbeats'):
+            delta = timeutils.utcnow() - agent_db.heartbeat_timestamp
+            LOG.info(_LI("Heartbeat received from %(type)s agent on "
+                         "host %(host)s, uuid %(uuid)s after %(delta)s"),
+                     {'type': agent_db.agent_type,
+                      'host': agent_db.host,
+                      'uuid': state.get('uuid'),
+                      'delta': delta})
+
+    def _create_or_update_agent(self, context, agent_state):
         with context.session.begin(subtransactions=True):
             res_keys = ['agent_type', 'binary', 'host', 'topic']
-            res = dict((k, agent[k]) for k in res_keys)
+            res = dict((k, agent_state[k]) for k in res_keys)
 
-            configurations_dict = agent.get('configurations', {})
+            configurations_dict = agent_state.get('configurations', {})
             res['configurations'] = jsonutils.dumps(configurations_dict)
-            res['load'] = self._get_agent_load(agent)
+            res['load'] = self._get_agent_load(agent_state)
             current_time = timeutils.utcnow()
             try:
                 agent_db = self._get_agent_by_type_and_host(
-                    context, agent['agent_type'], agent['host'])
+                    context, agent_state['agent_type'], agent_state['host'])
                 res['heartbeat_timestamp'] = current_time
-                if agent.get('start_flag'):
+                if agent_state.get('start_flag'):
                     res['started_at'] = current_time
                 greenthread.sleep(0)
+                self._log_heartbeat(agent_state, agent_db, configurations_dict)
                 agent_db.update(res)
             except ext_agent.AgentNotFoundByTypeHost:
                 greenthread.sleep(0)
@@ -229,6 +240,7 @@ class AgentDbMixin(ext_agent.AgentPluginBase):
                 agent_db = Agent(**res)
                 greenthread.sleep(0)
                 context.session.add(agent_db)
+                self._log_heartbeat(agent_state, agent_db, configurations_dict)
             greenthread.sleep(0)
 
     def create_or_update_agent(self, context, agent):
index ca52b216257ef81c729ceab8d5a136ec2bc92b60..b9ad0dd353f2ba09981878316ec752c755fb8952 100644 (file)
@@ -200,7 +200,9 @@ class OVSNeutronAgent(sg_rpc.SecurityGroupAgentRpcCallbackMixin,
                                'arp_responder_enabled':
                                self.arp_responder_enabled,
                                'enable_distributed_routing':
-                               self.enable_distributed_routing},
+                               self.enable_distributed_routing,
+                               'log_agent_heartbeats':
+                               self.conf.AGENT.log_agent_heartbeats},
             'agent_type': q_const.AGENT_TYPE_OVS,
             'start_flag': True}
 
index cda8cd33b71d909519bc1e60a919f486e99fda2c..a47266314581e7d6e02169544ab99293a2c7818c 100644 (file)
@@ -127,6 +127,18 @@ class TestAgentsDbMixin(TestAgentsDbBase):
         agent = agents[0]
         self._assert_ref_fields_are_equal(self.agent_status, agent)
 
+    def test_create_or_update_agent_logs_heartbeat(self):
+        status = self.agent_status.copy()
+        status['configurations'] = {'log_agent_heartbeats': True}
+
+        with mock.patch.object(agents_db.LOG, 'info') as info:
+            self.plugin.create_or_update_agent(self.context, status)
+            self.assertTrue(info.called)
+            status['configurations'] = {'log_agent_heartbeats': False}
+            info.reset_mock()
+            self.plugin.create_or_update_agent(self.context, status)
+            self.assertFalse(info.called)
+
     def test_create_or_update_agent_concurrent_insert(self):
         # NOTE(rpodolyaka): emulate violation of the unique constraint caused
         #                   by a concurrent insert. Ensure we make another