Prevent L3 agent looping calls from hanging

author Salvatore Orlando <salv.orlando@gmail.com>

Wed, 2 Oct 2013 19:14:14 +0000 (12:14 -0700)

committer Mark McClain <mark.mcclain@dreamhost.com>

Tue, 8 Oct 2013 17:45:34 +0000 (13:45 -0400)
author Salvatore Orlando <salv.orlando@gmail.com>
Wed, 2 Oct 2013 19:14:14 +0000 (12:14 -0700)
committer Mark McClain <mark.mcclain@dreamhost.com>
Tue, 8 Oct 2013 17:45:34 +0000 (13:45 -0400)
diff --git a/neutron/agent/l3_agent.py b/neutron/agent/l3_agent.py

index d6c54fc69ffbe6de7ef3814dbe4bd545ed20cbb5..b66d441d2edc69e2c945e7052afd33dad7438a7b 100644 (file)
--- a/neutron/agent/l3_agent.py
+++ b/neutron/agent/l3_agent.py
@@ -163,10 +163,9 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
                     help=_("TCP Port used by Neutron metadata namespace "
                            "proxy.")),
          cfg.IntOpt('send_arp_for_ha',
-                   default=3,
-                   help=_("Send this many gratuitous ARPs for HA setup, "
-                          "set it below or equal to 0 to disable this "
-                          "feature.")),
+                   default=0,
+                   help=_("Send this many gratuitous ARPs for HA setup, if "
+                          "less than or equal to 0, the feature is disabled")),
          cfg.BoolOpt('use_namespaces', default=True,
                      help=_("Allow overlapping IP.")),
          cfg.StrOpt('router_id', default='',
@@ -308,7 +307,11 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
              self._spawn_metadata_proxy(ri)
  
      def _router_removed(self, router_id):
-        ri = self.router_info[router_id]
+        ri = self.router_info.get(router_id)
+        if ri is None:
+            LOG.warn(_("Info for router %s were not found. "
+                       "Skipping router removal"), router_id)
+            return
          ri.router['gw_port'] = None
          ri.router[l3_constants.INTERFACE_KEY] = []
          ri.router[l3_constants.FLOATINGIP_KEY] = []
@@ -706,6 +709,8 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
          # so we can clear the value of updated_routers
          # and removed_routers
          try:
+            LOG.debug(_("Starting RPC loop for %d updated routers"),
+                      len(self.updated_routers))
              if self.updated_routers:
                  router_ids = list(self.updated_routers)
                  self.updated_routers.clear()
@@ -713,6 +718,7 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
                      self.context, router_ids)
                  self._process_routers(routers)
              self._process_router_delete()
+            LOG.debug(_("RPC loop successfully completed"))
          except Exception:
              LOG.exception(_("Failed synchronizing routers"))
              self.fullsync = True
@@ -732,6 +738,8 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
      def _sync_routers_task(self, context):
          if self.services_sync:
              super(L3NATAgent, self).process_services_sync(context)
+        LOG.debug(_("Starting _sync_routers_task - fullsync:%s"),
+                  self.fullsync)
          if not self.fullsync:
              return
          try:
@@ -744,6 +752,7 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
              LOG.debug(_('Processing :%r'), routers)
              self._process_routers(routers, all_routers=True)
              self.fullsync = False
+            LOG.debug(_("_sync_routers_task successfully completed"))
          except Exception:
              LOG.exception(_("Failed synchronizing routers"))
              self.fullsync = True
@@ -809,6 +818,7 @@ class L3NATAgentWithStateReport(L3NATAgent):
              self.heartbeat.start(interval=report_interval)
  
      def _report_state(self):
+        LOG.debug(_("Report state task started"))
          num_ex_gw_ports = 0
          num_interfaces = 0
          num_floating_ips = 0
@@ -832,6 +842,7 @@ class L3NATAgentWithStateReport(L3NATAgent):
                                          self.use_call)
              self.agent_state.pop('start_flag', None)
              self.use_call = False
+            LOG.debug(_("Report state task successfully completed"))
          except AttributeError:
              # This means the server does not support report_state
              LOG.warn(_("Neutron server does not support state report."
diff --git a/neutron/agent/linux/utils.py b/neutron/agent/linux/utils.py

index fc9f4c97690f564419f6a4cf4473db747d3b8366..6e0aae41efb57536c0874cd24359eeacdd79453e 100644 (file)
--- a/neutron/agent/linux/utils.py
+++ b/neutron/agent/linux/utils.py
@@ -25,6 +25,7 @@ import struct
  import tempfile
  
  from eventlet.green import subprocess
+from eventlet import greenthread
  
  from neutron.common import utils
  from neutron.openstack.common import log as logging
@@ -43,22 +44,27 @@ def execute(cmd, root_helper=None, process_input=None, addl_env=None,
      env = os.environ.copy()
      if addl_env:
          env.update(addl_env)
-    obj = utils.subprocess_popen(cmd, shell=False,
-                                 stdin=subprocess.PIPE,
-                                 stdout=subprocess.PIPE,
-                                 stderr=subprocess.PIPE,
-                                 env=env)
-
-    _stdout, _stderr = (process_input and
-                        obj.communicate(process_input) or
-                        obj.communicate())
-    obj.stdin.close()
-    m = _("\nCommand: %(cmd)s\nExit code: %(code)s\nStdout: %(stdout)r\n"
-          "Stderr: %(stderr)r") % {'cmd': cmd, 'code': obj.returncode,
-                                   'stdout': _stdout, 'stderr': _stderr}
-    LOG.debug(m)
-    if obj.returncode and check_exit_code:
-        raise RuntimeError(m)
+    try:
+        obj = utils.subprocess_popen(cmd, shell=False,
+                                     stdin=subprocess.PIPE,
+                                     stdout=subprocess.PIPE,
+                                     stderr=subprocess.PIPE,
+                                     env=env)
+        _stdout, _stderr = (process_input and
+                            obj.communicate(process_input) or
+                            obj.communicate())
+        obj.stdin.close()
+        m = _("\nCommand: %(cmd)s\nExit code: %(code)s\nStdout: %(stdout)r\n"
+              "Stderr: %(stderr)r") % {'cmd': cmd, 'code': obj.returncode,
+                                       'stdout': _stdout, 'stderr': _stderr}
+        LOG.debug(m)
+        if obj.returncode and check_exit_code:
+            raise RuntimeError(m)
+    finally:
+        # NOTE(termie): this appears to be necessary to let the subprocess
+        #               call clean something up in between calls, without
+        #               it two execute calls in a row hangs the second one
+        greenthread.sleep(0)
  
      return return_stderr and (_stdout, _stderr) or _stdout
  
diff --git a/neutron/tests/unit/test_l3_agent.py b/neutron/tests/unit/test_l3_agent.py

index 3497a5ef89849b62fa622e8979b3a5c80620c630..86e43d600578b891820df5e57e14318ae37c7fda 100644 (file)
--- a/neutron/tests/unit/test_l3_agent.py
+++ b/neutron/tests/unit/test_l3_agent.py
@@ -46,6 +46,7 @@ class TestBasicRouterOperations(base.BaseTestCase):
          self.conf.set_override('router_id', 'fake_id')
          self.conf.set_override('interface_driver',
                                 'neutron.agent.linux.interface.NullDriver')
+        self.conf.set_override('send_arp_for_ha', 1)
          self.conf.root_helper = 'sudo'
  
          self.device_exists_p = mock.patch(
author	Salvatore Orlando <salv.orlando@gmail.com>
	Wed, 2 Oct 2013 19:14:14 +0000 (12:14 -0700)
committer	Mark McClain <mark.mcclain@dreamhost.com>
	Tue, 8 Oct 2013 17:45:34 +0000 (13:45 -0400)
neutron/agent/l3_agent.py		patch \| blob \| history
neutron/agent/linux/utils.py		patch \| blob \| history
neutron/tests/unit/test_l3_agent.py		patch \| blob \| history