Add request timeout handling for Mellanox Neutron Agent

author Irena Berezovsky <irenab@mellanox.com>

Sun, 29 Sep 2013 11:09:44 +0000 (13:09 +0200)

committer Irena Berezovsky <irenab@mellanox.com>

Sun, 1 Dec 2013 08:04:40 +0000 (10:04 +0200)
author Irena Berezovsky <irenab@mellanox.com>
Sun, 29 Sep 2013 11:09:44 +0000 (13:09 +0200)
committer Irena Berezovsky <irenab@mellanox.com>
Sun, 1 Dec 2013 08:04:40 +0000 (10:04 +0200)
diff --git a/etc/neutron/plugins/mlnx/mlnx_conf.ini b/etc/neutron/plugins/mlnx/mlnx_conf.ini

index c3e5cc88eac930d7258d03421709cae07aa4e1a5..841947904316ca2e985506d99b9a6755495ebd2e 100644 (file)
--- a/etc/neutron/plugins/mlnx/mlnx_conf.ini
+++ b/etc/neutron/plugins/mlnx/mlnx_conf.ini
@@ -34,12 +34,20 @@
  # vnic_type = mlnx_direct
  
  # (StrOpt) Eswitch daemon end point connection url
-# daemon_endpoint = 'tcp://127.0.0.1:5001'
+# daemon_endpoint = 'tcp://127.0.0.1:60001'
  
  # The number of milliseconds the agent will wait for
  # response on request to daemon
  # request_timeout = 3000
  
+# The number of times the agent will retry a request
+# to the daemon before giving up
+# retries = 3
+
+# The backoff rate multiplier for the waiting period between retries
+# of a request to the daemon, e.g. a value of 2 will double
+# the waiting period on each retry
+# backoff_rate = 2
  
  [agent]
  # Agent's polling interval in seconds
diff --git a/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py b/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py

index c82dd9f83b01629f66e3c5332f13bd50c919daf6..ae3ce98b67995b0fb83c4bf45f5322a9407626c6 100644 (file)
--- a/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py
+++ b/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py
@@ -392,6 +392,10 @@ class MlnxEswitchNeutronAgent(sg_rpc.SecurityGroupAgentRpcMixin):
                      # If treat devices fails - must resync with plugin
                      sync = self.process_network_ports(port_info)
                      ports = port_info['current']
+            except exceptions.RequestTimeout:
+                LOG.exception(_("Request timeout in agent event loop "
+                                "eSwitchD is not responding - exiting..."))
+                raise SystemExit(1)
              except Exception:
                  LOG.exception(_("Error in agent event loop"))
                  sync = True
diff --git a/neutron/plugins/mlnx/agent/utils.py b/neutron/plugins/mlnx/agent/utils.py

index de05455c0d81744f461547510c4212e23335bba2..dd4ccf0af5abb91a53842cb89c4b18fc9fd6cafc 100644 (file)
--- a/neutron/plugins/mlnx/agent/utils.py
+++ b/neutron/plugins/mlnx/agent/utils.py
@@ -19,6 +19,7 @@ import zmq
  
  from neutron.openstack.common import jsonutils
  from neutron.openstack.common import log as logging
+from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
  from neutron.plugins.mlnx.common import exceptions
  
  LOG = logging.getLogger(__name__)
@@ -42,6 +43,7 @@ class EswitchUtils(object):
              self.poller.register(self._conn, zmq.POLLIN)
          return self.__conn
  
+    @RetryDecorator(exceptions.RequestTimeout)
      def send_msg(self, msg):
          self._conn.send(msg)
  
@@ -55,7 +57,7 @@ class EswitchUtils(object):
              self._conn.close()
              self.poller.unregister(self._conn)
              self.__conn = None
-            raise exceptions.MlnxException(_("eSwitchD: Request timeout"))
+            raise exceptions.RequestTimeout()
  
      def parse_response_msg(self, recv_msg):
          msg = jsonutils.loads(recv_msg)
@@ -69,7 +71,7 @@ class EswitchUtils(object):
          else:
              error_msg = _("Unknown operation status %s") % msg['status']
          LOG.error(error_msg)
-        raise exceptions.MlnxException(error_msg)
+        raise exceptions.OperationFailed(err_msg=error_msg)
  
      def get_attached_vnics(self):
          LOG.debug(_("get_attached_vnics"))
diff --git a/neutron/plugins/mlnx/common/comm_utils.py b/neutron/plugins/mlnx/common/comm_utils.py

new file mode 100644 (file)

index 0000000..a1a0f4a
--- /dev/null
+++ b/neutron/plugins/mlnx/common/comm_utils.py
@@ -0,0 +1,66 @@
+# vim: tabstop=4 shiftwidth=4 softtabstop=4
+#
+# Copyright 2013 Mellanox Technologies, Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+from oslo.config import cfg
+
+from neutron.openstack.common import log as logging
+from neutron.plugins.mlnx.common import config  # noqa
+
+LOG = logging.getLogger(__name__)
+
+
+class RetryDecorator(object):
+    """Retry decorator reruns a method 'retries' times if an exception occurs.
+
+    Decorator for retrying a method if an exceptionToCheck exception occurs.
+    If the method raises that exception, it is retried 'retries' times, with
+    the delay between calls multiplied by 'backoff_rate' after each attempt.
+
+    :param exceptionToCheck: the exception to check
+    :param interval: initial delay between retries in seconds
+    :param retries: number of times to retry before giving up
+    :raises: exceptionToCheck
+    """
+    sleep_fn = time.sleep
+
+    def __init__(self, exceptionToCheck,
+                 interval=cfg.CONF.ESWITCH.request_timeout / 1000,
+                 retries=cfg.CONF.ESWITCH.retries,
+                 backoff_rate=cfg.CONF.ESWITCH.backoff_rate):
+        self.exc = exceptionToCheck
+        self.interval = interval
+        self.retries = retries
+        self.backoff_rate = backoff_rate
+
+    def __call__(self, original_func):
+        def decorated(*args, **kwargs):
+            sleep_interval = self.interval
+            num_of_iter = self.retries
+            while num_of_iter > 0:
+                try:
+                    return original_func(*args, **kwargs)
+                except self.exc:
+                    LOG.debug(_("Request timeout - call again after "
+                              "%s seconds"), sleep_interval)
+                    RetryDecorator.sleep_fn(sleep_interval)
+                    num_of_iter -= 1
+                    sleep_interval *= self.backoff_rate
+
+            return original_func(*args, **kwargs)
+        return decorated
diff --git a/neutron/plugins/mlnx/common/config.py b/neutron/plugins/mlnx/common/config.py

index adf868ea7e61719532580b7c1a48536fb749b7e8..f5115845bf7dcd90a3eb8490e9f9d846a5739b01 100644 (file)
--- a/neutron/plugins/mlnx/common/config.py
+++ b/neutron/plugins/mlnx/common/config.py
@@ -48,6 +48,13 @@ eswitch_opts = [
      cfg.IntOpt('request_timeout', default=3000,
                 help=_("The number of milliseconds the agent will wait for "
                        "response on request to daemon.")),
+    cfg.IntOpt('retries', default=3,
+               help=_("The number of retries the agent will send request "
+                      "to daemon before giving up")),
+    cfg.IntOpt('backoff_rate', default=2,
+               help=_("backoff rate multiplier for waiting period between "
+                      "retries for request to daemon, i.e. value of 2 will "
+                      " double the request timeout each retry")),
  ]
  
  agent_opts = [
diff --git a/neutron/plugins/mlnx/common/exceptions.py b/neutron/plugins/mlnx/common/exceptions.py

index 54355a001132d0fccd8f13672a539bb8225c88d7..6fd168215293db66e42057b41bd2839be6369467 100644 (file)
--- a/neutron/plugins/mlnx/common/exceptions.py
+++ b/neutron/plugins/mlnx/common/exceptions.py
@@ -20,3 +20,11 @@ from neutron.common import exceptions as qexc
  
  class MlnxException(qexc.NeutronException):
      message = _("Mlnx Exception: %(err_msg)s")
+
+
+class RequestTimeout(qexc.NeutronException):
+    message = _("Request Timeout: no response from eSwitchD")
+
+
+class OperationFailed(qexc.NeutronException):
+    message = _("Operation Failed: %(err_msg)s")
diff --git a/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py b/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py

new file mode 100644 (file)

index 0000000..00659e0
--- /dev/null
+++ b/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2013 OpenStack Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import mock
+from oslo.config import cfg
+
+from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
+from neutron.plugins.mlnx.common import config  # noqa
+from neutron.plugins.mlnx.common import exceptions
+from neutron.tests import base
+
+
+class WrongException(Exception):
+        pass
+
+
+class TestRetryDecorator(base.BaseTestCase):
+    def setUp(self):
+        super(TestRetryDecorator, self).setUp()
+        self.sleep_fn_p = mock.patch.object(RetryDecorator, 'sleep_fn')
+        self.sleep_fn = self.sleep_fn_p.start()
+        self.addCleanup(self.sleep_fn_p.stop)
+
+    def test_no_retry_required(self):
+        self.counter = 0
+
+        @RetryDecorator(exceptions.RequestTimeout, interval=2,
+                        retries=3, backoff_rate=2)
+        def succeeds():
+            self.counter += 1
+            return 'success'
+
+        ret = succeeds()
+        self.assertFalse(self.sleep_fn.called)
+        self.assertEqual(ret, 'success')
+        self.assertEqual(self.counter, 1)
+
+    def test_retry_zero_times(self):
+        self.counter = 0
+        interval = 2
+        backoff_rate = 2
+        retries = 0
+
+        @RetryDecorator(exceptions.RequestTimeout, interval,
+                        retries, backoff_rate)
+        def always_fails():
+            self.counter += 1
+            raise exceptions.RequestTimeout()
+
+        self.assertRaises(exceptions.RequestTimeout, always_fails)
+        self.assertEqual(self.counter, 1)
+        self.assertFalse(self.sleep_fn.called)
+
+    def test_retries_once(self):
+        self.counter = 0
+        interval = 2
+        backoff_rate = 2
+        retries = 3
+
+        @RetryDecorator(exceptions.RequestTimeout, interval,
+                        retries, backoff_rate)
+        def fails_once():
+            self.counter += 1
+            if self.counter < 2:
+                raise exceptions.RequestTimeout()
+            else:
+                return 'success'
+
+        ret = fails_once()
+        self.assertEqual(ret, 'success')
+        self.assertEqual(self.counter, 2)
+        self.assertEqual(self.sleep_fn.call_count, 1)
+        self.sleep_fn.assert_called_with(interval)
+
+    def test_limit_is_reached(self):
+        self.counter = 0
+        retries = 3
+        interval = 2
+        backoff_rate = 4
+
+        @RetryDecorator(exceptions.RequestTimeout, interval,
+                        retries, backoff_rate)
+        def always_fails():
+            self.counter += 1
+            raise exceptions.RequestTimeout()
+
+        self.assertRaises(exceptions.RequestTimeout, always_fails)
+        self.assertEqual(self.counter, retries + 1)
+        self.assertEqual(self.sleep_fn.call_count, retries)
+
+        expected_sleep_fn_arg = []
+        for i in range(retries):
+            expected_sleep_fn_arg.append(interval)
+            interval *= backoff_rate
+
+        self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
+
+    def test_limit_is_reached_with_conf(self):
+        self.counter = 0
+
+        @RetryDecorator(exceptions.RequestTimeout)
+        def always_fails():
+            self.counter += 1
+            raise exceptions.RequestTimeout()
+
+        retry = cfg.CONF.ESWITCH.retries
+        interval = cfg.CONF.ESWITCH.request_timeout / 1000
+        delay_rate = cfg.CONF.ESWITCH.backoff_rate
+
+        expected_sleep_fn_arg = []
+        for i in range(retry):
+            expected_sleep_fn_arg.append(interval)
+            interval *= delay_rate
+
+        self.assertRaises(exceptions.RequestTimeout, always_fails)
+        self.assertEqual(self.counter, retry + 1)
+        self.assertEqual(self.sleep_fn.call_count, retry)
+        self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
+
+    def test_wrong_exception_no_retry(self):
+
+        @RetryDecorator(exceptions.RequestTimeout)
+        def raise_unexpected_error():
+            raise WrongException("wrong exception")
+
+        self.assertRaises(WrongException, raise_unexpected_error)
+        self.assertFalse(self.sleep_fn.called)
author	Irena Berezovsky <irenab@mellanox.com>
	Sun, 29 Sep 2013 11:09:44 +0000 (13:09 +0200)
committer	Irena Berezovsky <irenab@mellanox.com>
	Sun, 1 Dec 2013 08:04:40 +0000 (10:04 +0200)
etc/neutron/plugins/mlnx/mlnx_conf.ini		patch \| blob \| history
neutron/plugins/mlnx/agent/eswitch_neutron_agent.py		patch \| blob \| history
neutron/plugins/mlnx/agent/utils.py		patch \| blob \| history
neutron/plugins/mlnx/common/comm_utils.py	[new file with mode: 0644]	patch \| blob
neutron/plugins/mlnx/common/config.py		patch \| blob \| history
neutron/plugins/mlnx/common/exceptions.py		patch \| blob \| history
neutron/tests/unit/mlnx/test_mlnx_comm_utils.py	[new file with mode: 0644]	patch \| blob