]> review.fuel-infra Code Review - openstack-build/neutron-build.git/commitdiff
Add request timeout handling for Mellanox Neutron Agent
authorIrena Berezovsky <irenab@mellanox.com>
Sun, 29 Sep 2013 11:09:44 +0000 (13:09 +0200)
committerIrena Berezovsky <irenab@mellanox.com>
Sun, 1 Dec 2013 08:04:40 +0000 (10:04 +0200)
Add request timeout handling for messages sent to eswitch Daemon.
Using configurable number of retries and increasing waiting interval
between retries resend the message.
If request timeout persists, eswitch daemon is not reachable.
In such case, exit the agent process.

Closes-Bug: #1228827

Change-Id: If1290bedafe7a0dd8a61cbe328510075edeb1e5b

etc/neutron/plugins/mlnx/mlnx_conf.ini
neutron/plugins/mlnx/agent/eswitch_neutron_agent.py
neutron/plugins/mlnx/agent/utils.py
neutron/plugins/mlnx/common/comm_utils.py [new file with mode: 0644]
neutron/plugins/mlnx/common/config.py
neutron/plugins/mlnx/common/exceptions.py
neutron/tests/unit/mlnx/test_mlnx_comm_utils.py [new file with mode: 0644]

index c3e5cc88eac930d7258d03421709cae07aa4e1a5..841947904316ca2e985506d99b9a6755495ebd2e 100644 (file)
 # vnic_type = mlnx_direct
 
 # (StrOpt) Eswitch daemon end point connection url
-# daemon_endpoint = 'tcp://127.0.0.1:5001'
+# daemon_endpoint = 'tcp://127.0.0.1:60001'
 
 # The number of milliseconds the agent will wait for
 # response on request to daemon
 # request_timeout = 3000
 
+# The number of retries the agent will send request
+# to daemon before giving up
+# retries = 3
+
+# The backoff rate multiplier for waiting period between retries
+# on request to daemon, i.e. value of 2 will double
+# the request timeout each retry
+# backoff_rate = 2
 
 [agent]
 # Agent's polling interval in seconds
index c82dd9f83b01629f66e3c5332f13bd50c919daf6..ae3ce98b67995b0fb83c4bf45f5322a9407626c6 100644 (file)
@@ -392,6 +392,10 @@ class MlnxEswitchNeutronAgent(sg_rpc.SecurityGroupAgentRpcMixin):
                     # If treat devices fails - must resync with plugin
                     sync = self.process_network_ports(port_info)
                     ports = port_info['current']
+            except exceptions.RequestTimeout:
+                LOG.exception(_("Request timeout in agent event loop "
+                                "eSwitchD is not responding - exiting..."))
+                raise SystemExit(1)
             except Exception:
                 LOG.exception(_("Error in agent event loop"))
                 sync = True
index de05455c0d81744f461547510c4212e23335bba2..dd4ccf0af5abb91a53842cb89c4b18fc9fd6cafc 100644 (file)
@@ -19,6 +19,7 @@ import zmq
 
 from neutron.openstack.common import jsonutils
 from neutron.openstack.common import log as logging
+from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
 from neutron.plugins.mlnx.common import exceptions
 
 LOG = logging.getLogger(__name__)
@@ -42,6 +43,7 @@ class EswitchUtils(object):
             self.poller.register(self._conn, zmq.POLLIN)
         return self.__conn
 
+    @RetryDecorator(exceptions.RequestTimeout)
     def send_msg(self, msg):
         self._conn.send(msg)
 
@@ -55,7 +57,7 @@ class EswitchUtils(object):
             self._conn.close()
             self.poller.unregister(self._conn)
             self.__conn = None
-            raise exceptions.MlnxException(_("eSwitchD: Request timeout"))
+            raise exceptions.RequestTimeout()
 
     def parse_response_msg(self, recv_msg):
         msg = jsonutils.loads(recv_msg)
@@ -69,7 +71,7 @@ class EswitchUtils(object):
         else:
             error_msg = _("Unknown operation status %s") % msg['status']
         LOG.error(error_msg)
-        raise exceptions.MlnxException(error_msg)
+        raise exceptions.OperationFailed(err_msg=error_msg)
 
     def get_attached_vnics(self):
         LOG.debug(_("get_attached_vnics"))
diff --git a/neutron/plugins/mlnx/common/comm_utils.py b/neutron/plugins/mlnx/common/comm_utils.py
new file mode 100644 (file)
index 0000000..a1a0f4a
--- /dev/null
@@ -0,0 +1,66 @@
+# vim: tabstop=4 shiftwidth=4 softtabstop=4
+#
+# Copyright 2013 Mellanox Technologies, Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+from oslo.config import cfg
+
+from neutron.openstack.common import log as logging
+from neutron.plugins.mlnx.common import config  # noqa
+
+LOG = logging.getLogger(__name__)
+
+
+class RetryDecorator(object):
+    """Retry decorator reruns a method 'retries' times if an exception occurs.
+
+    Decorator for retrying a method if exceptionToCheck exception occurs
+    If method raises exception, retries 'retries' times with increasing
+    back off period between calls with 'interval' multiplier
+
+    :param exceptionToCheck: the exception to check
+    :param interval: initial delay between retries in seconds
+    :param retries: number of times to try before giving up
+    :raises: exceptionToCheck
+    """
+    sleep_fn = time.sleep
+
+    def __init__(self, exceptionToCheck,
+                 interval=cfg.CONF.ESWITCH.request_timeout / 1000,
+                 retries=cfg.CONF.ESWITCH.retries,
+                 backoff_rate=cfg.CONF.ESWITCH.backoff_rate):
+        self.exc = exceptionToCheck
+        self.interval = interval
+        self.retries = retries
+        self.backoff_rate = backoff_rate
+
+    def __call__(self, original_func):
+        def decorated(*args, **kwargs):
+            sleep_interval = self.interval
+            num_of_iter = self.retries
+            while num_of_iter > 0:
+                try:
+                    return original_func(*args, **kwargs)
+                except self.exc:
+                    LOG.debug(_("Request timeout - call again after "
+                              "%s seconds"), sleep_interval)
+                    RetryDecorator.sleep_fn(sleep_interval)
+                    num_of_iter -= 1
+                    sleep_interval *= self.backoff_rate
+
+            return original_func(*args, **kwargs)
+        return decorated
index adf868ea7e61719532580b7c1a48536fb749b7e8..f5115845bf7dcd90a3eb8490e9f9d846a5739b01 100644 (file)
@@ -48,6 +48,13 @@ eswitch_opts = [
     cfg.IntOpt('request_timeout', default=3000,
                help=_("The number of milliseconds the agent will wait for "
                       "response on request to daemon.")),
+    cfg.IntOpt('retries', default=3,
+               help=_("The number of retries the agent will send request "
+                      "to daemon before giving up")),
+    cfg.IntOpt('backoff_rate', default=2,
+               help=_("backoff rate multiplier for waiting period between "
+                      "retries for request to daemon, i.e. value of 2 will "
+                      " double the request timeout each retry")),
 ]
 
 agent_opts = [
index 54355a001132d0fccd8f13672a539bb8225c88d7..6fd168215293db66e42057b41bd2839be6369467 100644 (file)
@@ -20,3 +20,11 @@ from neutron.common import exceptions as qexc
 
 class MlnxException(qexc.NeutronException):
     message = _("Mlnx Exception: %(err_msg)s")
+
+
+class RequestTimeout(qexc.NeutronException):
+    message = _("Request Timeout: no response from eSwitchD")
+
+
+class OperationFailed(qexc.NeutronException):
+    message = _("Operation Failed: %(err_msg)s")
diff --git a/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py b/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py
new file mode 100644 (file)
index 0000000..00659e0
--- /dev/null
@@ -0,0 +1,139 @@
+# Copyright (c) 2013 OpenStack Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import mock
+from oslo.config import cfg
+
+from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
+from neutron.plugins.mlnx.common import config  # noqa
+from neutron.plugins.mlnx.common import exceptions
+from neutron.tests import base
+
+
+class WrongException(Exception):
+        pass
+
+
+class TestRetryDecorator(base.BaseTestCase):
+    def setUp(self):
+        super(TestRetryDecorator, self).setUp()
+        self.sleep_fn_p = mock.patch.object(RetryDecorator, 'sleep_fn')
+        self.sleep_fn = self.sleep_fn_p.start()
+        self.addCleanup(self.sleep_fn_p.stop)
+
+    def test_no_retry_required(self):
+        self.counter = 0
+
+        @RetryDecorator(exceptions.RequestTimeout, interval=2,
+                        retries=3, backoff_rate=2)
+        def succeeds():
+            self.counter += 1
+            return 'success'
+
+        ret = succeeds()
+        self.assertFalse(self.sleep_fn.called)
+        self.assertEqual(ret, 'success')
+        self.assertEqual(self.counter, 1)
+
+    def test_retry_zero_times(self):
+        self.counter = 0
+        interval = 2
+        backoff_rate = 2
+        retries = 0
+
+        @RetryDecorator(exceptions.RequestTimeout, interval,
+                        retries, backoff_rate)
+        def always_fails():
+            self.counter += 1
+            raise exceptions.RequestTimeout()
+
+        self.assertRaises(exceptions.RequestTimeout, always_fails)
+        self.assertEqual(self.counter, 1)
+        self.assertFalse(self.sleep_fn.called)
+
+    def test_retries_once(self):
+        self.counter = 0
+        interval = 2
+        backoff_rate = 2
+        retries = 3
+
+        @RetryDecorator(exceptions.RequestTimeout, interval,
+                        retries, backoff_rate)
+        def fails_once():
+            self.counter += 1
+            if self.counter < 2:
+                raise exceptions.RequestTimeout()
+            else:
+                return 'success'
+
+        ret = fails_once()
+        self.assertEqual(ret, 'success')
+        self.assertEqual(self.counter, 2)
+        self.assertEqual(self.sleep_fn.call_count, 1)
+        self.sleep_fn.assert_called_with(interval)
+
+    def test_limit_is_reached(self):
+        self.counter = 0
+        retries = 3
+        interval = 2
+        backoff_rate = 4
+
+        @RetryDecorator(exceptions.RequestTimeout, interval,
+                        retries, backoff_rate)
+        def always_fails():
+            self.counter += 1
+            raise exceptions.RequestTimeout()
+
+        self.assertRaises(exceptions.RequestTimeout, always_fails)
+        self.assertEqual(self.counter, retries + 1)
+        self.assertEqual(self.sleep_fn.call_count, retries)
+
+        expected_sleep_fn_arg = []
+        for i in range(retries):
+            expected_sleep_fn_arg.append(interval)
+            interval *= backoff_rate
+
+        self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
+
+    def test_limit_is_reached_with_conf(self):
+        self.counter = 0
+
+        @RetryDecorator(exceptions.RequestTimeout)
+        def always_fails():
+            self.counter += 1
+            raise exceptions.RequestTimeout()
+
+        retry = cfg.CONF.ESWITCH.retries
+        interval = cfg.CONF.ESWITCH.request_timeout / 1000
+        delay_rate = cfg.CONF.ESWITCH.backoff_rate
+
+        expected_sleep_fn_arg = []
+        for i in range(retry):
+            expected_sleep_fn_arg.append(interval)
+            interval *= delay_rate
+
+        self.assertRaises(exceptions.RequestTimeout, always_fails)
+        self.assertEqual(self.counter, retry + 1)
+        self.assertEqual(self.sleep_fn.call_count, retry)
+        self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
+
+    def test_wrong_exception_no_retry(self):
+
+        @RetryDecorator(exceptions.RequestTimeout)
+        def raise_unexpected_error():
+            raise WrongException("wrong exception")
+
+        self.assertRaises(WrongException, raise_unexpected_error)
+        self.assertFalse(self.sleep_fn.called)