]> review.fuel-infra Code Review - openstack-build/neutron-build.git/commitdiff
Kill the vrrp orphan process when (re)spawn keepalived
authorHong Hui Xiao <xiaohhui@cn.ibm.com>
Wed, 4 Nov 2015 06:44:43 +0000 (01:44 -0500)
committerHong Hui Xiao <xiaohhui@cn.ibm.com>
Thu, 17 Dec 2015 14:58:49 +0000 (09:58 -0500)
When keepalived crashed unexpectedly, the vrrp process that
it associates with will be orphan process. This will make
the VIP unable to migrate to the router in the same host.
Also, neutron code is not able to respawn the keepalived
process, because keepalived thinks itself is still running,
according to [1-3]. As a result, neutron will report respawning
keepalived all the time. Restart l3-agent will not help.

This patch will check and delete the orphan vrrp process
if there is any, in the processmonitor of l3 agent.

More details can be found in the bug description and comments.

[1] https://goo.gl/W3GL9I
[2] https://goo.gl/F0Ixfb
[3] https://goo.gl/dUqhTo

Change-Id: Ia1759ed1365b845d404686a8cd25f882cce35caf
Closes-Bug: #1511311

neutron/agent/linux/keepalived.py
neutron/tests/functional/agent/linux/test_keepalived.py

index ba9c74e6bb8032b79c07214950eaacad12065381..96b832c7f8c2d36a2afe07eb0345ce86ce961a42 100644 (file)
@@ -383,19 +383,18 @@ class KeepalivedManager(object):
     def spawn(self):
         config_path = self._output_config_file()
 
-        def callback(pid_file):
-            cmd = ['keepalived', '-P',
-                   '-f', config_path,
-                   '-p', pid_file,
-                   '-r', '%s-vrrp' % pid_file]
-            return cmd
+        keepalived_pm = self.get_process()
+        vrrp_pm = self._get_vrrp_process(
+            '%s-vrrp' % keepalived_pm.get_pid_file_name())
 
-        pm = self.get_process(callback=callback)
-        pm.enable(reload_cfg=True)
+        keepalived_pm.default_cmd_callback = (
+            self._get_keepalived_process_callback(vrrp_pm, config_path))
+
+        keepalived_pm.enable(reload_cfg=True)
 
         self.process_monitor.register(uuid=self.resource_id,
                                       service_name=KEEPALIVED_SERVICE_NAME,
-                                      monitored_process=pm)
+                                      monitored_process=keepalived_pm)
 
         LOG.debug('Keepalived spawned with config %s', config_path)
 
@@ -413,3 +412,27 @@ class KeepalivedManager(object):
             self.namespace,
             pids_path=self.conf_path,
             default_cmd_callback=callback)
+
+    def _get_vrrp_process(self, pid_file):
+        return external_process.ProcessManager(
+            cfg.CONF,
+            self.resource_id,
+            self.namespace,
+            pid_file=pid_file)
+
+    def _get_keepalived_process_callback(self, vrrp_pm, config_path):
+
+        def callback(pid_file):
+            # If keepalived process crashed unexpectedly, the vrrp process
+            # will be orphan and prevent keepalived process to be spawned.
+            # A check here will let the l3-agent to kill the orphan process
+            # and spawn keepalived successfully.
+            if vrrp_pm.active:
+                vrrp_pm.disable()
+            cmd = ['keepalived', '-P',
+                   '-f', config_path,
+                   '-p', pid_file,
+                   '-r', '%s-vrrp' % pid_file]
+            return cmd
+
+        return callback
index d58f364cad17e9d2440fb9c8c004c3018763cbfc..47c2613cbfcb47fe6f0d05e3872c678b32a694c7 100644 (file)
 #    License for the specific language governing permissions and limitations
 #    under the License.
 
-from distutils import version
-import re
-
 from oslo_config import cfg
 from oslo_log import log as logging
-import testtools
 
 from neutron._i18n import _
 from neutron.agent.linux import external_process
@@ -31,18 +27,6 @@ from neutron.tests.unit.agent.linux import test_keepalived
 LOG = logging.getLogger(__name__)
 
 
-def keepalived_version_not_supported():
-    try:
-        cmd = ['keepalived', '--version']
-        out = utils.execute(cmd, return_stderr=True)
-        ver = re.search(r"Keepalived v(\d+\.\d+\.\d+)", out[1]).group(1)
-        return version.LooseVersion(ver) >= version.LooseVersion("1.2.11")
-    except (OSError, RuntimeError, IndexError, ValueError) as e:
-        LOG.debug("Exception while checking keepalived version. "
-                  "Exception: %s", e)
-        return False
-
-
 class KeepalivedManagerTestCase(base.BaseTestCase,
                                 test_keepalived.KeepalivedConfBaseMixin):
 
@@ -70,8 +54,7 @@ class KeepalivedManagerTestCase(base.BaseTestCase,
         self.assertEqual(self.expected_config.get_config_str(),
                          self.manager.get_conf_on_disk())
 
-    @testtools.skipIf(keepalived_version_not_supported(), 'bug/1511311')
-    def test_keepalived_respawns(self):
+    def _test_keepalived_respawns(self, normal_exit=True):
         self.manager.spawn()
         process = self.manager.get_process()
         pid = process.pid
@@ -81,11 +64,19 @@ class KeepalivedManagerTestCase(base.BaseTestCase,
             sleep=0.01,
             exception=RuntimeError(_("Keepalived didn't spawn")))
 
-        # force process crash, and see that when it comes back
-        # it's indeed a different process
-        utils.execute(['kill', '-9', pid], run_as_root=True)
+        exit_code = '-15' if normal_exit else '-9'
+
+        # Exit the process, and see that when it comes back
+        # It's indeed a different process
+        utils.execute(['kill', exit_code, pid], run_as_root=True)
         utils.wait_until_true(
             lambda: process.active and pid != process.pid,
             timeout=5,
             sleep=0.01,
             exception=RuntimeError(_("Keepalived didn't respawn")))
+
+    def test_keepalived_respawns(self):
+        self._test_keepalived_respawns()
+
+    def test_keepalived_respawn_with_unexpected_exit(self):
+        self._test_keepalived_respawns(False)