From 133a399d4f87a20cb06b6bae604a3666f36ebd95 Mon Sep 17 00:00:00 2001 From: Mike Kolesnik Date: Tue, 3 Feb 2015 10:58:47 +0200 Subject: [PATCH] Add proccess monitor to keepalived Adding process monitor to keepalived code, so that keepalived processes launched by the L3 agent will get monitored the same as other processes launched by the L3 agent. Old monitoring code was removed since it's not needed anymore. Implements: blueprint agent-child-processes-status Change-Id: I94a889ee07286ab3c6cdab9ab15e5aee6fbd133a --- neutron/agent/l3/ha.py | 12 +--- neutron/agent/l3/ha_router.py | 22 +++---- neutron/agent/linux/keepalived.py | 59 ++++++------------- .../functional/agent/linux/test_keepalived.py | 38 +++++++++--- .../agent/linux/test_process_monitor.py | 1 + .../tests/functional/agent/test_l3_agent.py | 4 +- 6 files changed, 60 insertions(+), 76 deletions(-) diff --git a/neutron/agent/l3/ha.py b/neutron/agent/l3/ha.py index dc34b93d1..5758abba0 100644 --- a/neutron/agent/l3/ha.py +++ b/neutron/agent/l3/ha.py @@ -21,7 +21,6 @@ from neutron.agent.linux import keepalived from neutron.common import constants as l3_constants from neutron.i18n import _LE from neutron.openstack.common import log as logging -from neutron.openstack.common import periodic_task LOG = logging.getLogger(__name__) @@ -69,17 +68,8 @@ class AgentMixin(object): ha_port['mac_address']) ri.ha_port = ha_port - ri._init_keepalived_manager() + ri._init_keepalived_manager(self.process_monitor) ri._add_keepalived_notifiers() def process_ha_router_removed(self, ri): ri.ha_network_removed() - - def get_ha_routers(self): - return (router for router in self.router_info.values() if router.is_ha) - - @periodic_task.periodic_task - def _ensure_keepalived_alive(self, context): - # TODO(amuller): Use external_process.ProcessMonitor - for router in self.get_ha_routers(): - router.keepalived_manager.revive() diff --git a/neutron/agent/l3/ha_router.py b/neutron/agent/l3/ha_router.py index b12813314..014746354 100644 --- a/neutron/agent/l3/ha_router.py +++ b/neutron/agent/l3/ha_router.py @@ -68,16 +68,13 @@ class HaRouter(router.RouterInfo): LOG.debug('Error while reading HA state for %s', self.router_id) return None - def get_keepalived_manager(self): - return keepalived.KeepalivedManager( + def _init_keepalived_manager(self, process_monitor): + self.keepalived_manager = keepalived.KeepalivedManager( self.router['id'], keepalived.KeepalivedConf(), conf_path=self.agent_conf.ha_confs_path, - namespace=self.ns_name) - - def _init_keepalived_manager(self): - # TODO(Carl) This looks a bit funny, doesn't it? - self.keepalived_manager = self.get_keepalived_manager() + namespace=self.ns_name, + process_monitor=process_monitor) config = self.keepalived_manager.config @@ -102,7 +99,7 @@ class HaRouter(router.RouterInfo): config.add_instance(instance) def spawn_keepalived(self): - self.keepalived_manager.spawn_or_restart() + self.keepalived_manager.spawn() def disable_keepalived(self): self.keepalived_manager.disable() @@ -213,13 +210,8 @@ class HaRouter(router.RouterInfo): it manage IPv4 addresses. In order to do that, we must delete the address first as it is autoconfigured by the kernel. """ - process = keepalived.KeepalivedManager.get_process( - self.agent_conf, - self.router_id, - self.ns_name, - self.agent_conf.ha_confs_path) - if process.active: - manager = self.get_keepalived_manager() + manager = self.keepalived_manager + if manager.get_process().active: conf = manager.get_conf_on_disk() managed_by_keepalived = conf and ipv6_lladdr in conf if managed_by_keepalived: diff --git a/neutron/agent/linux/keepalived.py b/neutron/agent/linux/keepalived.py index 028681555..c448f41ae 100644 --- a/neutron/agent/linux/keepalived.py +++ b/neutron/agent/linux/keepalived.py @@ -23,7 +23,6 @@ from oslo_config import cfg from neutron.agent.linux import external_process from neutron.agent.linux import utils from neutron.common import exceptions -from neutron.i18n import _LW from neutron.openstack.common import log as logging VALID_STATES = ['MASTER', 'BACKUP'] @@ -33,6 +32,7 @@ HA_DEFAULT_PRIORITY = 50 PRIMARY_VIP_RANGE_SIZE = 24 # TODO(amuller): Use L3 agent constant when new constants module is introduced. FIP_LL_SUBNET = '169.254.30.0/23' +KEEPALIVED_SERVICE_NAME = 'keepalived' LOG = logging.getLogger(__name__) @@ -365,14 +365,13 @@ class KeepalivedManager(KeepalivedNotifierMixin): """ def __init__(self, resource_id, config, conf_path='/tmp', - namespace=None): + namespace=None, process_monitor=None): self.resource_id = resource_id self.config = config self.namespace = namespace + self.process_monitor = process_monitor self.conf_path = conf_path - self.conf = cfg.CONF self.process = None - self.spawned = False def _output_config_file(self): config_str = self.config.get_config_str() @@ -393,11 +392,6 @@ class KeepalivedManager(KeepalivedNotifierMixin): def spawn(self): config_path = self._output_config_file() - self.process = self.get_process(self.conf, - self.resource_id, - self.namespace, - self.conf_path) - def callback(pid_file): cmd = ['keepalived', '-P', '-f', config_path, @@ -405,41 +399,26 @@ class KeepalivedManager(KeepalivedNotifierMixin): '-r', '%s-vrrp' % pid_file] return cmd - self.process.enable(callback, reload_cfg=True) + pm = self.get_process(callback=callback) + pm.enable(reload_cfg=True) - self.spawned = True - LOG.debug('Keepalived spawned with config %s', config_path) + self.process_monitor.register(uuid=self.resource_id, + service_name=KEEPALIVED_SERVICE_NAME, + monitored_process=pm) - def spawn_or_restart(self): - if self.process: - self.restart() - else: - self.spawn() - - def restart(self): - if self.process.active: - self._output_config_file() - self.process.reload_cfg() - else: - LOG.warn(_LW('A previous instance of keepalived seems to be dead, ' - 'unable to restart it, a new instance will be ' - 'spawned')) - self.process.disable() - self.spawn() + LOG.debug('Keepalived spawned with config %s', config_path) def disable(self): - if self.process: - self.process.disable(sig='15') - self.spawned = False + self.process_monitor.unregister(uuid=self.resource_id, + service_name=KEEPALIVED_SERVICE_NAME) - def revive(self): - if self.spawned and not self.process.active: - self.restart() + pm = self.get_process() + pm.disable(sig='15') - @classmethod - def get_process(cls, conf, resource_id, namespace, conf_path): + def get_process(self, callback=None): return external_process.ProcessManager( - conf, - resource_id, - namespace, - pids_path=conf_path) + cfg.CONF, + self.resource_id, + self.namespace, + pids_path=self.conf_path, + default_cmd_callback=callback) diff --git a/neutron/tests/functional/agent/linux/test_keepalived.py b/neutron/tests/functional/agent/linux/test_keepalived.py index 656905eea..9e1060ea0 100644 --- a/neutron/tests/functional/agent/linux/test_keepalived.py +++ b/neutron/tests/functional/agent/linux/test_keepalived.py @@ -17,6 +17,7 @@ from oslo_config import cfg from neutron.agent.linux import external_process from neutron.agent.linux import keepalived +from neutron.agent.linux import utils from neutron.tests import base from neutron.tests.unit.agent.linux import test_keepalived @@ -24,13 +25,21 @@ from neutron.tests.unit.agent.linux import test_keepalived class KeepalivedManagerTestCase(base.BaseTestCase, test_keepalived.KeepalivedConfBaseMixin): - def test_keepalived_spawn(self): - expected_config = self._get_config() - manager = keepalived.KeepalivedManager('router1', expected_config, - conf_path=cfg.CONF.state_path) - self.addCleanup(manager.disable) + def setUp(self): + super(KeepalivedManagerTestCase, self).setUp() + cfg.CONF.set_override('check_child_processes_interval', 1, 'AGENT') + + self.expected_config = self._get_config() + self.process_monitor = external_process.ProcessMonitor(cfg.CONF, + 'router') + self.manager = keepalived.KeepalivedManager( + 'router1', self.expected_config, conf_path=cfg.CONF.state_path, + process_monitor=self.process_monitor) + self.addCleanup(self.manager.get_process().disable) + self.addCleanup(self.process_monitor.stop) - manager.spawn() + def test_keepalived_spawn(self): + self.manager.spawn() process = external_process.ProcessManager( cfg.CONF, 'router1', @@ -38,5 +47,18 @@ class KeepalivedManagerTestCase(base.BaseTestCase, pids_path=cfg.CONF.state_path) self.assertTrue(process.active) - self.assertEqual(expected_config.get_config_str(), - manager.get_conf_on_disk()) + self.assertEqual(self.expected_config.get_config_str(), + self.manager.get_conf_on_disk()) + + def test_keepalived_respawns(self): + self.manager.spawn() + process = self.manager.get_process() + self.assertTrue(process.active) + + process.disable(sig='15') + + utils.wait_until_true( + lambda: process.active, + timeout=5, + sleep=0.01, + exception=RuntimeError(_("Keepalived didn't respawn"))) diff --git a/neutron/tests/functional/agent/linux/test_process_monitor.py b/neutron/tests/functional/agent/linux/test_process_monitor.py index aca151dd6..609c79cae 100644 --- a/neutron/tests/functional/agent/linux/test_process_monitor.py +++ b/neutron/tests/functional/agent/linux/test_process_monitor.py @@ -34,6 +34,7 @@ class BaseTestProcessMonitor(base.BaseTestCase): self._process_monitor = None self.create_child_processes_manager('respawn') self.addCleanup(self.cleanup_spawned_children) + self.addCleanup(self._process_monitor.stop) def create_child_processes_manager(self, action): cfg.CONF.set_override('check_child_processes_action', action, 'AGENT') diff --git a/neutron/tests/functional/agent/test_l3_agent.py b/neutron/tests/functional/agent/test_l3_agent.py index 940091664..f8569370b 100755 --- a/neutron/tests/functional/agent/test_l3_agent.py +++ b/neutron/tests/functional/agent/test_l3_agent.py @@ -428,14 +428,14 @@ class L3AgentTestCase(L3AgentTestFramework): if enable_ha: self._assert_ha_device(router) - self.assertTrue(router.keepalived_manager.process.active) + self.assertTrue(router.keepalived_manager.get_process().active) self._delete_router(self.agent, router.router_id) self._assert_interfaces_deleted_from_ovs() self._assert_router_does_not_exist(router) if enable_ha: - self.assertFalse(router.keepalived_manager.process.active) + self.assertFalse(router.keepalived_manager.get_process().active) def _assert_external_device(self, router): external_port = router.get_ex_gw_port() -- 2.45.2