From bb7cce32f3ca5ccf903e13d37c2c62a0222090da Mon Sep 17 00:00:00 2001 From: Terry Wilson Date: Wed, 15 Oct 2014 20:56:17 -0500 Subject: [PATCH] Only resync DHCP for a particular network when their is a failure The previous implementation will loop through and restart the dhcp process for all active networks any time there is an exception calling a dhcp driver function. This allows a tenant who can create an exception to cause every dhcp process to restart. On systems with lots of networks this can easily take longer than the default resync timeout leading to a system that becomes unresponsive because of the load continually restarting causes. This patch restarts only dhcp processes related to the network on which operations are failing. It should be noted that if there was some kind of missed notification for a subnet update, the previous implementation may have incidentally fixed it by restarting everything on the off chance that something else caused an exception, but obviously relying on that would be a bad idea as exceptions should be, well, exceptional. Closes-bug: #1384402 Change-Id: I0b348a1657a7eb3a595f9bf6b217716a37ce38c6 --- neutron/agent/dhcp_agent.py | 41 +++++++++++++++++---------- neutron/tests/unit/test_dhcp_agent.py | 4 +-- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/neutron/agent/dhcp_agent.py b/neutron/agent/dhcp_agent.py index 5a501faa9..cd3d8f4d0 100644 --- a/neutron/agent/dhcp_agent.py +++ b/neutron/agent/dhcp_agent.py @@ -13,6 +13,7 @@ # License for the specific language governing permissions and limitations # under the License. +import collections import os import sys @@ -68,7 +69,7 @@ class DhcpAgent(manager.Manager): def __init__(self, host=None): super(DhcpAgent, self).__init__(host=host) - self.needs_resync_reasons = [] + self.needs_resync_reasons = collections.defaultdict(list) self.conf = cfg.CONF self.cache = NetworkCache() self.root_helper = config.get_root_helper(self.conf) @@ -136,7 +137,7 @@ class DhcpAgent(manager.Manager): 'that the network and/or its subnet(s) still exist.') % {'net_id': network.id, 'action': action}) except Exception as e: - self.schedule_resync(e) + self.schedule_resync(e, network.id) if (isinstance(e, n_rpc.RemoteError) and e.exc_type == 'NetworkNotFound' or isinstance(e, exceptions.NetworkNotFound)): @@ -145,13 +146,18 @@ class DhcpAgent(manager.Manager): LOG.exception(_('Unable to %(action)s dhcp for %(net_id)s.') % {'net_id': network.id, 'action': action}) - def schedule_resync(self, reason): - """Schedule a resync for a given reason.""" - self.needs_resync_reasons.append(reason) + def schedule_resync(self, reason, network=None): + """Schedule a resync for a given network and reason. If no network is + specified, resync all networks. + """ + self.needs_resync_reasons[network].append(reason) @utils.synchronized('dhcp-agent') - def sync_state(self): - """Sync the local DHCP state with Neutron.""" + def sync_state(self, networks=None): + """Sync the local DHCP state with Neutron. If no networks are passed, + or 'None' is one of the networks, sync all of the networks. + """ + only_nets = set([] if (not networks or None in networks) else networks) LOG.info(_('Synchronizing state')) pool = eventlet.GreenPool(cfg.CONF.num_sync_threads) known_network_ids = set(self.cache.get_network_ids()) @@ -163,12 +169,15 @@ class DhcpAgent(manager.Manager): try: self.disable_dhcp_helper(deleted_id) except Exception as e: - self.schedule_resync(e) + self.schedule_resync(e, deleted_id) LOG.exception(_('Unable to sync network state on deleted ' 'network %s'), deleted_id) for network in active_networks: - pool.spawn(self.safe_configure_dhcp_for_network, network) + if (not only_nets or # specifically resync all + network.id not in known_network_ids or # missing net + network.id in only_nets): # specific network to sync + pool.spawn(self.safe_configure_dhcp_for_network, network) pool.waitall() LOG.info(_('Synchronizing state complete')) @@ -185,11 +194,13 @@ class DhcpAgent(manager.Manager): # be careful to avoid a race with additions to list # from other threads reasons = self.needs_resync_reasons - self.needs_resync_reasons = [] - for r in reasons: - LOG.debug(_("resync: %(reason)s"), - {"reason": r}) - self.sync_state() + self.needs_resync_reasons = collections.defaultdict(list) + for net, r in reasons.items(): + if not net: + net = "*" + LOG.debug(_("resync (%(network)s): %(reason)s"), + {"reason": r, "network": net}) + self.sync_state(reasons.keys()) def periodic_resync(self): """Spawn a thread to periodically resync the dhcp state.""" @@ -202,7 +213,7 @@ class DhcpAgent(manager.Manager): LOG.warn(_('Network %s has been deleted.'), network_id) return network except Exception as e: - self.schedule_resync(e) + self.schedule_resync(e, network_id) LOG.exception(_('Network %s info call failed.'), network_id) def enable_dhcp_helper(self, network_id): diff --git a/neutron/tests/unit/test_dhcp_agent.py b/neutron/tests/unit/test_dhcp_agent.py index c6f98643f..009075ec2 100644 --- a/neutron/tests/unit/test_dhcp_agent.py +++ b/neutron/tests/unit/test_dhcp_agent.py @@ -333,12 +333,12 @@ class TestDhcpAgent(base.BaseTestCase): def test_periodoc_resync_helper(self): with mock.patch.object(dhcp_agent.eventlet, 'sleep') as sleep: dhcp = dhcp_agent.DhcpAgent(HOSTNAME) - dhcp.needs_resync_reasons = ['reason1', 'reason2'] + dhcp.needs_resync_reasons = {'a': 'reason1', 'b': 'reason2'} with mock.patch.object(dhcp, 'sync_state') as sync_state: sync_state.side_effect = RuntimeError with testtools.ExpectedException(RuntimeError): dhcp._periodic_resync_helper() - sync_state.assert_called_once_with() + sync_state.assert_called_once_with(['a', 'b']) sleep.assert_called_once_with(dhcp.conf.resync_interval) self.assertEqual(len(dhcp.needs_resync_reasons), 0) -- 2.45.2