]> review.fuel-infra Code Review - openstack-build/neutron-build.git/commitdiff
Only resync DHCP for a particular network when their is a failure
authorTerry Wilson <twilson@redhat.com>
Thu, 16 Oct 2014 01:56:17 +0000 (20:56 -0500)
committerTerry Wilson <twilson@redhat.com>
Wed, 22 Oct 2014 19:23:03 +0000 (19:23 +0000)
The previous implementation will loop through and restart the dhcp
process for all active networks any time there is an exception calling
a dhcp driver function. This allows a tenant who can create an exception
to cause every dhcp process to restart. On systems with lots of networks
this can easily take longer than the default resync timeout leading to a
system that becomes unresponsive because of the load continually restarting
causes.

This patch restarts only dhcp processes related to the network on which
operations are failing. It should be noted that if there was some kind
of missed notification for a subnet update, the previous implementation
may have incidentally fixed it by restarting everything on the off
chance that something else caused an exception, but obviously relying
on that would be a bad idea as exceptions should be, well, exceptional.

Closes-bug: #1384402

Change-Id: I0b348a1657a7eb3a595f9bf6b217716a37ce38c6

neutron/agent/dhcp_agent.py
neutron/tests/unit/test_dhcp_agent.py

index 5a501faa91cba10606ea1d86941b94a23abbf33c..cd3d8f4d090132119bf9cb18832e29b4625bbb2a 100644 (file)
@@ -13,6 +13,7 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.
 
+import collections
 import os
 import sys
 
@@ -68,7 +69,7 @@ class DhcpAgent(manager.Manager):
 
     def __init__(self, host=None):
         super(DhcpAgent, self).__init__(host=host)
-        self.needs_resync_reasons = []
+        self.needs_resync_reasons = collections.defaultdict(list)
         self.conf = cfg.CONF
         self.cache = NetworkCache()
         self.root_helper = config.get_root_helper(self.conf)
@@ -136,7 +137,7 @@ class DhcpAgent(manager.Manager):
                           'that the network and/or its subnet(s) still exist.')
                         % {'net_id': network.id, 'action': action})
         except Exception as e:
-            self.schedule_resync(e)
+            self.schedule_resync(e, network.id)
             if (isinstance(e, n_rpc.RemoteError)
                 and e.exc_type == 'NetworkNotFound'
                 or isinstance(e, exceptions.NetworkNotFound)):
@@ -145,13 +146,18 @@ class DhcpAgent(manager.Manager):
                 LOG.exception(_('Unable to %(action)s dhcp for %(net_id)s.')
                               % {'net_id': network.id, 'action': action})
 
-    def schedule_resync(self, reason):
-        """Schedule a resync for a given reason."""
-        self.needs_resync_reasons.append(reason)
+    def schedule_resync(self, reason, network=None):
+        """Schedule a resync for a given network and reason. If no network is
+        specified, resync all networks.
+        """
+        self.needs_resync_reasons[network].append(reason)
 
     @utils.synchronized('dhcp-agent')
-    def sync_state(self):
-        """Sync the local DHCP state with Neutron."""
+    def sync_state(self, networks=None):
+        """Sync the local DHCP state with Neutron. If no networks are passed,
+        or 'None' is one of the networks, sync all of the networks.
+        """
+        only_nets = set([] if (not networks or None in networks) else networks)
         LOG.info(_('Synchronizing state'))
         pool = eventlet.GreenPool(cfg.CONF.num_sync_threads)
         known_network_ids = set(self.cache.get_network_ids())
@@ -163,12 +169,15 @@ class DhcpAgent(manager.Manager):
                 try:
                     self.disable_dhcp_helper(deleted_id)
                 except Exception as e:
-                    self.schedule_resync(e)
+                    self.schedule_resync(e, deleted_id)
                     LOG.exception(_('Unable to sync network state on deleted '
                                     'network %s'), deleted_id)
 
             for network in active_networks:
-                pool.spawn(self.safe_configure_dhcp_for_network, network)
+                if (not only_nets or  # specifically resync all
+                        network.id not in known_network_ids or  # missing net
+                        network.id in only_nets):  # specific network to sync
+                    pool.spawn(self.safe_configure_dhcp_for_network, network)
             pool.waitall()
             LOG.info(_('Synchronizing state complete'))
 
@@ -185,11 +194,13 @@ class DhcpAgent(manager.Manager):
                 # be careful to avoid a race with additions to list
                 # from other threads
                 reasons = self.needs_resync_reasons
-                self.needs_resync_reasons = []
-                for r in reasons:
-                    LOG.debug(_("resync: %(reason)s"),
-                              {"reason": r})
-                self.sync_state()
+                self.needs_resync_reasons = collections.defaultdict(list)
+                for net, r in reasons.items():
+                    if not net:
+                        net = "*"
+                    LOG.debug(_("resync (%(network)s): %(reason)s"),
+                              {"reason": r, "network": net})
+                self.sync_state(reasons.keys())
 
     def periodic_resync(self):
         """Spawn a thread to periodically resync the dhcp state."""
@@ -202,7 +213,7 @@ class DhcpAgent(manager.Manager):
                 LOG.warn(_('Network %s has been deleted.'), network_id)
             return network
         except Exception as e:
-            self.schedule_resync(e)
+            self.schedule_resync(e, network_id)
             LOG.exception(_('Network %s info call failed.'), network_id)
 
     def enable_dhcp_helper(self, network_id):
index c6f98643fe8e309f1fbb20bc2aa4e0e90d1c4ea0..009075ec2eec32271b7bc9102c868673b77e7ef9 100644 (file)
@@ -333,12 +333,12 @@ class TestDhcpAgent(base.BaseTestCase):
     def test_periodoc_resync_helper(self):
         with mock.patch.object(dhcp_agent.eventlet, 'sleep') as sleep:
             dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
-            dhcp.needs_resync_reasons = ['reason1', 'reason2']
+            dhcp.needs_resync_reasons = {'a': 'reason1', 'b': 'reason2'}
             with mock.patch.object(dhcp, 'sync_state') as sync_state:
                 sync_state.side_effect = RuntimeError
                 with testtools.ExpectedException(RuntimeError):
                     dhcp._periodic_resync_helper()
-                sync_state.assert_called_once_with()
+                sync_state.assert_called_once_with(['a', 'b'])
                 sleep.assert_called_once_with(dhcp.conf.resync_interval)
                 self.assertEqual(len(dhcp.needs_resync_reasons), 0)