]> review.fuel-infra Code Review - openstack-build/neutron-build.git/commitdiff
Record and log reason for dhcp agent resync
authorIan Wienand <iwienand@redhat.com>
Tue, 18 Mar 2014 05:04:49 +0000 (16:04 +1100)
committerIan Wienand <iwienand@redhat.com>
Tue, 6 May 2014 04:44:43 +0000 (14:44 +1000)
A dhcp resync can be triggered at a number of points, but the actual
resync is done asynchronously by a helper thread.  This means by the
time the resync happens, it's hard to establish what actually caused
it.

There have been a number of problems in production systems that cause
excessive resyncs.  One is a ipv6/dnsmasq issue (rhbz#1077487) and
another is db corruption with duplicate entries [1].  The resync
triggers a whole lot of logs itself, so it becomes very unclear how to
establish any causality.

This change keeps track of what triggered the resync with some helpful
information.  We add a schedule_resync() function to to replace a
explicit set of needs_resync which tracks why it was called.

[1] The logs will contain output like "DBDuplicateEntry
(IntegrityError) (1062, "Duplicate entry
'6d799c6a-7a09-4c1e-bb63-7d30fd052c8a-d3e3ac5b-9962-428a-a9f8-6b2' for
key 'PRIMARY'") ..." in this case

Closes-Bug: #1294445
Change-Id: I9b1c6202f5a6bbad8589a8b64b2a38c9d9edb43b

neutron/agent/dhcp_agent.py
neutron/tests/unit/test_dhcp_agent.py

index 80846bc7d7663da3f2fd8f2cf19b2793d83280d4..42c95a02f4075890275b55c0f63cd1e270c43f2d 100644 (file)
@@ -67,7 +67,7 @@ class DhcpAgent(manager.Manager):
 
     def __init__(self, host=None):
         super(DhcpAgent, self).__init__(host=host)
-        self.needs_resync = False
+        self.needs_resync_reasons = []
         self.conf = cfg.CONF
         self.cache = NetworkCache()
         self.root_helper = config.get_root_helper(self.conf)
@@ -135,7 +135,7 @@ class DhcpAgent(manager.Manager):
                           'that the network and/or its subnet(s) still exist.')
                         % {'net_id': network.id, 'action': action})
         except Exception as e:
-            self.needs_resync = True
+            self.schedule_resync(e)
             if (isinstance(e, common.RemoteError)
                 and e.exc_type == 'NetworkNotFound'
                 or isinstance(e, exceptions.NetworkNotFound)):
@@ -144,6 +144,10 @@ class DhcpAgent(manager.Manager):
                 LOG.exception(_('Unable to %(action)s dhcp for %(net_id)s.')
                               % {'net_id': network.id, 'action': action})
 
+    def schedule_resync(self, reason):
+        """Schedule a resync for a given reason."""
+        self.needs_resync_reasons.append(reason)
+
     @utils.synchronized('dhcp-agent')
     def sync_state(self):
         """Sync the local DHCP state with Neutron."""
@@ -157,8 +161,8 @@ class DhcpAgent(manager.Manager):
             for deleted_id in known_network_ids - active_network_ids:
                 try:
                     self.disable_dhcp_helper(deleted_id)
-                except Exception:
-                    self.needs_resync = True
+                except Exception as e:
+                    self.schedule_resync(e)
                     LOG.exception(_('Unable to sync network state on deleted '
                                     'network %s'), deleted_id)
 
@@ -167,16 +171,22 @@ class DhcpAgent(manager.Manager):
             pool.waitall()
             LOG.info(_('Synchronizing state complete'))
 
-        except Exception:
-            self.needs_resync = True
+        except Exception as e:
+            self.schedule_resync(e)
             LOG.exception(_('Unable to sync network state.'))
 
     def _periodic_resync_helper(self):
         """Resync the dhcp state at the configured interval."""
         while True:
             eventlet.sleep(self.conf.resync_interval)
-            if self.needs_resync:
-                self.needs_resync = False
+            if self.needs_resync_reasons:
+                # be careful to avoid a race with additions to list
+                # from other threads
+                reasons = self.needs_resync_reasons
+                self.needs_resync_reasons = []
+                for r in reasons:
+                    LOG.debug(_("resync: %(reason)s"),
+                              {"reason": r})
                 self.sync_state()
 
     def periodic_resync(self):
@@ -189,8 +199,8 @@ class DhcpAgent(manager.Manager):
             if not network:
                 LOG.warn(_('Network %s has been deleted.'), network_id)
             return network
-        except Exception:
-            self.needs_resync = True
+        except Exception as e:
+            self.schedule_resync(e)
             LOG.exception(_('Network %s info call failed.'), network_id)
 
     def enable_dhcp_helper(self, network_id):
@@ -579,7 +589,8 @@ class DhcpAgentWithStateReport(DhcpAgent):
 
     def agent_updated(self, context, payload):
         """Handle the agent_updated notification event."""
-        self.needs_resync = True
+        self.schedule_resync(_("Agent updated: %(payload)s") %
+                             {"payload": payload})
         LOG.info(_("agent_updated by server side %s!"), payload)
 
     def after_start(self):
index 5978874cf32c153ad26961808c4aba3d99e89386..5b9c447873861a323053456fd0cc56b4dfceacff 100644 (file)
@@ -211,14 +211,16 @@ class TestDhcpAgent(base.BaseTestCase):
         self.driver.return_value.foo.side_effect = exc or Exception
         with mock.patch.object(dhcp_agent.LOG, trace_level) as log:
             dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
-            self.assertIsNone(dhcp.call_driver('foo', network))
-            self.driver.assert_called_once_with(cfg.CONF,
-                                                mock.ANY,
-                                                'sudo',
-                                                mock.ANY,
-                                                mock.ANY)
-            self.assertEqual(log.call_count, 1)
-            self.assertEqual(expected_sync, dhcp.needs_resync)
+            with mock.patch.object(dhcp,
+                                   'schedule_resync') as schedule_resync:
+                self.assertIsNone(dhcp.call_driver('foo', network))
+                self.driver.assert_called_once_with(cfg.CONF,
+                                                    mock.ANY,
+                                                    'sudo',
+                                                    mock.ANY,
+                                                    mock.ANY)
+                self.assertEqual(log.call_count, 1)
+                self.assertEqual(expected_sync, schedule_resync.called)
 
     def test_call_driver_failure(self):
         self._test_call_driver_failure()
@@ -298,10 +300,12 @@ class TestDhcpAgent(base.BaseTestCase):
 
             with mock.patch.object(dhcp_agent.LOG, 'exception') as log:
                 dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
-                dhcp.sync_state()
+                with mock.patch.object(dhcp,
+                                       'schedule_resync') as schedule_resync:
+                    dhcp.sync_state()
 
-                self.assertTrue(log.called)
-                self.assertTrue(dhcp.needs_resync)
+                    self.assertTrue(log.called)
+                    self.assertTrue(schedule_resync.called)
 
     def test_periodic_resync(self):
         dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
@@ -312,14 +316,14 @@ class TestDhcpAgent(base.BaseTestCase):
     def test_periodoc_resync_helper(self):
         with mock.patch.object(dhcp_agent.eventlet, 'sleep') as sleep:
             dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
-            dhcp.needs_resync = True
+            dhcp.needs_resync_reasons = ['reason1', 'reason2']
             with mock.patch.object(dhcp, 'sync_state') as sync_state:
                 sync_state.side_effect = RuntimeError
                 with testtools.ExpectedException(RuntimeError):
                     dhcp._periodic_resync_helper()
                 sync_state.assert_called_once_with()
                 sleep.assert_called_once_with(dhcp.conf.resync_interval)
-                self.assertFalse(dhcp.needs_resync)
+                self.assertEqual(len(dhcp.needs_resync_reasons), 0)
 
     def test_populate_cache_on_start_without_active_networks_support(self):
         # emul dhcp driver that doesn't support retrieving of active networks
@@ -468,14 +472,15 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
         self.mock_init_p = mock.patch('neutron.agent.dhcp_agent.'
                                       'DhcpAgent._populate_networks_cache')
         self.mock_init = self.mock_init_p.start()
-
         with mock.patch.object(dhcp.Dnsmasq,
                                'check_version') as check_v:
             check_v.return_value = dhcp.Dnsmasq.MINIMUM_VERSION
             self.dhcp = dhcp_agent.DhcpAgent(HOSTNAME)
         self.call_driver_p = mock.patch.object(self.dhcp, 'call_driver')
-
         self.call_driver = self.call_driver_p.start()
+        self.schedule_resync_p = mock.patch.object(self.dhcp,
+                                                   'schedule_resync')
+        self.schedule_resync = self.schedule_resync_p.start()
         self.external_process_p = mock.patch(
             'neutron.agent.linux.external_process.ProcessManager'
         )
@@ -525,7 +530,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
                 [mock.call.get_network_info('fake_id')])
             self.assertFalse(self.call_driver.called)
             self.assertTrue(log.called)
-            self.assertFalse(self.dhcp.needs_resync)
+            self.assertFalse(self.dhcp.schedule_resync.called)
 
     def test_enable_dhcp_helper_exception_during_rpc(self):
         self.plugin.get_network_info.side_effect = Exception
@@ -535,7 +540,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
                 [mock.call.get_network_info(fake_network.id)])
             self.assertFalse(self.call_driver.called)
             self.assertTrue(log.called)
-            self.assertTrue(self.dhcp.needs_resync)
+            self.assertTrue(self.schedule_resync.called)
             self.assertFalse(self.cache.called)
             self.assertFalse(self.external_process.called)
 
@@ -723,7 +728,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase):
             self.cache.assert_has_calls(
                 [mock.call.get_network_by_id('net-id')])
             self.assertTrue(log.called)
-            self.assertTrue(self.dhcp.needs_resync)
+            self.assertTrue(self.dhcp.schedule_resync.called)
 
     def test_subnet_update_end(self):
         payload = dict(subnet=dict(network_id=fake_network.id))