From: Ian Wienand Date: Tue, 18 Mar 2014 05:04:49 +0000 (+1100) Subject: Record and log reason for dhcp agent resync X-Git-Url: https://review.fuel-infra.org/gitweb?a=commitdiff_plain;h=77ddd463bd2e96041c0b6ecea1c46c4ac4d55851;p=openstack-build%2Fneutron-build.git Record and log reason for dhcp agent resync A dhcp resync can be triggered at a number of points, but the actual resync is done asynchronously by a helper thread. This means by the time the resync happens, it's hard to establish what actually caused it. There have been a number of problems in production systems that cause excessive resyncs. One is a ipv6/dnsmasq issue (rhbz#1077487) and another is db corruption with duplicate entries [1]. The resync triggers a whole lot of logs itself, so it becomes very unclear how to establish any causality. This change keeps track of what triggered the resync with some helpful information. We add a schedule_resync() function to to replace a explicit set of needs_resync which tracks why it was called. [1] The logs will contain output like "DBDuplicateEntry (IntegrityError) (1062, "Duplicate entry '6d799c6a-7a09-4c1e-bb63-7d30fd052c8a-d3e3ac5b-9962-428a-a9f8-6b2' for key 'PRIMARY'") ..." in this case Closes-Bug: #1294445 Change-Id: I9b1c6202f5a6bbad8589a8b64b2a38c9d9edb43b --- diff --git a/neutron/agent/dhcp_agent.py b/neutron/agent/dhcp_agent.py index 80846bc7d..42c95a02f 100644 --- a/neutron/agent/dhcp_agent.py +++ b/neutron/agent/dhcp_agent.py @@ -67,7 +67,7 @@ class DhcpAgent(manager.Manager): def __init__(self, host=None): super(DhcpAgent, self).__init__(host=host) - self.needs_resync = False + self.needs_resync_reasons = [] self.conf = cfg.CONF self.cache = NetworkCache() self.root_helper = config.get_root_helper(self.conf) @@ -135,7 +135,7 @@ class DhcpAgent(manager.Manager): 'that the network and/or its subnet(s) still exist.') % {'net_id': network.id, 'action': action}) except Exception as e: - self.needs_resync = True + self.schedule_resync(e) if (isinstance(e, common.RemoteError) and e.exc_type == 'NetworkNotFound' or isinstance(e, exceptions.NetworkNotFound)): @@ -144,6 +144,10 @@ class DhcpAgent(manager.Manager): LOG.exception(_('Unable to %(action)s dhcp for %(net_id)s.') % {'net_id': network.id, 'action': action}) + def schedule_resync(self, reason): + """Schedule a resync for a given reason.""" + self.needs_resync_reasons.append(reason) + @utils.synchronized('dhcp-agent') def sync_state(self): """Sync the local DHCP state with Neutron.""" @@ -157,8 +161,8 @@ class DhcpAgent(manager.Manager): for deleted_id in known_network_ids - active_network_ids: try: self.disable_dhcp_helper(deleted_id) - except Exception: - self.needs_resync = True + except Exception as e: + self.schedule_resync(e) LOG.exception(_('Unable to sync network state on deleted ' 'network %s'), deleted_id) @@ -167,16 +171,22 @@ class DhcpAgent(manager.Manager): pool.waitall() LOG.info(_('Synchronizing state complete')) - except Exception: - self.needs_resync = True + except Exception as e: + self.schedule_resync(e) LOG.exception(_('Unable to sync network state.')) def _periodic_resync_helper(self): """Resync the dhcp state at the configured interval.""" while True: eventlet.sleep(self.conf.resync_interval) - if self.needs_resync: - self.needs_resync = False + if self.needs_resync_reasons: + # be careful to avoid a race with additions to list + # from other threads + reasons = self.needs_resync_reasons + self.needs_resync_reasons = [] + for r in reasons: + LOG.debug(_("resync: %(reason)s"), + {"reason": r}) self.sync_state() def periodic_resync(self): @@ -189,8 +199,8 @@ class DhcpAgent(manager.Manager): if not network: LOG.warn(_('Network %s has been deleted.'), network_id) return network - except Exception: - self.needs_resync = True + except Exception as e: + self.schedule_resync(e) LOG.exception(_('Network %s info call failed.'), network_id) def enable_dhcp_helper(self, network_id): @@ -579,7 +589,8 @@ class DhcpAgentWithStateReport(DhcpAgent): def agent_updated(self, context, payload): """Handle the agent_updated notification event.""" - self.needs_resync = True + self.schedule_resync(_("Agent updated: %(payload)s") % + {"payload": payload}) LOG.info(_("agent_updated by server side %s!"), payload) def after_start(self): diff --git a/neutron/tests/unit/test_dhcp_agent.py b/neutron/tests/unit/test_dhcp_agent.py index 5978874cf..5b9c44787 100644 --- a/neutron/tests/unit/test_dhcp_agent.py +++ b/neutron/tests/unit/test_dhcp_agent.py @@ -211,14 +211,16 @@ class TestDhcpAgent(base.BaseTestCase): self.driver.return_value.foo.side_effect = exc or Exception with mock.patch.object(dhcp_agent.LOG, trace_level) as log: dhcp = dhcp_agent.DhcpAgent(HOSTNAME) - self.assertIsNone(dhcp.call_driver('foo', network)) - self.driver.assert_called_once_with(cfg.CONF, - mock.ANY, - 'sudo', - mock.ANY, - mock.ANY) - self.assertEqual(log.call_count, 1) - self.assertEqual(expected_sync, dhcp.needs_resync) + with mock.patch.object(dhcp, + 'schedule_resync') as schedule_resync: + self.assertIsNone(dhcp.call_driver('foo', network)) + self.driver.assert_called_once_with(cfg.CONF, + mock.ANY, + 'sudo', + mock.ANY, + mock.ANY) + self.assertEqual(log.call_count, 1) + self.assertEqual(expected_sync, schedule_resync.called) def test_call_driver_failure(self): self._test_call_driver_failure() @@ -298,10 +300,12 @@ class TestDhcpAgent(base.BaseTestCase): with mock.patch.object(dhcp_agent.LOG, 'exception') as log: dhcp = dhcp_agent.DhcpAgent(HOSTNAME) - dhcp.sync_state() + with mock.patch.object(dhcp, + 'schedule_resync') as schedule_resync: + dhcp.sync_state() - self.assertTrue(log.called) - self.assertTrue(dhcp.needs_resync) + self.assertTrue(log.called) + self.assertTrue(schedule_resync.called) def test_periodic_resync(self): dhcp = dhcp_agent.DhcpAgent(HOSTNAME) @@ -312,14 +316,14 @@ class TestDhcpAgent(base.BaseTestCase): def test_periodoc_resync_helper(self): with mock.patch.object(dhcp_agent.eventlet, 'sleep') as sleep: dhcp = dhcp_agent.DhcpAgent(HOSTNAME) - dhcp.needs_resync = True + dhcp.needs_resync_reasons = ['reason1', 'reason2'] with mock.patch.object(dhcp, 'sync_state') as sync_state: sync_state.side_effect = RuntimeError with testtools.ExpectedException(RuntimeError): dhcp._periodic_resync_helper() sync_state.assert_called_once_with() sleep.assert_called_once_with(dhcp.conf.resync_interval) - self.assertFalse(dhcp.needs_resync) + self.assertEqual(len(dhcp.needs_resync_reasons), 0) def test_populate_cache_on_start_without_active_networks_support(self): # emul dhcp driver that doesn't support retrieving of active networks @@ -468,14 +472,15 @@ class TestDhcpAgentEventHandler(base.BaseTestCase): self.mock_init_p = mock.patch('neutron.agent.dhcp_agent.' 'DhcpAgent._populate_networks_cache') self.mock_init = self.mock_init_p.start() - with mock.patch.object(dhcp.Dnsmasq, 'check_version') as check_v: check_v.return_value = dhcp.Dnsmasq.MINIMUM_VERSION self.dhcp = dhcp_agent.DhcpAgent(HOSTNAME) self.call_driver_p = mock.patch.object(self.dhcp, 'call_driver') - self.call_driver = self.call_driver_p.start() + self.schedule_resync_p = mock.patch.object(self.dhcp, + 'schedule_resync') + self.schedule_resync = self.schedule_resync_p.start() self.external_process_p = mock.patch( 'neutron.agent.linux.external_process.ProcessManager' ) @@ -525,7 +530,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase): [mock.call.get_network_info('fake_id')]) self.assertFalse(self.call_driver.called) self.assertTrue(log.called) - self.assertFalse(self.dhcp.needs_resync) + self.assertFalse(self.dhcp.schedule_resync.called) def test_enable_dhcp_helper_exception_during_rpc(self): self.plugin.get_network_info.side_effect = Exception @@ -535,7 +540,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase): [mock.call.get_network_info(fake_network.id)]) self.assertFalse(self.call_driver.called) self.assertTrue(log.called) - self.assertTrue(self.dhcp.needs_resync) + self.assertTrue(self.schedule_resync.called) self.assertFalse(self.cache.called) self.assertFalse(self.external_process.called) @@ -723,7 +728,7 @@ class TestDhcpAgentEventHandler(base.BaseTestCase): self.cache.assert_has_calls( [mock.call.get_network_by_id('net-id')]) self.assertTrue(log.called) - self.assertTrue(self.dhcp.needs_resync) + self.assertTrue(self.dhcp.schedule_resync.called) def test_subnet_update_end(self): payload = dict(subnet=dict(network_id=fake_network.id))