From 94b3288fe5430061eca0f2b48ce5bab7ce376f30 Mon Sep 17 00:00:00 2001 From: Jordan Pittier Date: Wed, 17 Feb 2016 19:05:49 +0100 Subject: [PATCH] Fix race condition in RemoteFS create_snapshot_online There's a `while True` loop in create_snapshot_online and each iteration can make the `cinder-volume` process sleep up to 10sec. In the mean time, if Nova fails to create the snapshot, the snapshot status could turn to "error" and someone could want to delete the erroneous snapshot, which would make its status to be "deleting". In that case the `while True` loop would never exit. Cinder consumes 100% of CPU and keeps logging "Status of snapshot XX is now deleting". The patch fix this issue by exiting the `while True` loop if we detect that the snapshot is to be deleted. Closes-Bug: #1538496 Change-Id: I5de0e8479a552ce101cecd06a874a170e54d5c18 --- cinder/exception.py | 5 +++++ cinder/tests/unit/test_remotefs.py | 33 ++++++++++++++++++++++++++++++ cinder/volume/drivers/remotefs.py | 13 +++++++++--- 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/cinder/exception.py b/cinder/exception.py index 88b3c3da9..ffdc447ba 100644 --- a/cinder/exception.py +++ b/cinder/exception.py @@ -767,6 +767,11 @@ class RemoteFSException(VolumeDriverException): message = _("Unknown RemoteFS exception") +class RemoteFSConcurrentRequest(RemoteFSException): + message = _("A concurrent, possibly contradictory, request " + "has been made.") + + class RemoteFSNoSharesMounted(RemoteFSException): message = _("No mounted shares found") diff --git a/cinder/tests/unit/test_remotefs.py b/cinder/tests/unit/test_remotefs.py index df077b87e..82c985bad 100644 --- a/cinder/tests/unit/test_remotefs.py +++ b/cinder/tests/unit/test_remotefs.py @@ -40,6 +40,7 @@ class RemoteFsSnapDriverTestCase(test.TestCase): _FAKE_SNAPSHOT = {'context': _FAKE_CONTEXT, 'id': _FAKE_SNAPSHOT_ID, 'volume': _FAKE_VOLUME, + 'volume_id': _FAKE_VOLUME_ID, 'status': 'available', 'volume_size': 1} _FAKE_SNAPSHOT_PATH = (_FAKE_VOLUME_PATH + '.' + _FAKE_SNAPSHOT_ID) @@ -271,6 +272,38 @@ class RemoteFsSnapDriverTestCase(test.TestCase): self._driver._create_snapshot, fake_snapshot) + @mock.patch('cinder.db.snapshot_get') + @mock.patch('time.sleep') + def test_create_snapshot_online_with_concurrent_delete( + self, mock_sleep, mock_snapshot_get): + self._driver._nova = mock.Mock() + + # Test what happens when progress is so slow that someone + # decides to delete the snapshot while the last known status is + # "creating". + mock_snapshot_get.side_effect = [ + {'status': 'creating', 'progress': '42%'}, + {'status': 'creating', 'progress': '45%'}, + {'status': 'deleting'}, + ] + + with mock.patch.object(self._driver, '_do_create_snapshot') as \ + mock_do_create_snapshot: + self.assertRaises(exception.RemoteFSConcurrentRequest, + self._driver._create_snapshot_online, + self._FAKE_SNAPSHOT, + self._FAKE_VOLUME_NAME, + self._FAKE_SNAPSHOT_PATH) + + mock_do_create_snapshot.assert_called_once_with( + self._FAKE_SNAPSHOT, self._FAKE_VOLUME_NAME, + self._FAKE_SNAPSHOT_PATH) + self.assertEqual([mock.call(1), mock.call(1)], + mock_sleep.call_args_list) + self.assertEqual(3, mock_snapshot_get.call_count) + mock_snapshot_get.assert_called_with(self._FAKE_SNAPSHOT['context'], + self._FAKE_SNAPSHOT['id']) + @mock.patch.object(utils, 'synchronized') def _locked_volume_operation_test_helper(self, mock_synchronized, func, expected_exception=False, diff --git a/cinder/volume/drivers/remotefs.py b/cinder/volume/drivers/remotefs.py index 24a403c2a..f8b4870fe 100644 --- a/cinder/volume/drivers/remotefs.py +++ b/cinder/volume/drivers/remotefs.py @@ -1287,6 +1287,10 @@ class RemoteFSSnapDriver(RemoteFSDriver, driver.SnapshotVD): while True: s = db.snapshot_get(context, snapshot['id']) + LOG.debug('Status of snapshot %(id)s is now %(status)s', + {'id': snapshot['id'], + 'status': s['status']}) + if s['status'] == 'creating': if s['progress'] == '90%': # Nova tasks completed successfully @@ -1300,9 +1304,12 @@ class RemoteFSSnapDriver(RemoteFSDriver, driver.SnapshotVD): 'while creating snapshot.') raise exception.RemoteFSException(msg) - LOG.debug('Status of snapshot %(id)s is now %(status)s', - {'id': snapshot['id'], - 'status': s['status']}) + elif s['status'] == 'deleting' or s['status'] == 'error_deleting': + msg = _('Snapshot %(id)s has been asked to be deleted while ' + 'waiting for it to become available. Perhaps a ' + 'concurrent request was made.') % {'id': + snapshot['id']} + raise exception.RemoteFSConcurrentRequest(msg) if 10 < seconds_elapsed <= 20: increment = 2 -- 2.45.2