backup init_host cleanup exception handling

author Tom Barron <tpb@dyncloud.net>

Wed, 26 Aug 2015 20:07:00 +0000 (16:07 -0400)

committer Tom Barron <tpb@dyncloud.net>

Wed, 9 Sep 2015 19:13:41 +0000 (19:13 +0000)
author Tom Barron <tpb@dyncloud.net>
Wed, 26 Aug 2015 20:07:00 +0000 (16:07 -0400)
committer Tom Barron <tpb@dyncloud.net>
Wed, 9 Sep 2015 19:13:41 +0000 (19:13 +0000)
diff --git a/cinder/backup/manager.py b/cinder/backup/manager.py

index 2e3e19458dab84b38c7e63cde458855537e2a44d..43bc2283e2cb112335542f1956c1306e4b6c031d 100644 (file)
--- a/cinder/backup/manager.py
+++ b/cinder/backup/manager.py
@@ -199,113 +199,147 @@ class BackupManager(manager.SchedulerDependentManager):
          for mgr in self.volume_managers.values():
              self._init_volume_driver(ctxt, mgr.driver)
  
+        try:
+            self._cleanup_incomplete_backup_operations(ctxt)
+        except Exception:
+            # Don't block startup of the backup service.
+            LOG.exception(_LE("Problem cleaning incomplete backup "
+                              "operations."))
+
+    def _cleanup_incomplete_backup_operations(self, ctxt):
          LOG.info(_LI("Cleaning up incomplete backup operations."))
          volumes = self.db.volume_get_all_by_host(ctxt, self.host)
+
          for volume in volumes:
-            volume_host = volume_utils.extract_host(volume['host'], 'backend')
-            backend = self._get_volume_backend(host=volume_host)
-            mgr = self._get_manager(backend)
-            if volume['status'] == 'backing-up':
-                self._detach_all_attachments(ctxt, mgr, volume)
-                LOG.info(_LI('Resetting volume %(vol_id)s to previous '
-                             'status %(status)s (was backing-up).'),
-                         {'vol_id': volume['id'],
-                          'status': volume['previous_status']})
-                self.db.volume_update(ctxt, volume['id'],
-                                      {'status': volume['previous_status']})
-            elif volume['status'] == 'restoring-backup':
-                self._detach_all_attachments(ctxt, mgr, volume)
-                LOG.info(_LI('setting volume %s to error_restoring '
-                             '(was restoring-backup).'), volume['id'])
-                self.db.volume_update(ctxt, volume['id'],
-                                      {'status': 'error_restoring'})
+            try:
+                self._cleanup_one_volume(ctxt, volume)
+            except Exception:
+                LOG.exception(_LE("Problem cleaning up volume %(vol)s."),
+                              {'vol': volume['id']})
  
          # TODO(smulcahy) implement full resume of backup and restore
          # operations on restart (rather than simply resetting)
          backups = objects.BackupList.get_all_by_host(ctxt, self.host)
          for backup in backups:
-            if backup['status'] == 'creating':
-                LOG.info(_LI('Resetting backup %s to error (was creating).'),
-                         backup['id'])
-                err = 'incomplete backup reset on manager restart'
-                self._update_backup_error(backup, ctxt, err)
-            if backup['status'] == 'restoring':
-                LOG.info(_LI('Resetting backup %s to '
-                             'available (was restoring).'),
-                         backup['id'])
-                backup.status = 'available'
-                backup.save()
-            if backup['status'] == 'deleting':
-                LOG.info(_LI('Resuming delete on backup: %s.'), backup['id'])
-                if CONF.backup_service_inithost_offload:
-                    # Offload all the pending backup delete operations to the
-                    # threadpool to prevent the main backup service thread
-                    # from being blocked.
-                    self._add_to_threadpool(self.delete_backup, ctxt, backup)
-                else:
-                    # By default, delete backups sequentially
-                    self.delete_backup(ctxt, backup)
-
-        self._cleanup_temp_volumes_snapshots(backups)
+            try:
+                self._cleanup_one_backup(ctxt, backup)
+            except Exception:
+                LOG.exception(_LE("Problem cleaning up backup %(bkup)s."),
+                              {'bkup': backup['id']})
+            try:
+                self._cleanup_temp_volumes_snapshots_for_one_backup(ctxt,
+                                                                    backup)
+            except Exception:
+                LOG.exception(_LE("Problem cleaning temp volumes and "
+                                  "snapshots for backup %(bkup)s."),
+                              {'bkup': backup['id']})
+
+    def _cleanup_one_volume(self, ctxt, volume):
+        volume_host = volume_utils.extract_host(volume['host'], 'backend')
+        backend = self._get_volume_backend(host=volume_host)
+        mgr = self._get_manager(backend)
+        if volume['status'] == 'backing-up':
+            self._detach_all_attachments(ctxt, mgr, volume)
+            LOG.info(_LI('Resetting volume %(vol_id)s to previous '
+                         'status %(status)s (was backing-up).'),
+                     {'vol_id': volume['id'],
+                      'status': volume['previous_status']})
+            self.db.volume_update(ctxt, volume['id'],
+                                  {'status': volume['previous_status']})
+        elif volume['status'] == 'restoring-backup':
+            self._detach_all_attachments(ctxt, mgr, volume)
+            LOG.info(_LI('setting volume %s to error_restoring '
+                         '(was restoring-backup).'), volume['id'])
+            self.db.volume_update(ctxt, volume['id'],
+                                  {'status': 'error_restoring'})
+
+    def _cleanup_one_backup(self, ctxt, backup):
+        if backup['status'] == 'creating':
+            LOG.info(_LI('Resetting backup %s to error (was creating).'),
+                     backup['id'])
+            err = 'incomplete backup reset on manager restart'
+            self._update_backup_error(backup, ctxt, err)
+        if backup['status'] == 'restoring':
+            LOG.info(_LI('Resetting backup %s to '
+                         'available (was restoring).'),
+                     backup['id'])
+            backup.status = 'available'
+            backup.save()
+        if backup['status'] == 'deleting':
+            LOG.info(_LI('Resuming delete on backup: %s.'), backup['id'])
+            if CONF.backup_service_inithost_offload:
+                # Offload all the pending backup delete operations to the
+                # threadpool to prevent the main backup service thread
+                # from being blocked.
+                self._add_to_threadpool(self.delete_backup, ctxt, backup)
+            else:
+                # By default, delete backups sequentially
+                self.delete_backup(ctxt, backup)
  
      def _detach_all_attachments(self, ctxt, mgr, volume):
          attachments = volume['volume_attachment'] or []
          for attachment in attachments:
              if (attachment['attached_host'] == self.host and
                      attachment['instance_uuid'] is None):
-                        mgr.detach_volume(ctxt, volume['id'],
-                                          attachment['id'])
-
-    def _cleanup_temp_volumes_snapshots(self, backups):
+                        try:
+                            mgr.detach_volume(ctxt, volume['id'],
+                                              attachment['id'])
+                        except Exception:
+                            LOG.exception(_LE("Detach attachment %(attach_id)s"
+                                              " failed."),
+                                          {'attach_id': attachment['id']},
+                                          resource=volume)
+
+    def _cleanup_temp_volumes_snapshots_for_one_backup(self, ctxt, backup):
          # NOTE(xyang): If the service crashes or gets restarted during the
          # backup operation, there could be temporary volumes or snapshots
          # that are not deleted. Make sure any temporary volumes or snapshots
          # create by the backup job are deleted when service is started.
-        ctxt = context.get_admin_context()
-        for backup in backups:
+        try:
+            volume = self.db.volume_get(ctxt, backup.volume_id)
+            volume_host = volume_utils.extract_host(volume['host'],
+                                                    'backend')
+            backend = self._get_volume_backend(host=volume_host)
+            mgr = self._get_manager(backend)
+        except (KeyError, exception.VolumeNotFound):
+            LOG.debug("Could not find a volume to clean up for "
+                      "backup %s.", backup.id)
+            return
+
+        if backup.temp_volume_id and backup.status == 'error':
              try:
-                volume = self.db.volume_get(ctxt, backup.volume_id)
-                volume_host = volume_utils.extract_host(volume['host'],
-                                                        'backend')
-                backend = self._get_volume_backend(host=volume_host)
-                mgr = self._get_manager(backend)
-            except (KeyError, exception.VolumeNotFound):
-                LOG.debug("Could not find a volume to clean up for "
-                          "backup %s.", backup.id)
-                continue
-            if backup.temp_volume_id and backup.status == 'error':
-                try:
-                    temp_volume = self.db.volume_get(ctxt,
-                                                     backup.temp_volume_id)
-                    # The temp volume should be deleted directly thru the
-                    # the volume driver, not thru the volume manager.
-                    mgr.driver.delete_volume(temp_volume)
-                    self.db.volume_destroy(ctxt, temp_volume['id'])
-                except exception.VolumeNotFound:
-                    LOG.debug("Could not find temp volume %(vol)s to clean up "
-                              "for backup %(backup)s.",
-                              {'vol': backup.temp_volume_id,
-                               'backup': backup.id})
-                backup.temp_volume_id = None
-                backup.save()
-            if backup.temp_snapshot_id and backup.status == 'error':
-                try:
-                    temp_snapshot = objects.Snapshot.get_by_id(
-                        ctxt, backup.temp_snapshot_id)
-                    # The temp snapshot should be deleted directly thru the
-                    # volume driver, not thru the volume manager.
-                    mgr.driver.delete_snapshot(temp_snapshot)
-                    with temp_snapshot.obj_as_admin():
-                        self.db.volume_glance_metadata_delete_by_snapshot(
-                            ctxt, temp_snapshot.id)
-                        temp_snapshot.destroy()
-                except exception.SnapshotNotFound:
-                    LOG.debug("Could not find temp snapshot %(snap)s to clean "
-                              "up for backup %(backup)s.",
-                              {'snap': backup.temp_snapshot_id,
-                               'backup': backup.id})
-                backup.temp_snapshot_id = None
-                backup.save()
+                temp_volume = self.db.volume_get(ctxt,
+                                                 backup.temp_volume_id)
+                # The temp volume should be deleted directly thru the
+                # the volume driver, not thru the volume manager.
+                mgr.driver.delete_volume(temp_volume)
+                self.db.volume_destroy(ctxt, temp_volume['id'])
+            except exception.VolumeNotFound:
+                LOG.debug("Could not find temp volume %(vol)s to clean up "
+                          "for backup %(backup)s.",
+                          {'vol': backup.temp_volume_id,
+                           'backup': backup.id})
+            backup.temp_volume_id = None
+            backup.save()
+
+        if backup.temp_snapshot_id and backup.status == 'error':
+            try:
+                temp_snapshot = objects.Snapshot.get_by_id(
+                    ctxt, backup.temp_snapshot_id)
+                # The temp snapshot should be deleted directly thru the
+                # volume driver, not thru the volume manager.
+                mgr.driver.delete_snapshot(temp_snapshot)
+                with temp_snapshot.obj_as_admin():
+                    self.db.volume_glance_metadata_delete_by_snapshot(
+                        ctxt, temp_snapshot.id)
+                    temp_snapshot.destroy()
+            except exception.SnapshotNotFound:
+                LOG.debug("Could not find temp snapshot %(snap)s to clean "
+                          "up for backup %(backup)s.",
+                          {'snap': backup.temp_snapshot_id,
+                           'backup': backup.id})
+            backup.temp_snapshot_id = None
+            backup.save()
  
      def create_backup(self, context, backup):
          """Create volume backups using configured backup service."""
diff --git a/cinder/tests/unit/test_backup.py b/cinder/tests/unit/test_backup.py

index 3831f9c031262beeb98a7318b9a4bc57b65024f7..a0c3a5b0bd16b86d7b7bb881f280cf08ae5989a5 100644 (file)
--- a/cinder/tests/unit/test_backup.py
+++ b/cinder/tests/unit/test_backup.py
@@ -222,6 +222,7 @@ class BackupTestCase(BaseBackupTest):
                                       temp_snapshot_id=temp_snap.id)
  
          self.backup_mgr.init_host()
+
          vol1 = db.volume_get(self.ctxt, vol1_id)
          self.assertEqual('available', vol1['status'])
          vol2 = db.volume_get(self.ctxt, vol2_id)
@@ -269,55 +270,201 @@ class BackupTestCase(BaseBackupTest):
          mock_add_threadpool.assert_has_calls(calls, any_order=True)
          self.assertEqual(2, mock_add_threadpool.call_count)
  
-    @mock.patch.object(db, 'volume_get')
+    def test_init_host_handles_exception(self):
+        """Test that exception in cleanup is handled."""
+
+        self.mock_object(self.backup_mgr, '_init_volume_driver')
+        mock_cleanup = self.mock_object(
+            self.backup_mgr,
+            '_cleanup_incomplete_backup_operations')
+        mock_cleanup.side_effect = [Exception]
+
+        self.assertIsNone(self.backup_mgr.init_host())
+
+    def test_cleanup_incomplete_backup_operations_with_exceptions(self):
+        """Test cleanup resilience in the face of exceptions."""
+
+        fake_volume_list = [{'id': 'vol1'}, {'id': 'vol2'}]
+        mock_volume_get_by_host = self.mock_object(
+            db, 'volume_get_all_by_host')
+        mock_volume_get_by_host.return_value = fake_volume_list
+
+        mock_volume_cleanup = self.mock_object(
+            self.backup_mgr, '_cleanup_one_volume')
+        mock_volume_cleanup.side_effect = [Exception]
+
+        fake_backup_list = [{'id': 'bkup1'}, {'id': 'bkup2'}, {'id': 'bkup3'}]
+        mock_backup_get_by_host = self.mock_object(
+            objects.BackupList, 'get_all_by_host')
+        mock_backup_get_by_host.return_value = fake_backup_list
+
+        mock_backup_cleanup = self.mock_object(
+            self.backup_mgr, '_cleanup_one_backup')
+        mock_backup_cleanup.side_effect = [Exception]
+
+        mock_temp_cleanup = self.mock_object(
+            self.backup_mgr, '_cleanup_temp_volumes_snapshots_for_one_backup')
+        mock_temp_cleanup.side_effect = [Exception]
+
+        self.assertIsNone(
+            self.backup_mgr._cleanup_incomplete_backup_operations(
+                self.ctxt))
+
+        self.assertEqual(len(fake_volume_list), mock_volume_cleanup.call_count)
+        self.assertEqual(len(fake_backup_list), mock_backup_cleanup.call_count)
+        self.assertEqual(len(fake_backup_list), mock_temp_cleanup.call_count)
+
+    def test_cleanup_one_backing_up_volume(self):
+        """Test cleanup_one_volume for volume status 'backing-up'."""
+
+        mock_get_manager = self.mock_object(
+            self.backup_mgr, '_get_manager')
+        mock_get_manager.return_value = 'fake_manager'
+
+        volume_id = self._create_volume_db_entry(status='backing-up',
+                                                 previous_status='available')
+        volume = db.volume_get(self.ctxt, volume_id)
+
+        self.backup_mgr._cleanup_one_volume(self.ctxt, volume)
+
+        volume = db.volume_get(self.ctxt, volume_id)
+        self.assertEqual('available', volume['status'])
+
+    def test_cleanup_one_restoring_backup_volume(self):
+        """Test cleanup_one_volume for volume status 'restoring-backup'."""
+
+        mock_get_manager = self.mock_object(
+            self.backup_mgr, '_get_manager')
+        mock_get_manager.return_value = 'fake_manager'
+
+        volume_id = self._create_volume_db_entry(status='restoring-backup')
+        volume = db.volume_get(self.ctxt, volume_id)
+
+        self.backup_mgr._cleanup_one_volume(self.ctxt, volume)
+
+        volume = db.volume_get(self.ctxt, volume_id)
+        self.assertEqual('error_restoring', volume['status'])
+
+    def test_cleanup_one_creating_backup(self):
+        """Test cleanup_one_backup for volume status 'creating'."""
+
+        backup = self._create_backup_db_entry(status='creating')
+
+        self.backup_mgr._cleanup_one_backup(self.ctxt, backup)
+
+        self.assertEqual('error', backup.status)
+
+    def test_cleanup_one_restoring_backup(self):
+        """Test cleanup_one_backup for volume status 'restoring'."""
+
+        backup = self._create_backup_db_entry(status='restoring')
+
+        self.backup_mgr._cleanup_one_backup(self.ctxt, backup)
+
+        self.assertEqual('available', backup.status)
+
+    def test_cleanup_one_deleting_backup(self):
+        """Test cleanup_one_backup for volume status 'deleting'."""
+
+        backup = self._create_backup_db_entry(status='deleting')
+
+        self.backup_mgr._cleanup_one_backup(self.ctxt, backup)
+
+        self.assertRaises(exception.BackupNotFound,
+                          db.backup_get,
+                          self.ctxt,
+                          backup.id)
+
+    def test_detach_all_attachments_handles_exceptions(self):
+        """Test detach_all_attachments with exceptions."""
+
+        mock_log = self.mock_object(manager, 'LOG')
+        mock_volume_mgr = mock.Mock()
+        mock_detach_volume = mock_volume_mgr.detach_volume
+        mock_detach_volume.side_effect = [Exception]
+
+        fake_attachments = [
+            {
+                'id': 'attachment1',
+                'attached_host': 'testhost',
+                'instance_uuid': None,
+            },
+            {
+                'id': 'attachment2',
+                'attached_host': 'testhost',
+                'instance_uuid': None,
+            }
+        ]
+        fake_volume = {
+            'id': 'fake_volume_id',
+            'volume_attachment': fake_attachments
+        }
+
+        self.backup_mgr._detach_all_attachments(self.ctxt,
+                                                mock_volume_mgr,
+                                                fake_volume)
+
+        self.assertEqual(len(fake_attachments), mock_log.exception.call_count)
+
      @ddt.data(KeyError, exception.VolumeNotFound)
-    def test_cleanup_temp_volumes_snapshots_volume_not_found(
-            self, err, mock_volume_get):
+    def test_cleanup_temp_volumes_snapshots_for_one_backup_volume_not_found(
+            self, err):
          """Ensure we handle missing volume for a backup."""
+
+        mock_volume_get = self.mock_object(db, 'volume_get')
          mock_volume_get.side_effect = [err]
  
-        backup1 = self._create_backup_db_entry(status='creating')
-        backups = [backup1]
+        backup = self._create_backup_db_entry(status='creating')
  
-        self.assertIsNone(self.backup_mgr._cleanup_temp_volumes_snapshots(
-            backups))
+        self.assertIsNone(
+            self.backup_mgr._cleanup_temp_volumes_snapshots_for_one_backup(
+                self.ctxt,
+                backup))
  
-    @mock.patch.object(lvm.LVMVolumeDriver, 'delete_snapshot')
-    def test_cleanup_temp_snapshot_not_found(self,
-                                             mock_delete_snapshot):
+    def test_cleanup_temp_snapshot_for_one_backup_not_found(self):
          """Ensure we handle missing temp snapshot for a backup."""
+        mock_delete_snapshot = self.mock_object(
+            lvm.LVMVolumeDriver, 'delete_snapshot')
+
          vol1_id = self._create_volume_db_entry()
          self._create_volume_attach(vol1_id)
          db.volume_update(self.ctxt, vol1_id, {'status': 'backing-up'})
-        backup1 = self._create_backup_db_entry(status='error',
-                                               volume_id=vol1_id,
-                                               temp_snapshot_id='fake')
-        backups = [backup1]
-        self.assertEqual('fake', backups[0].temp_snapshot_id)
-        self.assertIsNone(self.backup_mgr._cleanup_temp_volumes_snapshots(
-            backups))
+        backup = self._create_backup_db_entry(status='error',
+                                              volume_id=vol1_id,
+                                              temp_snapshot_id='fake')
+
+        self.assertIsNone(
+            self.backup_mgr._cleanup_temp_volumes_snapshots_for_one_backup(
+                self.ctxt,
+                backup))
+
          self.assertFalse(mock_delete_snapshot.called)
-        self.assertIsNone(backups[0].temp_snapshot_id)
-        backup1.destroy()
+        self.assertIsNone(backup.temp_snapshot_id)
+
+        backup.destroy()
          db.volume_destroy(self.ctxt, vol1_id)
  
-    @mock.patch.object(lvm.LVMVolumeDriver, 'delete_volume')
-    def test_cleanup_temp_volume_not_found(self,
-                                           mock_delete_volume):
+    def test_cleanup_temp_volume_for_one_backup_not_found(self):
          """Ensure we handle missing temp volume for a backup."""
+        mock_delete_volume = self.mock_object(
+            lvm.LVMVolumeDriver, 'delete_volume')
+
          vol1_id = self._create_volume_db_entry()
          self._create_volume_attach(vol1_id)
          db.volume_update(self.ctxt, vol1_id, {'status': 'backing-up'})
-        backup1 = self._create_backup_db_entry(status='error',
-                                               volume_id=vol1_id,
-                                               temp_volume_id='fake')
-        backups = [backup1]
-        self.assertEqual('fake', backups[0].temp_volume_id)
-        self.assertIsNone(self.backup_mgr._cleanup_temp_volumes_snapshots(
-            backups))
+        backup = self._create_backup_db_entry(status='error',
+                                              volume_id=vol1_id,
+                                              temp_volume_id='fake')
+
+        self.assertIsNone(
+            self.backup_mgr._cleanup_temp_volumes_snapshots_for_one_backup(
+                self.ctxt,
+                backup))
+
          self.assertFalse(mock_delete_volume.called)
-        self.assertIsNone(backups[0].temp_volume_id)
-        backup1.destroy()
+        self.assertIsNone(backup.temp_volume_id)
+
+        backup.destroy()
          db.volume_destroy(self.ctxt, vol1_id)
  
      def test_create_backup_with_bad_volume_status(self):
author	Tom Barron <tpb@dyncloud.net>
	Wed, 26 Aug 2015 20:07:00 +0000 (16:07 -0400)
committer	Tom Barron <tpb@dyncloud.net>
	Wed, 9 Sep 2015 19:13:41 +0000 (19:13 +0000)
cinder/backup/manager.py		patch \| blob \| history
cinder/tests/unit/test_backup.py		patch \| blob \| history