From bda6da1f8df44a64e8b320ebd8e2211197b6afeb Mon Sep 17 00:00:00 2001 From: Tom Barron Date: Wed, 26 Aug 2015 16:07:00 -0400 Subject: [PATCH] backup init_host cleanup exception handling Current cleanup of leftover backups, volumes, and temporary volumes and snapshots is not resilient in the face of exceptions such as failures in detach operations. This commit adds appropriate try/except logic to this section of code, thereby ensuring that the backup service can start even if exceptions are encountered when attempting to cleanup detritus from earlier failed backup operations. Change-Id: Ie8d2107ebd4d86dfe5664c35687e8cbe58cfad9d Closes-bug: 1485295 --- cinder/backup/manager.py | 210 ++++++++++++++++++------------- cinder/tests/unit/test_backup.py | 209 +++++++++++++++++++++++++----- 2 files changed, 300 insertions(+), 119 deletions(-) diff --git a/cinder/backup/manager.py b/cinder/backup/manager.py index 2e3e19458..43bc2283e 100644 --- a/cinder/backup/manager.py +++ b/cinder/backup/manager.py @@ -199,113 +199,147 @@ class BackupManager(manager.SchedulerDependentManager): for mgr in self.volume_managers.values(): self._init_volume_driver(ctxt, mgr.driver) + try: + self._cleanup_incomplete_backup_operations(ctxt) + except Exception: + # Don't block startup of the backup service. + LOG.exception(_LE("Problem cleaning incomplete backup " + "operations.")) + + def _cleanup_incomplete_backup_operations(self, ctxt): LOG.info(_LI("Cleaning up incomplete backup operations.")) volumes = self.db.volume_get_all_by_host(ctxt, self.host) + for volume in volumes: - volume_host = volume_utils.extract_host(volume['host'], 'backend') - backend = self._get_volume_backend(host=volume_host) - mgr = self._get_manager(backend) - if volume['status'] == 'backing-up': - self._detach_all_attachments(ctxt, mgr, volume) - LOG.info(_LI('Resetting volume %(vol_id)s to previous ' - 'status %(status)s (was backing-up).'), - {'vol_id': volume['id'], - 'status': volume['previous_status']}) - self.db.volume_update(ctxt, volume['id'], - {'status': volume['previous_status']}) - elif volume['status'] == 'restoring-backup': - self._detach_all_attachments(ctxt, mgr, volume) - LOG.info(_LI('setting volume %s to error_restoring ' - '(was restoring-backup).'), volume['id']) - self.db.volume_update(ctxt, volume['id'], - {'status': 'error_restoring'}) + try: + self._cleanup_one_volume(ctxt, volume) + except Exception: + LOG.exception(_LE("Problem cleaning up volume %(vol)s."), + {'vol': volume['id']}) # TODO(smulcahy) implement full resume of backup and restore # operations on restart (rather than simply resetting) backups = objects.BackupList.get_all_by_host(ctxt, self.host) for backup in backups: - if backup['status'] == 'creating': - LOG.info(_LI('Resetting backup %s to error (was creating).'), - backup['id']) - err = 'incomplete backup reset on manager restart' - self._update_backup_error(backup, ctxt, err) - if backup['status'] == 'restoring': - LOG.info(_LI('Resetting backup %s to ' - 'available (was restoring).'), - backup['id']) - backup.status = 'available' - backup.save() - if backup['status'] == 'deleting': - LOG.info(_LI('Resuming delete on backup: %s.'), backup['id']) - if CONF.backup_service_inithost_offload: - # Offload all the pending backup delete operations to the - # threadpool to prevent the main backup service thread - # from being blocked. - self._add_to_threadpool(self.delete_backup, ctxt, backup) - else: - # By default, delete backups sequentially - self.delete_backup(ctxt, backup) - - self._cleanup_temp_volumes_snapshots(backups) + try: + self._cleanup_one_backup(ctxt, backup) + except Exception: + LOG.exception(_LE("Problem cleaning up backup %(bkup)s."), + {'bkup': backup['id']}) + try: + self._cleanup_temp_volumes_snapshots_for_one_backup(ctxt, + backup) + except Exception: + LOG.exception(_LE("Problem cleaning temp volumes and " + "snapshots for backup %(bkup)s."), + {'bkup': backup['id']}) + + def _cleanup_one_volume(self, ctxt, volume): + volume_host = volume_utils.extract_host(volume['host'], 'backend') + backend = self._get_volume_backend(host=volume_host) + mgr = self._get_manager(backend) + if volume['status'] == 'backing-up': + self._detach_all_attachments(ctxt, mgr, volume) + LOG.info(_LI('Resetting volume %(vol_id)s to previous ' + 'status %(status)s (was backing-up).'), + {'vol_id': volume['id'], + 'status': volume['previous_status']}) + self.db.volume_update(ctxt, volume['id'], + {'status': volume['previous_status']}) + elif volume['status'] == 'restoring-backup': + self._detach_all_attachments(ctxt, mgr, volume) + LOG.info(_LI('setting volume %s to error_restoring ' + '(was restoring-backup).'), volume['id']) + self.db.volume_update(ctxt, volume['id'], + {'status': 'error_restoring'}) + + def _cleanup_one_backup(self, ctxt, backup): + if backup['status'] == 'creating': + LOG.info(_LI('Resetting backup %s to error (was creating).'), + backup['id']) + err = 'incomplete backup reset on manager restart' + self._update_backup_error(backup, ctxt, err) + if backup['status'] == 'restoring': + LOG.info(_LI('Resetting backup %s to ' + 'available (was restoring).'), + backup['id']) + backup.status = 'available' + backup.save() + if backup['status'] == 'deleting': + LOG.info(_LI('Resuming delete on backup: %s.'), backup['id']) + if CONF.backup_service_inithost_offload: + # Offload all the pending backup delete operations to the + # threadpool to prevent the main backup service thread + # from being blocked. + self._add_to_threadpool(self.delete_backup, ctxt, backup) + else: + # By default, delete backups sequentially + self.delete_backup(ctxt, backup) def _detach_all_attachments(self, ctxt, mgr, volume): attachments = volume['volume_attachment'] or [] for attachment in attachments: if (attachment['attached_host'] == self.host and attachment['instance_uuid'] is None): - mgr.detach_volume(ctxt, volume['id'], - attachment['id']) - - def _cleanup_temp_volumes_snapshots(self, backups): + try: + mgr.detach_volume(ctxt, volume['id'], + attachment['id']) + except Exception: + LOG.exception(_LE("Detach attachment %(attach_id)s" + " failed."), + {'attach_id': attachment['id']}, + resource=volume) + + def _cleanup_temp_volumes_snapshots_for_one_backup(self, ctxt, backup): # NOTE(xyang): If the service crashes or gets restarted during the # backup operation, there could be temporary volumes or snapshots # that are not deleted. Make sure any temporary volumes or snapshots # create by the backup job are deleted when service is started. - ctxt = context.get_admin_context() - for backup in backups: + try: + volume = self.db.volume_get(ctxt, backup.volume_id) + volume_host = volume_utils.extract_host(volume['host'], + 'backend') + backend = self._get_volume_backend(host=volume_host) + mgr = self._get_manager(backend) + except (KeyError, exception.VolumeNotFound): + LOG.debug("Could not find a volume to clean up for " + "backup %s.", backup.id) + return + + if backup.temp_volume_id and backup.status == 'error': try: - volume = self.db.volume_get(ctxt, backup.volume_id) - volume_host = volume_utils.extract_host(volume['host'], - 'backend') - backend = self._get_volume_backend(host=volume_host) - mgr = self._get_manager(backend) - except (KeyError, exception.VolumeNotFound): - LOG.debug("Could not find a volume to clean up for " - "backup %s.", backup.id) - continue - if backup.temp_volume_id and backup.status == 'error': - try: - temp_volume = self.db.volume_get(ctxt, - backup.temp_volume_id) - # The temp volume should be deleted directly thru the - # the volume driver, not thru the volume manager. - mgr.driver.delete_volume(temp_volume) - self.db.volume_destroy(ctxt, temp_volume['id']) - except exception.VolumeNotFound: - LOG.debug("Could not find temp volume %(vol)s to clean up " - "for backup %(backup)s.", - {'vol': backup.temp_volume_id, - 'backup': backup.id}) - backup.temp_volume_id = None - backup.save() - if backup.temp_snapshot_id and backup.status == 'error': - try: - temp_snapshot = objects.Snapshot.get_by_id( - ctxt, backup.temp_snapshot_id) - # The temp snapshot should be deleted directly thru the - # volume driver, not thru the volume manager. - mgr.driver.delete_snapshot(temp_snapshot) - with temp_snapshot.obj_as_admin(): - self.db.volume_glance_metadata_delete_by_snapshot( - ctxt, temp_snapshot.id) - temp_snapshot.destroy() - except exception.SnapshotNotFound: - LOG.debug("Could not find temp snapshot %(snap)s to clean " - "up for backup %(backup)s.", - {'snap': backup.temp_snapshot_id, - 'backup': backup.id}) - backup.temp_snapshot_id = None - backup.save() + temp_volume = self.db.volume_get(ctxt, + backup.temp_volume_id) + # The temp volume should be deleted directly thru the + # the volume driver, not thru the volume manager. + mgr.driver.delete_volume(temp_volume) + self.db.volume_destroy(ctxt, temp_volume['id']) + except exception.VolumeNotFound: + LOG.debug("Could not find temp volume %(vol)s to clean up " + "for backup %(backup)s.", + {'vol': backup.temp_volume_id, + 'backup': backup.id}) + backup.temp_volume_id = None + backup.save() + + if backup.temp_snapshot_id and backup.status == 'error': + try: + temp_snapshot = objects.Snapshot.get_by_id( + ctxt, backup.temp_snapshot_id) + # The temp snapshot should be deleted directly thru the + # volume driver, not thru the volume manager. + mgr.driver.delete_snapshot(temp_snapshot) + with temp_snapshot.obj_as_admin(): + self.db.volume_glance_metadata_delete_by_snapshot( + ctxt, temp_snapshot.id) + temp_snapshot.destroy() + except exception.SnapshotNotFound: + LOG.debug("Could not find temp snapshot %(snap)s to clean " + "up for backup %(backup)s.", + {'snap': backup.temp_snapshot_id, + 'backup': backup.id}) + backup.temp_snapshot_id = None + backup.save() def create_backup(self, context, backup): """Create volume backups using configured backup service.""" diff --git a/cinder/tests/unit/test_backup.py b/cinder/tests/unit/test_backup.py index 3831f9c03..a0c3a5b0b 100644 --- a/cinder/tests/unit/test_backup.py +++ b/cinder/tests/unit/test_backup.py @@ -222,6 +222,7 @@ class BackupTestCase(BaseBackupTest): temp_snapshot_id=temp_snap.id) self.backup_mgr.init_host() + vol1 = db.volume_get(self.ctxt, vol1_id) self.assertEqual('available', vol1['status']) vol2 = db.volume_get(self.ctxt, vol2_id) @@ -269,55 +270,201 @@ class BackupTestCase(BaseBackupTest): mock_add_threadpool.assert_has_calls(calls, any_order=True) self.assertEqual(2, mock_add_threadpool.call_count) - @mock.patch.object(db, 'volume_get') + def test_init_host_handles_exception(self): + """Test that exception in cleanup is handled.""" + + self.mock_object(self.backup_mgr, '_init_volume_driver') + mock_cleanup = self.mock_object( + self.backup_mgr, + '_cleanup_incomplete_backup_operations') + mock_cleanup.side_effect = [Exception] + + self.assertIsNone(self.backup_mgr.init_host()) + + def test_cleanup_incomplete_backup_operations_with_exceptions(self): + """Test cleanup resilience in the face of exceptions.""" + + fake_volume_list = [{'id': 'vol1'}, {'id': 'vol2'}] + mock_volume_get_by_host = self.mock_object( + db, 'volume_get_all_by_host') + mock_volume_get_by_host.return_value = fake_volume_list + + mock_volume_cleanup = self.mock_object( + self.backup_mgr, '_cleanup_one_volume') + mock_volume_cleanup.side_effect = [Exception] + + fake_backup_list = [{'id': 'bkup1'}, {'id': 'bkup2'}, {'id': 'bkup3'}] + mock_backup_get_by_host = self.mock_object( + objects.BackupList, 'get_all_by_host') + mock_backup_get_by_host.return_value = fake_backup_list + + mock_backup_cleanup = self.mock_object( + self.backup_mgr, '_cleanup_one_backup') + mock_backup_cleanup.side_effect = [Exception] + + mock_temp_cleanup = self.mock_object( + self.backup_mgr, '_cleanup_temp_volumes_snapshots_for_one_backup') + mock_temp_cleanup.side_effect = [Exception] + + self.assertIsNone( + self.backup_mgr._cleanup_incomplete_backup_operations( + self.ctxt)) + + self.assertEqual(len(fake_volume_list), mock_volume_cleanup.call_count) + self.assertEqual(len(fake_backup_list), mock_backup_cleanup.call_count) + self.assertEqual(len(fake_backup_list), mock_temp_cleanup.call_count) + + def test_cleanup_one_backing_up_volume(self): + """Test cleanup_one_volume for volume status 'backing-up'.""" + + mock_get_manager = self.mock_object( + self.backup_mgr, '_get_manager') + mock_get_manager.return_value = 'fake_manager' + + volume_id = self._create_volume_db_entry(status='backing-up', + previous_status='available') + volume = db.volume_get(self.ctxt, volume_id) + + self.backup_mgr._cleanup_one_volume(self.ctxt, volume) + + volume = db.volume_get(self.ctxt, volume_id) + self.assertEqual('available', volume['status']) + + def test_cleanup_one_restoring_backup_volume(self): + """Test cleanup_one_volume for volume status 'restoring-backup'.""" + + mock_get_manager = self.mock_object( + self.backup_mgr, '_get_manager') + mock_get_manager.return_value = 'fake_manager' + + volume_id = self._create_volume_db_entry(status='restoring-backup') + volume = db.volume_get(self.ctxt, volume_id) + + self.backup_mgr._cleanup_one_volume(self.ctxt, volume) + + volume = db.volume_get(self.ctxt, volume_id) + self.assertEqual('error_restoring', volume['status']) + + def test_cleanup_one_creating_backup(self): + """Test cleanup_one_backup for volume status 'creating'.""" + + backup = self._create_backup_db_entry(status='creating') + + self.backup_mgr._cleanup_one_backup(self.ctxt, backup) + + self.assertEqual('error', backup.status) + + def test_cleanup_one_restoring_backup(self): + """Test cleanup_one_backup for volume status 'restoring'.""" + + backup = self._create_backup_db_entry(status='restoring') + + self.backup_mgr._cleanup_one_backup(self.ctxt, backup) + + self.assertEqual('available', backup.status) + + def test_cleanup_one_deleting_backup(self): + """Test cleanup_one_backup for volume status 'deleting'.""" + + backup = self._create_backup_db_entry(status='deleting') + + self.backup_mgr._cleanup_one_backup(self.ctxt, backup) + + self.assertRaises(exception.BackupNotFound, + db.backup_get, + self.ctxt, + backup.id) + + def test_detach_all_attachments_handles_exceptions(self): + """Test detach_all_attachments with exceptions.""" + + mock_log = self.mock_object(manager, 'LOG') + mock_volume_mgr = mock.Mock() + mock_detach_volume = mock_volume_mgr.detach_volume + mock_detach_volume.side_effect = [Exception] + + fake_attachments = [ + { + 'id': 'attachment1', + 'attached_host': 'testhost', + 'instance_uuid': None, + }, + { + 'id': 'attachment2', + 'attached_host': 'testhost', + 'instance_uuid': None, + } + ] + fake_volume = { + 'id': 'fake_volume_id', + 'volume_attachment': fake_attachments + } + + self.backup_mgr._detach_all_attachments(self.ctxt, + mock_volume_mgr, + fake_volume) + + self.assertEqual(len(fake_attachments), mock_log.exception.call_count) + @ddt.data(KeyError, exception.VolumeNotFound) - def test_cleanup_temp_volumes_snapshots_volume_not_found( - self, err, mock_volume_get): + def test_cleanup_temp_volumes_snapshots_for_one_backup_volume_not_found( + self, err): """Ensure we handle missing volume for a backup.""" + + mock_volume_get = self.mock_object(db, 'volume_get') mock_volume_get.side_effect = [err] - backup1 = self._create_backup_db_entry(status='creating') - backups = [backup1] + backup = self._create_backup_db_entry(status='creating') - self.assertIsNone(self.backup_mgr._cleanup_temp_volumes_snapshots( - backups)) + self.assertIsNone( + self.backup_mgr._cleanup_temp_volumes_snapshots_for_one_backup( + self.ctxt, + backup)) - @mock.patch.object(lvm.LVMVolumeDriver, 'delete_snapshot') - def test_cleanup_temp_snapshot_not_found(self, - mock_delete_snapshot): + def test_cleanup_temp_snapshot_for_one_backup_not_found(self): """Ensure we handle missing temp snapshot for a backup.""" + mock_delete_snapshot = self.mock_object( + lvm.LVMVolumeDriver, 'delete_snapshot') + vol1_id = self._create_volume_db_entry() self._create_volume_attach(vol1_id) db.volume_update(self.ctxt, vol1_id, {'status': 'backing-up'}) - backup1 = self._create_backup_db_entry(status='error', - volume_id=vol1_id, - temp_snapshot_id='fake') - backups = [backup1] - self.assertEqual('fake', backups[0].temp_snapshot_id) - self.assertIsNone(self.backup_mgr._cleanup_temp_volumes_snapshots( - backups)) + backup = self._create_backup_db_entry(status='error', + volume_id=vol1_id, + temp_snapshot_id='fake') + + self.assertIsNone( + self.backup_mgr._cleanup_temp_volumes_snapshots_for_one_backup( + self.ctxt, + backup)) + self.assertFalse(mock_delete_snapshot.called) - self.assertIsNone(backups[0].temp_snapshot_id) - backup1.destroy() + self.assertIsNone(backup.temp_snapshot_id) + + backup.destroy() db.volume_destroy(self.ctxt, vol1_id) - @mock.patch.object(lvm.LVMVolumeDriver, 'delete_volume') - def test_cleanup_temp_volume_not_found(self, - mock_delete_volume): + def test_cleanup_temp_volume_for_one_backup_not_found(self): """Ensure we handle missing temp volume for a backup.""" + mock_delete_volume = self.mock_object( + lvm.LVMVolumeDriver, 'delete_volume') + vol1_id = self._create_volume_db_entry() self._create_volume_attach(vol1_id) db.volume_update(self.ctxt, vol1_id, {'status': 'backing-up'}) - backup1 = self._create_backup_db_entry(status='error', - volume_id=vol1_id, - temp_volume_id='fake') - backups = [backup1] - self.assertEqual('fake', backups[0].temp_volume_id) - self.assertIsNone(self.backup_mgr._cleanup_temp_volumes_snapshots( - backups)) + backup = self._create_backup_db_entry(status='error', + volume_id=vol1_id, + temp_volume_id='fake') + + self.assertIsNone( + self.backup_mgr._cleanup_temp_volumes_snapshots_for_one_backup( + self.ctxt, + backup)) + self.assertFalse(mock_delete_volume.called) - self.assertIsNone(backups[0].temp_volume_id) - backup1.destroy() + self.assertIsNone(backup.temp_volume_id) + + backup.destroy() db.volume_destroy(self.ctxt, vol1_id) def test_create_backup_with_bad_volume_status(self): -- 2.45.2