From: Alan Jiang
Date: Thu, 3 Oct 2013 22:03:09 +0000 (-0500)
Subject: long flashcopy operation may block volume service
X-Git-Url: https://review.fuel-infra.org/gitweb?a=commitdiff_plain;h=8a2a3d691fa54c07d14b3e32558641f43b69c040;p=openstack-build%2Fcinder-build.git

long flashcopy operation may block volume service

The Storwize family uses FlashCopy for snapshots and volume clones.
A volume delete has to wait until the FlashCopy finishes or errors
out. _delete_vdisk() polls the volume FlashCopy status in a loop,
which may block the volume service heartbeat since the poll runs in
the same thread. The solution is to use the openstack
FixedIntervalLoopingCall to run the FlashCopy status poll in a timer
thread.

The cinder volume manager resumes the delete operation for volumes
that are in the deleting state during volume service startup. Since
a Storwize volume delete may wait for a long time, this can cause
the volume service to have a long delay before it becomes available.
A greenpool is used to offload those volume delete operations.

Change-Id: Ie01a441a327e1e318fa8da0040ae130731b7a686
Closes-Bug: #1203152
(cherry picked from commit 7aa4f65a8c17aa037deff0f5b534ed694c17e62a)
---

diff --git a/cinder/tests/test_storwize_svc.py b/cinder/tests/test_storwize_svc.py
index 2ae4298cf..76edce207 100644
--- a/cinder/tests/test_storwize_svc.py
+++ b/cinder/tests/test_storwize_svc.py
@@ -39,6 +39,7 @@ from cinder.volume import configuration as conf
 from cinder.volume.drivers import storwize_svc
 from cinder.volume import volume_types
+from eventlet import greenthread
 
 LOG = logging.getLogger(__name__)
 
@@ -1165,7 +1166,7 @@ port_speed!N/A
                             'no'])
 
         for d in to_delete:
-            del self._fcmappings_list[k]
+            del self._fcmappings_list[d]
 
         return self._print_info_cmd(rows=rows, **kwargs)
 
@@ -1485,6 +1486,8 @@ class StorwizeSVCDriverTestCase(test.TestCase):
             self.driver.do_setup(None)
             self.driver.check_for_setup_error()
             self.stubs.Set(storwize_svc.time, 'sleep', lambda s: None)
+            self.stubs.Set(greenthread, 'sleep', lambda *x, **y: None)
+            self.stubs.Set(storwize_svc, 'CHECK_FCMAPPING_INTERVAL', 0)
 
     def _set_flag(self, flag, value):
         group = self.driver.configuration.config_group
diff --git a/cinder/volume/drivers/storwize_svc.py b/cinder/volume/drivers/storwize_svc.py
index ee5fc6dde..51984bcc1 100644
--- a/cinder/volume/drivers/storwize_svc.py
+++ b/cinder/volume/drivers/storwize_svc.py
@@ -51,6 +51,7 @@ from cinder import context
 from cinder import exception
 from cinder.openstack.common import excutils
 from cinder.openstack.common import log as logging
+from cinder.openstack.common import loopingcall
 from cinder.openstack.common import processutils
 from cinder.openstack.common import strutils
 from cinder import utils
@@ -112,6 +113,8 @@ storwize_svc_opts = [
 CONF = cfg.CONF
 CONF.register_opts(storwize_svc_opts)
 
+CHECK_FCMAPPING_INTERVAL = 300
+
 
 class StorwizeSVCDriver(san.SanDriver):
     """IBM Storwize V7000 and SVC iSCSI/FC volume driver.
@@ -1243,55 +1246,73 @@ class StorwizeSVCDriver(san.SanDriver):
         return True
 
     def _ensure_vdisk_no_fc_mappings(self, name, allow_snaps=True):
-        # Ensure vdisk has no FlashCopy mappings
+        """Ensure vdisk has no flashcopy mappings."""
+        timer = loopingcall.FixedIntervalLoopingCall(
+            self._check_vdisk_fc_mappings, name, allow_snaps)
+        # Create a timer greenthread. The default volume service
+        # heartbeat is every 10 seconds. The flashcopy usually takes
+        # hours before it finishes. Don't set the sleep interval
+        # shorter than the heartbeat; otherwise the volume service
+        # heartbeat will not be serviced.
+ LOG.debug(_('Calling _ensure_vdisk_no_fc_mappings: vdisk %s') + % name) + ret = timer.start(interval=CHECK_FCMAPPING_INTERVAL).wait() + timer.stop() + return ret + + def _check_vdisk_fc_mappings(self, name, allow_snaps=True): + """FlashCopy mapping check helper.""" + + LOG.debug(_('Loopcall: _check_vdisk_fc_mappings(), vdisk %s') % name) mapping_ids = self._get_vdisk_fc_mappings(name) - while len(mapping_ids): - wait_for_copy = False - for map_id in mapping_ids: - attrs = self._get_flashcopy_mapping_attributes(map_id) - if not attrs: - continue - source = attrs['source_vdisk_name'] - target = attrs['target_vdisk_name'] - copy_rate = attrs['copy_rate'] - status = attrs['status'] - - if copy_rate == '0': - # Case #2: A vdisk that has snapshots - if source == name: - if not allow_snaps: - return False - ssh_cmd = ['svctask', 'chfcmap', '-copyrate', '50', - '-autodelete', 'on', map_id] - out, err = self._run_ssh(ssh_cmd) - wait_for_copy = True - # Case #3: A snapshot - else: - msg = (_('Vdisk %(name)s not involved in ' - 'mapping %(src)s -> %(tgt)s') % - {'name': name, 'src': source, 'tgt': target}) - self._driver_assert(target == name, msg) - if status in ['copying', 'prepared']: - self._run_ssh(['svctask', 'stopfcmap', map_id]) - elif status in ['stopping', 'preparing']: - wait_for_copy = True - else: - self._run_ssh(['svctask', 'rmfcmap', '-force', - map_id]) - # Case 4: Copy in progress - wait and will autodelete + wait_for_copy = False + for map_id in mapping_ids: + attrs = self._get_flashcopy_mapping_attributes(map_id) + if not attrs: + continue + source = attrs['source_vdisk_name'] + target = attrs['target_vdisk_name'] + copy_rate = attrs['copy_rate'] + status = attrs['status'] + + if copy_rate == '0': + # Case #2: A vdisk that has snapshots. Return + # False if snapshot is not allowed. + if source == name: + if not allow_snaps: + raise loopingcall.LoopingCallDone(retvalue=False) + ssh_cmd = ['svctask', 'chfcmap', '-copyrate', '50', + '-autodelete', 'on', map_id] + out, err = self._run_ssh(ssh_cmd) + wait_for_copy = True + # Case #3: A snapshot else: - if status == 'prepared': + msg = (_('Vdisk %(name)s not involved in ' + 'mapping %(src)s -> %(tgt)s') % + {'name': name, 'src': source, 'tgt': target}) + self._driver_assert(target == name, msg) + if status in ['copying', 'prepared']: self._run_ssh(['svctask', 'stopfcmap', map_id]) - self._run_ssh(['svctask', 'rmfcmap', '-force', map_id]) - elif status == 'idle_or_copied': - # Prepare failed - self._run_ssh(['svctask', 'rmfcmap', '-force', map_id]) - else: + # Need to wait for the fcmap to change to + # stopped state before remove fcmap + wait_for_copy = True + elif status in ['stopping', 'preparing']: wait_for_copy = True - if wait_for_copy: - time.sleep(5) - mapping_ids = self._get_vdisk_fc_mappings(name) - return True + else: + self._run_ssh(['svctask', 'rmfcmap', '-force', + map_id]) + # Case 4: Copy in progress - wait and will autodelete + else: + if status == 'prepared': + self._run_ssh(['svctask', 'stopfcmap', map_id]) + self._run_ssh(['svctask', 'rmfcmap', '-force', map_id]) + elif status == 'idle_or_copied': + # Prepare failed + self._run_ssh(['svctask', 'rmfcmap', '-force', map_id]) + else: + wait_for_copy = True + if not wait_for_copy or not len(mapping_ids): + raise loopingcall.LoopingCallDone(retvalue=True) def _delete_vdisk(self, name, force): """Deletes existing vdisks. 
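The driver change above swaps the in-thread polling loop for a FixedIntervalLoopingCall that polls from its own timer greenthread and reports the result back through LoopingCallDone. A minimal sketch of that pattern is shown below; it assumes a cinder tree of this era so that cinder.openstack.common.loopingcall is importable, and _poll_until, POLL_INTERVAL and wait_for_flashcopy are illustrative stand-ins for _check_vdisk_fc_mappings(), CHECK_FCMAPPING_INTERVAL and _ensure_vdisk_no_fc_mappings(), not driver code.

import time

from cinder.openstack.common import loopingcall

POLL_INTERVAL = 300  # illustrative; keep it no shorter than the heartbeat


def _poll_until(deadline):
    # Returning normally means "not done yet, call me again after
    # POLL_INTERVAL"; raising LoopingCallDone stops the timer and hands
    # retvalue back to the caller blocked in wait().
    if time.time() >= deadline:
        raise loopingcall.LoopingCallDone(retvalue=True)


def wait_for_flashcopy(seconds):
    # The poll runs in its own timer greenthread, so the periodic volume
    # service heartbeat keeps getting scheduled while the copy grinds on.
    timer = loopingcall.FixedIntervalLoopingCall(_poll_until,
                                                 time.time() + seconds)
    ret = timer.start(interval=POLL_INTERVAL).wait()
    timer.stop()
    return ret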
diff --git a/cinder/volume/manager.py b/cinder/volume/manager.py
index be3d11a55..dda762858 100644
--- a/cinder/volume/manager.py
+++ b/cinder/volume/manager.py
@@ -64,6 +64,8 @@ from cinder.volume import volume_types
 
 from cinder.taskflow import states
 
+from eventlet.greenpool import GreenPool
+
 LOG = logging.getLogger(__name__)
 
 QUOTAS = quota.QUOTAS
@@ -76,6 +78,10 @@ volume_manager_opts = [
                default=300,
                help='Timeout for creating the volume to migrate to '
                     'when performing volume migration (seconds)'),
+    cfg.BoolOpt('volume_service_inithost_offload',
+                default=False,
+                help='Offload pending volume delete during '
+                     'volume service startup'),
 ]
 
 CONF = cfg.CONF
@@ -144,6 +150,8 @@ class VolumeManager(manager.SchedulerDependentManager):
                 *args, **kwargs)
         self.configuration = Configuration(volume_manager_opts,
                                            config_group=service_name)
+        self._tp = GreenPool()
+
         if not volume_driver:
             # Get from configuration, which will get the default
             # if its not using the multi backend
@@ -165,6 +173,9 @@ class VolumeManager(manager.SchedulerDependentManager):
             configuration=self.configuration,
             db=self.db)
 
+    def _add_to_threadpool(self, func, *args, **kwargs):
+        self._tp.spawn_n(func, *args, **kwargs)
+
     def init_host(self):
         """Do any initialization that needs to be run if this is a
            standalone service."""
@@ -208,7 +219,15 @@ class VolumeManager(manager.SchedulerDependentManager):
         for volume in volumes:
             if volume['status'] == 'deleting':
                 LOG.info(_('Resuming delete on volume: %s') % volume['id'])
-                self.delete_volume(ctxt, volume['id'])
+                if CONF.volume_service_inithost_offload:
+                    # Offload all the pending volume delete operations to the
+                    # threadpool to prevent the main volume service thread
+                    # from being blocked.
+                    self._add_to_threadpool(self.delete_volume, ctxt,
+                                            volume['id'])
+                else:
+                    # By default, delete volumes sequentially
+                    self.delete_volume(ctxt, volume['id'])
 
         # collect and publish service capabilities
         self.publish_service_capabilities(ctxt)
diff --git a/etc/cinder/cinder.conf.sample b/etc/cinder/cinder.conf.sample
index df1d31f10..af0cf84ae 100644
--- a/etc/cinder/cinder.conf.sample
+++ b/etc/cinder/cinder.conf.sample
@@ -1760,6 +1760,10 @@
 # performing volume migration (seconds) (integer value)
 #migration_create_volume_timeout_secs=300
 
+# Offload pending volume delete during volume service startup
+# (boolean value)
+#volume_service_inithost_offload=false
+
 
 #
 # Options defined in cinder.volume.utils
 #
@@ -1770,4 +1774,4 @@
 #volume_dd_blocksize=1M
 
 
-# Total option count: 381
+# Total option count: 382
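On the manager side, init_host() can now hand each pending delete to an eventlet GreenPool instead of running them back to back, so the service registers itself without waiting on slow backend deletes. A rough, self-contained sketch of that offload pattern, assuming only that eventlet is installed; delete_volume and resume_deletes below are dummy stand-ins, not VolumeManager code:

import eventlet
from eventlet.greenpool import GreenPool


def delete_volume(ctxt, volume_id):
    # Dummy stand-in for VolumeManager.delete_volume(); a real Storwize
    # delete may block for hours waiting on FlashCopy.
    eventlet.sleep(1)
    print('deleted %s' % volume_id)


def resume_deletes(ctxt, volume_ids):
    pool = GreenPool()
    for volume_id in volume_ids:
        # Pass the callable and its arguments; calling delete_volume here
        # would run it synchronously and defeat the offload.
        pool.spawn_n(delete_volume, ctxt, volume_id)
    # init_host() does not block on the pool; waitall() is only so this
    # standalone example does not exit before the workers finish.
    pool.waitall()


if __name__ == '__main__':
    resume_deletes(None, ['vol-1', 'vol-2', 'vol-3'])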