From: Jeegn Chen Date: Tue, 10 Feb 2015 03:26:24 +0000 (+0800) Subject: More error handling on EMC VNX migration failure X-Git-Url: https://review.fuel-infra.org/gitweb?a=commitdiff_plain;h=6713e8f26deda134179ee6d6a53c1354223b300d;p=openstack-build%2Fcinder-build.git More error handling on EMC VNX migration failure If a LUN migration session is stopped or faulted after it has started, the current implementation of the VNX Cinder driver only waits for the timeout, and no logic cleans up the broken migration session. This patch adds logic to detect and clean up the stopped/faulted migration session. Change-Id: If66109ca45fce303390c4144b4120e75f1aae138 Closes-Bug: #1420075 --- diff --git a/cinder/tests/test_emc_vnxdirect.py b/cinder/tests/test_emc_vnxdirect.py index e1bd14a13..91c4919da 100644 --- a/cinder/tests/test_emc_vnxdirect.py +++ b/cinder/tests/test_emc_vnxdirect.py @@ -439,6 +439,26 @@ class EMCVNXCLIDriverTestData(): NDU_LIST_RESULT_WO_LICENSE = ( "Name of the software package: -Unisphere ", 0) + MIGRATE_PROPERTY_MIGRATING = """\ + Source LU Name: volume-f6247ae1-8e1c-4927-aa7e-7f8e272e5c3d + Source LU ID: 63950 + Dest LU Name: volume-f6247ae1-8e1c-4927-aa7e-7f8e272e5c3d_dest + Dest LU ID: 136 + Migration Rate: high + Current State: MIGRATING + Percent Complete: 50 + Time Remaining: 0 second(s) + """ + MIGRATE_PROPERTY_STOPPED = """\ + Source LU Name: volume-f6247ae1-8e1c-4927-aa7e-7f8e272e5c3d + Source LU ID: 63950 + Dest LU Name: volume-f6247ae1-8e1c-4927-aa7e-7f8e272e5c3d_dest + Dest LU ID: 136 + Migration Rate: high + Current State: STOPPED - Destination full + Percent Complete: 60 + Time Remaining: 0 second(s) + """ def SNAP_MP_CREATE_CMD(self, name='vol1', source='vol1'): return ('lun', '-create', '-type', 'snap', '-primaryLunName', @@ -482,6 +502,9 @@ class EMCVNXCLIDriverTestData(): def MIGRATION_VERIFY_CMD(self, src_id): return ("migrate", "-list", "-source", src_id) + def MIGRATION_CANCEL_CMD(self, src_id): + return ("migrate", "-cancel", "-source", src_id, '-o') + 
def GETPORT_CMD(self): return ("connection", "-getport", "-address", "-vlanid") @@ -1469,6 +1492,52 @@ Time Remaining: 0 second(s) poll=True)] fake_cli.assert_has_calls(expect_cmd) + @mock.patch("cinder.volume.drivers.emc.emc_vnx_cli." + "CommandLineHelper.create_lun_by_cmd", + mock.Mock( + return_value={'lun_id': 1})) + @mock.patch( + "cinder.volume.drivers.emc.emc_vnx_cli.EMCVnxCliBase.get_lun_id", + mock.Mock( + side_effect=[1, 1])) + @mock.patch( + "cinder.volume.drivers.emc.emc_vnx_cli.EMCVnxCliBase." + "get_lun_id_by_name", + mock.Mock(return_value=1)) + def test_volume_migration_stopped(self): + + commands = [self.testData.MIGRATION_CMD(), + self.testData.MIGRATION_VERIFY_CMD(1), + self.testData.MIGRATION_CANCEL_CMD(1)] + + results = [SUCCEED, [(self.testData.MIGRATE_PROPERTY_MIGRATING, 0), + (self.testData.MIGRATE_PROPERTY_STOPPED, 0), + ('The specified source LUN is not ' + 'currently migrating', 23)], + SUCCEED] + fake_cli = self.driverSetup(commands, results) + fake_host = {'capabilities': {'location_info': + "unit_test_pool2|fakeSerial", + 'storage_protocol': 'iSCSI'}} + + self.assertRaisesRegexp(exception.VolumeBackendAPIException, + "Migration of LUN 1 has been stopped or" + " faulted.", + self.driver.migrate_volume, + None, self.testData.test_volume, fake_host) + + expect_cmd = [mock.call(*self.testData.MIGRATION_CMD(), + retry_disable=True, + poll=True), + mock.call(*self.testData.MIGRATION_VERIFY_CMD(1), + poll=True), + mock.call(*self.testData.MIGRATION_VERIFY_CMD(1), + poll=False), + mock.call(*self.testData.MIGRATION_CANCEL_CMD(1)), + mock.call(*self.testData.MIGRATION_VERIFY_CMD(1), + poll=False)] + fake_cli.assert_has_calls(expect_cmd) + def test_create_destroy_volume_snapshot(self): fake_cli = self.driverSetup() @@ -1930,11 +1999,17 @@ Time Remaining: 0 second(s) cmd_detach_lun = ('lun', '-detach', '-name', 'vol2') output_migrate = ("", 0) cmd_migrate_verify = self.testData.MIGRATION_VERIFY_CMD(1) + output_migrate_verify = (r'The specified 
source LUN ' + 'is not currently migrating', 23) + cmd_migrate_cancel = self.testData.MIGRATION_CANCEL_CMD(1) + output_migrate_cancel = ("", 0) commands = [cmd_dest, cmd_dest_np, cmd_migrate, - cmd_migrate_verify] + cmd_migrate_verify, cmd_migrate_cancel] results = [output_dest, output_dest, output_migrate, - FAKE_ERROR_RETURN] + [FAKE_ERROR_RETURN, output_migrate_verify], + output_migrate_cancel] + fake_cli = self.driverSetup(commands, results) self.assertRaises(exception.VolumeBackendAPIException, @@ -1962,6 +2037,9 @@ Time Remaining: 0 second(s) poll=True), mock.call(*self.testData.MIGRATION_VERIFY_CMD(1), poll=True), + mock.call(*self.testData.MIGRATION_CANCEL_CMD(1)), + mock.call(*self.testData.MIGRATION_VERIFY_CMD(1), + poll=False), mock.call(*self.testData.LUN_DELETE_CMD('vol2_dest')), mock.call(*cmd_detach_lun), mock.call(*self.testData.LUN_DELETE_CMD('vol2'))] diff --git a/cinder/volume/drivers/emc/emc_vnx_cli.py b/cinder/volume/drivers/emc/emc_vnx_cli.py index afd7d4c63..54750e382 100644 --- a/cinder/volume/drivers/emc/emc_vnx_cli.py +++ b/cinder/volume/drivers/emc/emc_vnx_cli.py @@ -257,9 +257,11 @@ class CommandLineHelper(object): CLI_RESP_PATTERN_LUN_NOT_EXIST = 'The (pool lun) may not exist' CLI_RESP_PATTERN_SMP_NOT_ATTACHED = ('The specified Snapshot mount point ' 'is not currently attached.') - CLI_RESP_PATTERN_SG_NAME_IN_USE = "Storage Group name already in use" - CLI_RESP_PATTERN_LUN_IN_SG_1 = "contained in a Storage Group" - CLI_RESP_PATTERN_LUN_IN_SG_2 = "Host LUN/LUN mapping still exists" + CLI_RESP_PATTERN_SG_NAME_IN_USE = 'Storage Group name already in use' + CLI_RESP_PATTERN_LUN_IN_SG_1 = 'contained in a Storage Group' + CLI_RESP_PATTERN_LUN_IN_SG_2 = 'Host LUN/LUN mapping still exists' + CLI_RESP_PATTERN_LUN_NOT_MIGRATING = ('The specified source LUN ' + 'is not currently migrating') def __init__(self, configuration): configuration.append_config_values(san.san_opts) @@ -900,30 +902,76 @@ class CommandLineHelper(object): LOG.debug("Migration 
output: %s", out) if rc == 0: # parse the percentage - out = re.split(r'\n', out) - log = "Migration in process %s %%." % out[7].split(": ")[1] - LOG.debug(log) + state = re.search(r'Current State:\s*([^\n]+)', out) + percentage = re.search(r'Percent Complete:\s*([^\n]+)', out) + if state is not None: + current_state = state.group(1) + percentage_complete = percentage.group(1) + else: + self._raise_cli_error(cmd_migrate_list, rc, out) + if ("FAULTED" in current_state or + "STOPPED" in current_state): + reason = _("Migration of LUN %s has been stopped or" + " faulted.") % src_id + raise exception.VolumeBackendAPIException(data=reason) + if ("TRANSITIONING" in current_state or + "MIGRATING" in current_state): + LOG.debug("Migration of LUN %(src_id)s in process " + "%(percentage)s %%.", + {"src_id": src_id, + "percentage": percentage_complete}) else: - if re.search(r'The specified source LUN ' - 'is not currently migrating', out): + if re.search(self.CLI_RESP_PATTERN_LUN_NOT_MIGRATING, out): LOG.debug("Migration of LUN %s is finished.", src_id) mig_ready = True else: - reason = _("Querying migrating status error.") - LOG.error(reason) - raise exception.VolumeBackendAPIException( - data="%(reason)s : %(output)s" % - {'reason': reason, 'output': out}) + self._raise_cli_error(cmd_migrate_list, rc, out) return mig_ready + def migration_disappeared(poll=False): + cmd_migrate_list = ('migrate', '-list', '-source', src_id) + out, rc = self.command_execute(*cmd_migrate_list, + poll=poll) + if rc != 0: + if re.search(self.CLI_RESP_PATTERN_LUN_NOT_MIGRATING, out): + LOG.debug("Migration of LUN %s is finished.", src_id) + return True + else: + LOG.error(_LE("Failed to query migration status of LUN."), + src_id) + self._raise_cli_error(cmd_migrate_list, rc, out) + return False + eventlet.sleep(INTERVAL_30_SEC) - if migration_is_ready(True): - return True - self._wait_for_a_condition(migration_is_ready, - interval=INTERVAL_30_SEC) + + try: + if migration_is_ready(True): + return 
True + self._wait_for_a_condition( + migration_is_ready, + interval=INTERVAL_30_SEC, + ignorable_exception_arbiter=lambda ex: + type(ex) is not exception.VolumeBackendAPIException) + # Migration cancellation for clean up + except exception.VolumeBackendAPIException: + with excutils.save_and_reraise_exception(): + LOG.error(_LE("Migration of LUN %s failed to complete."), + src_id) + self.migration_cancel(src_id) + self._wait_for_a_condition(migration_disappeared, + interval=INTERVAL_30_SEC) return True + # Cancel migration in case where status is faulted or stopped + def migration_cancel(self, src_id): + LOG.info(_LI("Cancelling Migration from LUN %s."), src_id) + cmd_migrate_cancel = ('migrate', '-cancel', '-source', src_id, + '-o') + out, rc = self.command_execute(*cmd_migrate_cancel) + if rc != 0: + self._raise_cli_error(cmd_migrate_cancel, rc, out) + def get_storage_group(self, name, poll=True): # ALU/HLU as key/value map @@ -1604,7 +1652,7 @@ class CommandLineHelper(object): class EMCVnxCliBase(object): """This class defines the functions to use the native CLI functionality.""" - VERSION = '05.03.04' + VERSION = '05.03.05' stats = {'driver_version': VERSION, 'storage_protocol': None, 'vendor_name': 'EMC',