From 43029941219505bc3861dd285e3393e08c76b433 Mon Sep 17 00:00:00 2001 From: "Walter A. Boring IV" Date: Fri, 27 Mar 2015 15:34:40 -0700 Subject: [PATCH] Brick: Fix race in removing iSCSI device Some folks are seeing race conditions on overloaded or slow machines, where the time between issuing the remove volume command, and the volume being from /dev/disk/by-path by the kernel isn't long enough. This patch adds a simple verify loop for iSCSI only. It will try 3 times and wait 2 seconds between tries. Also, This patch will only fix attaches that happen in cinder for copy volume <--> image operations. Nova needs a separate patch to fix any potential issues there. Change-Id: I5db9744da6b865b7d903503db8a895b409f32f5c Closes-Bug: 1437441 --- cinder/brick/exception.py | 4 +++ cinder/brick/initiator/connector.py | 3 ++ cinder/brick/initiator/linuxscsi.py | 34 +++++++++++++++++++++- cinder/tests/brick/test_brick_linuxscsi.py | 13 +++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/cinder/brick/exception.py b/cinder/brick/exception.py index 3078b7ece..0283c1567 100644 --- a/cinder/brick/exception.py +++ b/cinder/brick/exception.py @@ -97,6 +97,10 @@ class VolumeDeviceNotFound(BrickException): message = _("Volume device not found at %(device)s.") +class VolumePathNotRemoved(BrickException): + message = _("Volume path %(volume_path)s was not removed in time.") + + class VolumeGroupNotFound(BrickException): message = _('Unable to find Volume Group: %(vg_name)s') diff --git a/cinder/brick/initiator/connector.py b/cinder/brick/initiator/connector.py index ba469d622..6c7b56464 100644 --- a/cinder/brick/initiator/connector.py +++ b/cinder/brick/initiator/connector.py @@ -402,6 +402,9 @@ class ISCSIConnector(InitiatorConnector): dev_name = self._linuxscsi.get_name_from_path(host_device) if dev_name: self._linuxscsi.remove_scsi_device(dev_name) + # make sure the volume is gone before moving on + # Bug #1437441 - Race removing iscsi device + self._linuxscsi.wait_for_volume_removal(host_device) # NOTE(vish): Only disconnect from the target if no luns from the # target are in use. diff --git a/cinder/brick/initiator/linuxscsi.py b/cinder/brick/initiator/linuxscsi.py index 9fae5a5ff..aa1272c08 100644 --- a/cinder/brick/initiator/linuxscsi.py +++ b/cinder/brick/initiator/linuxscsi.py @@ -22,8 +22,10 @@ import re from oslo_concurrency import processutils as putils from oslo_log import log as logging +from cinder.brick import exception from cinder.brick import executor -from cinder.i18n import _, _LW +from cinder.i18n import _, _LW, _LE +from cinder.openstack.common import loopingcall LOG = logging.getLogger(__name__) @@ -65,6 +67,36 @@ class LinuxSCSI(executor.Executor): LOG.debug("Remove SCSI device(%s) with %s" % (device, path)) self.echo_scsi_command(path, "1") + def wait_for_volume_removal(self, volume_path): + """This is used to ensure that volumes are gone.""" + + def _wait_for_volume_removal(volume_path): + LOG.debug("Waiting for SCSI mount point %s to be removed.", + volume_path) + if os.path.exists(volume_path): + if self.tries >= self.scan_attempts: + msg = _LE("Exceeded the number of attempts to detect " + "volume removal.") + LOG.error(msg) + raise exception.VolumePathNotRemoved( + volume_path=volume_path) + + LOG.debug("%(path)s still exists, rescanning. Try number: " + "%(tries)s", + {'path': volume_path, 'tries': self.tries}) + self.tries = self.tries + 1 + else: + LOG.debug("SCSI mount point %s has been removed.", volume_path) + raise loopingcall.LoopingCallDone() + + # Setup a loop here to give the kernel time + # to remove the volume from /dev/disk/by-path/ + self.tries = 0 + self.scan_attempts = 3 + timer = loopingcall.FixedIntervalLoopingCall( + _wait_for_volume_removal, volume_path) + timer.start(interval=2).wait() + def get_device_info(self, device): (out, _err) = self._execute('sg_scan', device, run_as_root=True, root_helper=self._root_helper) diff --git a/cinder/tests/brick/test_brick_linuxscsi.py b/cinder/tests/brick/test_brick_linuxscsi.py index ed08d7fe7..cac64b2e8 100644 --- a/cinder/tests/brick/test_brick_linuxscsi.py +++ b/cinder/tests/brick/test_brick_linuxscsi.py @@ -17,6 +17,7 @@ import string from oslo_log import log as logging +from cinder.brick import exception from cinder.brick.initiator import linuxscsi from cinder import test @@ -62,6 +63,18 @@ class LinuxSCSITestCase(test.TestCase): ('tee -a /sys/block/sdc/device/delete')] self.assertEqual(expected_commands, self.cmds) + def test_wait_for_volume_removal(self): + fake_path = '/dev/disk/by-path/fake-iscsi-iqn-lun-0' + self.stubs.Set(os.path, "exists", lambda x: True) + self.assertRaises(exception.VolumePathNotRemoved, + self.linuxscsi.wait_for_volume_removal, + fake_path) + + self.stubs.Set(os.path, "exists", lambda x: False) + self.linuxscsi.wait_for_volume_removal(fake_path) + expected_commands = [] + self.assertEqual(expected_commands, self.cmds) + def test_flush_multipath_device(self): self.linuxscsi.flush_multipath_device('/dev/dm-9') expected_commands = [('multipath -f /dev/dm-9')] -- 2.45.2