From: Mudassir Latif
Date: Wed, 2 Sep 2015 22:07:04 +0000 (-0700)
Subject: Allow replicated volumes to be recoverable
X-Git-Url: https://review.fuel-infra.org/gitweb?a=commitdiff_plain;h=519175176032e6c45149b7e234b035ad2f2e9abb;p=openstack-build%2Fcinder-build.git

Allow replicated volumes to be recoverable

Currently, once a replicated volume is in the 'error' or 'failed-over'
state, nothing more can be done with it. We need to allow
enable_replication after the admin has resolved whatever issue
prevented the operation, and likewise after a fail-over, where
replication currently cannot be re-enabled.

This change makes more states recoverable via enable_replication. It
also sets the state to 'error' more uniformly when an operation fails.

'enabled' --> replication is on.
'disabled' --> replication is off by design.
'failed-over' --> we have just failed over. Replication is off.
'error' --> an error occurred during the last operation.

The allowed transitions are:
'enabled' --> 'failed-over'/'disabled'/'error'
'error' --> 'enabled'/'error'
'disabled' --> 'enabled'/'error'
'failed-over' --> 'enabled'/'error'

Change-Id: Icbc22b8501b048b1755019305430177ba0a9385f
Partial-Bug: #1491631
---

diff --git a/cinder/tests/unit/test_replication.py b/cinder/tests/unit/test_replication.py
index 830465d75..1f0bc33c3 100644
--- a/cinder/tests/unit/test_replication.py
+++ b/cinder/tests/unit/test_replication.py
@@ -30,9 +30,9 @@ from cinder.volume import driver
 CONF = cfg.CONF
 
 
-class VolumeReplicationTestCase(test.TestCase):
+class VolumeReplicationTestCaseBase(test.TestCase):
     def setUp(self):
-        super(VolumeReplicationTestCase, self).setUp()
+        super(VolumeReplicationTestCaseBase, self).setUp()
         self.ctxt = context.RequestContext('user', 'fake', False)
         self.adm_ctxt = context.RequestContext('admin', 'fake', True)
         self.manager = importutils.import_object(CONF.volume_manager)
@@ -42,6 +42,8 @@ class VolumeReplicationTestCase(test.TestCase):
                                          spec=driver.VolumeDriver)
         self.driver = self.driver_patcher.start()
 
+
+class VolumeReplicationTestCase(VolumeReplicationTestCaseBase):
     @mock.patch('cinder.utils.require_driver_initialized')
     def test_promote_replica_uninit_driver(self, _init):
         """Test promote replication when driver is not initialized."""
@@ -111,3 +113,183 @@ class VolumeReplicationTestCase(test.TestCase):
                           self.manager.reenable_replication,
                           self.adm_ctxt,
                           vol['id'])
+
+
+class VolumeManagerReplicationV2Tests(VolumeReplicationTestCaseBase):
+    mock_driver = None
+    mock_db = None
+    vol = None
+
+    def setUp(self):
+        super(VolumeManagerReplicationV2Tests, self).setUp()
+        self.mock_db = mock.Mock()
+        self.mock_driver = mock.Mock()
+        self.vol = test_utils.create_volume(self.ctxt,
+                                            status='available',
+                                            replication_status='enabling')
+        self.manager.driver = self.mock_driver
+        self.mock_driver.replication_enable.return_value = \
+            {'replication_status': 'enabled'}
+        self.mock_driver.replication_disable.return_value = \
+            {'replication_status': 'disabled'}
+        self.manager.db = self.mock_db
+        self.mock_db.volume_get.return_value = self.vol
+        self.mock_db.volume_update.return_value = self.vol
+
+    # enable_replication tests
+    @mock.patch('cinder.utils.require_driver_initialized')
+    def test_enable_replication_uninitialized_driver(self,
+                                                     mock_require_driver_init):
+        mock_require_driver_init.side_effect = exception.DriverNotInitialized
+        self.assertRaises(exception.DriverNotInitialized,
+                          self.manager.enable_replication,
+                          self.ctxt,
+                          self.vol)
+
+    def test_enable_replication_error_state(self):
+        self.vol['replication_status'] = 'error'
+
self.assertRaises(exception.InvalidVolume, + self.manager.enable_replication, + self.ctxt, + self.vol) + + def test_enable_replication_driver_raises_cinder_exception(self): + self.mock_driver.replication_enable.side_effect = \ + exception.CinderException + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.enable_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_enable_replication_driver_raises_exception(self): + self.mock_driver.replication_enable.side_effect = Exception + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.enable_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_enable_replication_success(self): + self.manager.enable_replication(self.ctxt, self.vol) + # volume_update is called multiple times + self.mock_db.volume_update.side_effect = [self.vol, self.vol] + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'enabled'}) + + # disable_replication tests + @mock.patch('cinder.utils.require_driver_initialized') + def test_disable_replication_uninitialized_driver(self, + mock_req_driver_init): + mock_req_driver_init.side_effect = exception.DriverNotInitialized + self.assertRaises(exception.DriverNotInitialized, + self.manager.disable_replication, + self.ctxt, + self.vol) + + def test_disable_replication_error_state(self): + self.vol['replication_status'] = 'error' + self.assertRaises(exception.InvalidVolume, + self.manager.disable_replication, + self.ctxt, + self.vol) + + def test_disable_replication_driver_raises_cinder_exception(self): + self.vol['replication_status'] = 'disabling' + self.mock_driver.replication_disable.side_effect = \ + exception.CinderException + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.disable_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_disable_replication_driver_raises_exception(self): + self.vol['replication_status'] = 'disabling' + self.mock_driver.replication_disable.side_effect = Exception + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.disable_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_disable_replication_success(self): + self.vol['replication_status'] = 'disabling' + self.manager.disable_replication(self.ctxt, self.vol) + # volume_update is called multiple times + self.mock_db.volume_update.side_effect = [self.vol, self.vol] + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'disabled'}) + + # failover_replication tests + @mock.patch('cinder.utils.require_driver_initialized') + def test_failover_replication_uninitialized_driver(self, + mock_driver_init): + self.vol['replication_status'] = 'enabling_secondary' + # validate that driver is called even if uninitialized + mock_driver_init.side_effect = exception.DriverNotInitialized + self.manager.failover_replication(self.ctxt, self.vol) + # volume_update is called multiple times + self.mock_db.volume_update.side_effect = [self.vol, self.vol] + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + 
{'replication_status': 'failed-over'}) + + def test_failover_replication_error_state(self): + self.vol['replication_status'] = 'error' + self.assertRaises(exception.InvalidVolume, + self.manager.failover_replication, + self.ctxt, + self.vol) + + def test_failover_replication_driver_raises_cinder_exception(self): + self.vol['replication_status'] = 'enabling_secondary' + self.mock_driver.replication_failover.side_effect = \ + exception.CinderException + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.failover_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_failover_replication_driver_raises_exception(self): + self.vol['replication_status'] = 'enabling_secondary' + self.mock_driver.replication_failover.side_effect = Exception + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.failover_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_failover_replication_success(self): + self.vol['replication_status'] = 'enabling_secondary' + self.manager.failover_replication(self.ctxt, self.vol) + # volume_update is called multiple times + self.mock_db.volume_update.side_effect = [self.vol, self.vol] + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'failed-over'}) diff --git a/cinder/volume/api.py b/cinder/volume/api.py index 830febb3f..fc161a9f4 100644 --- a/cinder/volume/api.py +++ b/cinder/volume/api.py @@ -1611,7 +1611,7 @@ class API(base.Base): # Replication V2 methods ## - # NOTE(jdg): It might be kinda silly to propogate the named + # NOTE(jdg): It might be kinda silly to propagate the named # args with defaults all the way down through rpc into manager # but for now the consistency is useful, and there may be # some usefulness in the future (direct calls in manager?) @@ -1641,7 +1641,7 @@ class API(base.Base): # call to driver regardless of replication_status, most likely # this indicates an issue with the driver, but might be useful # cases to consider modifying this for in the future. - valid_rep_status = ['disabled'] + valid_rep_status = ['disabled', 'failed-over', 'error'] rep_status = volume.get('replication_status', valid_rep_status[0]) if rep_status not in valid_rep_status: diff --git a/cinder/volume/driver.py b/cinder/volume/driver.py index b5ecd979c..210d978e4 100644 --- a/cinder/volume/driver.py +++ b/cinder/volume/driver.py @@ -1672,7 +1672,7 @@ class ReplicaV2VD(object): The replication_driver_data response is vendor unique, data returned/used by the driver. It is expected that - the reponse from the driver is in the appropriate db update + the response from the driver is in the appropriate db update format, in the form of a dict, where the vendor data is stored under the key 'replication_driver_data' @@ -1689,10 +1689,10 @@ class ReplicaV2VD(object): """Disable replication on the specified volume. If the specified volume is currently replication enabled, - this method can be used to disable the replciation process + this method can be used to disable the replication process on the backend. - Note that we still send this call to a driver whos volume + Note that we still send this call to a driver whose volume may report replication-disabled already. We do this as a safety mechanism to allow a driver to cleanup any mismatch in state between Cinder and itself. 
@@ -1708,7 +1708,7 @@ class ReplicaV2VD(object): The replication_driver_data response is vendor unique, data returned/used by the driver. It is expected that - the reponse from the driver is in the appropriate db update + the response from the driver is in the appropriate db update format, in the form of a dict, where the vendor data is stored under the key 'replication_driver_data' @@ -1734,7 +1734,7 @@ class ReplicaV2VD(object): the replication target is a configured cinder backend, we'll just update the host column for the volume. - Very important point here is that in the case of a succesful + Very important point here is that in the case of a successful failover, we want to update the replication_status of the volume to "failed-over". This way there's an indication that things worked as expected, and that it's evident that the volume @@ -1744,7 +1744,7 @@ class ReplicaV2VD(object): :param context: security context :param volume: volume object returned by DB :param secondary: Specifies rep target to fail over to - :response: dict of udpates + :response: dict of updates So the response would take the form: {host: , @@ -1780,7 +1780,7 @@ class ReplicaV2VD(object): Example response for replicating to an unmanaged backend: {'volume_id': volume['id'], - 'targets':[{'type': 'managed', + 'targets':[{'type': 'unmanaged', 'vendor-key-1': 'value-1'}...] NOTE: It's the responsibility of the driver to mask out any diff --git a/cinder/volume/manager.py b/cinder/volume/manager.py index b9fd6b980..7fb57d4be 100644 --- a/cinder/volume/manager.py +++ b/cinder/volume/manager.py @@ -3197,8 +3197,19 @@ class VolumeManager(manager.SchedulerDependentManager): except exception.CinderException: err_msg = (_("Enable replication for volume failed.")) LOG.exception(err_msg, resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) raise exception.VolumeBackendAPIException(data=err_msg) + except Exception: + msg = _('enable_replication caused exception in driver.') + LOG.exception(msg, resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) + raise exception.VolumeBackendAPIException(data=msg) + try: if rep_driver_data: volume = self.db.volume_update(context, @@ -3246,7 +3257,19 @@ class VolumeManager(manager.SchedulerDependentManager): except exception.CinderException: err_msg = (_("Disable replication for volume failed.")) LOG.exception(err_msg, resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) raise exception.VolumeBackendAPIException(data=err_msg) + + except Exception: + msg = _('disable_replication caused exception in driver.') + LOG.exception(msg, resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) + raise exception.VolumeBackendAPIException(msg) + try: if rep_driver_data: volume = self.db.volume_update(context, @@ -3255,6 +3278,9 @@ class VolumeManager(manager.SchedulerDependentManager): except exception.CinderException as ex: LOG.exception(_LE("Driver replication data update failed."), resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) raise exception.VolumeBackendAPIException(reason=ex) self.db.volume_update(context, volume['id'], @@ -3288,6 +3314,13 @@ class VolumeManager(manager.SchedulerDependentManager): # not being able to talk to the primary array that it's configured # to manage. 
+ if volume['replication_status'] != 'enabling_secondary': + msg = (_("Unable to failover replication due to invalid " + "replication status: %(status)s.") % + {'status': volume['replication_status']}) + LOG.error(msg, resource=volume) + raise exception.InvalidVolume(reason=msg) + try: volume = self.db.volume_get(context, volume['id']) model_update = self.driver.replication_failover(context, @@ -3320,6 +3353,14 @@ class VolumeManager(manager.SchedulerDependentManager): {'replication_status': 'error'}) raise exception.VolumeBackendAPIException(data=err_msg) + except Exception: + msg = _('replication_failover caused exception in driver.') + LOG.exception(msg, resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) + raise exception.VolumeBackendAPIException(msg) + if model_update: try: volume = self.db.volume_update( @@ -3330,12 +3371,15 @@ class VolumeManager(manager.SchedulerDependentManager): except exception.CinderException as ex: LOG.exception(_LE("Driver replication data update failed."), resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) raise exception.VolumeBackendAPIException(reason=ex) # NOTE(jdg): We're setting replication status to failed-over - # which indicates the volume is ok, things went as epected but + # which indicates the volume is ok, things went as expected but # we're likely not replicating any longer because... well we - # did a fail-over. In the case of admin brining primary + # did a fail-over. In the case of admin bringing primary # back online he/she can use enable_replication to get this # state set back to enabled. diff --git a/doc/source/devref/replication.rst b/doc/source/devref/replication.rst index bd1371e67..14475891b 100644 --- a/doc/source/devref/replication.rst +++ b/doc/source/devref/replication.rst @@ -208,7 +208,7 @@ This may be used for triggering a fail-over manually or for testing purposes. Note that ideally drivers will know how to update the volume reference properly so that Cinder is now pointing to the secondary. Also, while it's not required, at this time; ideally the command would -act as a toggle, allowing to switch back and forth betweeen primary and secondary and back to primary. +act as a toggle, allowing to switch back and forth between primary and secondary and back to primary. **list_replication_targets**
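
For reference, the replication_status state machine described in the commit
message can be summarized in a few lines of Python. This is only an
illustrative sketch of the allowed transitions, not code from the patch; the
table and helper name below are hypothetical.

# Illustrative sketch only -- not part of the patch above.
# Encodes the allowed replication_status transitions from the commit message.
ALLOWED_TRANSITIONS = {
    'enabled': {'failed-over', 'disabled', 'error'},
    'error': {'enabled', 'error'},
    'disabled': {'enabled', 'error'},
    'failed-over': {'enabled', 'error'},
}


def is_valid_transition(current, target):
    """Return True if replication_status may move from current to target."""
    return target in ALLOWED_TRANSITIONS.get(current, set())


# With this change, failed-over and errored volumes can be re-enabled:
assert is_valid_transition('failed-over', 'enabled')
assert is_valid_transition('error', 'enabled')
# ...but a disabled volume still cannot jump straight to failed-over:
assert not is_valid_transition('disabled', 'failed-over')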