From 519175176032e6c45149b7e234b035ad2f2e9abb Mon Sep 17 00:00:00 2001 From: Mudassir Latif Date: Wed, 2 Sep 2015 15:07:04 -0700 Subject: [PATCH] Allow replicated volumes to be recoverable MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Currently, once they are in ‘error’ or ‘failed-over’ states you can’t do anything with them. We need to allow enable_replication after the admin has resolved any issues that may have prevented the operation. Same thing for after a fail-over, you cannot enable replication on it. This change allows for more states to be recoverable via enable replication. It also more uniformly sets the state to ‘error’ when something bad happens. 'enabled' --> replication is on. 'disabled' --> replication is off by design. 'failed-over' --> we have just failed over. Replication is off. 'error' --> an error occurred during the last operation. allowed flow is: 'enabled' --> 'failed-over'/'disabled'/'error' 'error' --> 'enabled'/'error' 'disabled' --> 'enabled'/'error' 'failed-over' --> 'enabled'/'error' Change-Id: Icbc22b8501b048b1755019305430177ba0a9385f Partial-Bug: #1491631 --- cinder/tests/unit/test_replication.py | 186 +++++++++++++++++++++++++- cinder/volume/api.py | 4 +- cinder/volume/driver.py | 14 +- cinder/volume/manager.py | 48 ++++++- doc/source/devref/replication.rst | 2 +- 5 files changed, 240 insertions(+), 14 deletions(-) diff --git a/cinder/tests/unit/test_replication.py b/cinder/tests/unit/test_replication.py index 830465d75..1f0bc33c3 100644 --- a/cinder/tests/unit/test_replication.py +++ b/cinder/tests/unit/test_replication.py @@ -30,9 +30,9 @@ from cinder.volume import driver CONF = cfg.CONF -class VolumeReplicationTestCase(test.TestCase): +class VolumeReplicationTestCaseBase(test.TestCase): def setUp(self): - super(VolumeReplicationTestCase, self).setUp() + super(VolumeReplicationTestCaseBase, self).setUp() self.ctxt = context.RequestContext('user', 'fake', False) self.adm_ctxt = context.RequestContext('admin', 'fake', True) self.manager = importutils.import_object(CONF.volume_manager) @@ -42,6 +42,8 @@ class VolumeReplicationTestCase(test.TestCase): spec=driver.VolumeDriver) self.driver = self.driver_patcher.start() + +class VolumeReplicationTestCase(VolumeReplicationTestCaseBase): @mock.patch('cinder.utils.require_driver_initialized') def test_promote_replica_uninit_driver(self, _init): """Test promote replication when driver is not initialized.""" @@ -111,3 +113,183 @@ class VolumeReplicationTestCase(test.TestCase): self.manager.reenable_replication, self.adm_ctxt, vol['id']) + + +class VolumeManagerReplicationV2Tests(VolumeReplicationTestCaseBase): + mock_driver = None + mock_db = None + vol = None + + def setUp(self): + super(VolumeManagerReplicationV2Tests, self).setUp() + self.mock_db = mock.Mock() + self.mock_driver = mock.Mock() + self.vol = test_utils.create_volume(self.ctxt, + status='available', + replication_status='enabling') + self.manager.driver = self.mock_driver + self.mock_driver.replication_enable.return_value = \ + {'replication_status': 'enabled'} + self.mock_driver.replication_disable.return_value = \ + {'replication_status': 'disabled'} + self.manager.db = self.mock_db + self.mock_db.volume_get.return_value = self.vol + self.mock_db.volume_update.return_value = self.vol + + # enable_replication tests + @mock.patch('cinder.utils.require_driver_initialized') + def test_enable_replication_uninitialized_driver(self, + mock_require_driver_init): + mock_require_driver_init.side_effect = 
exception.DriverNotInitialized + self.assertRaises(exception.DriverNotInitialized, + self.manager.enable_replication, + self.ctxt, + self.vol) + + def test_enable_replication_error_state(self): + self.vol['replication_status'] = 'error' + self.assertRaises(exception.InvalidVolume, + self.manager.enable_replication, + self.ctxt, + self.vol) + + def test_enable_replication_driver_raises_cinder_exception(self): + self.mock_driver.replication_enable.side_effect = \ + exception.CinderException + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.enable_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_enable_replication_driver_raises_exception(self): + self.mock_driver.replication_enable.side_effect = Exception + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.enable_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_enable_replication_success(self): + self.manager.enable_replication(self.ctxt, self.vol) + # volume_update is called multiple times + self.mock_db.volume_update.side_effect = [self.vol, self.vol] + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'enabled'}) + + # disable_replication tests + @mock.patch('cinder.utils.require_driver_initialized') + def test_disable_replication_uninitialized_driver(self, + mock_req_driver_init): + mock_req_driver_init.side_effect = exception.DriverNotInitialized + self.assertRaises(exception.DriverNotInitialized, + self.manager.disable_replication, + self.ctxt, + self.vol) + + def test_disable_replication_error_state(self): + self.vol['replication_status'] = 'error' + self.assertRaises(exception.InvalidVolume, + self.manager.disable_replication, + self.ctxt, + self.vol) + + def test_disable_replication_driver_raises_cinder_exception(self): + self.vol['replication_status'] = 'disabling' + self.mock_driver.replication_disable.side_effect = \ + exception.CinderException + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.disable_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_disable_replication_driver_raises_exception(self): + self.vol['replication_status'] = 'disabling' + self.mock_driver.replication_disable.side_effect = Exception + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.disable_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_disable_replication_success(self): + self.vol['replication_status'] = 'disabling' + self.manager.disable_replication(self.ctxt, self.vol) + # volume_update is called multiple times + self.mock_db.volume_update.side_effect = [self.vol, self.vol] + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'disabled'}) + + # failover_replication tests + @mock.patch('cinder.utils.require_driver_initialized') + def test_failover_replication_uninitialized_driver(self, + mock_driver_init): + self.vol['replication_status'] = 'enabling_secondary' + # validate that driver is called even if uninitialized + mock_driver_init.side_effect = exception.DriverNotInitialized + 
self.manager.failover_replication(self.ctxt, self.vol) + # volume_update is called multiple times + self.mock_db.volume_update.side_effect = [self.vol, self.vol] + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'failed-over'}) + + def test_failover_replication_error_state(self): + self.vol['replication_status'] = 'error' + self.assertRaises(exception.InvalidVolume, + self.manager.failover_replication, + self.ctxt, + self.vol) + + def test_failover_replication_driver_raises_cinder_exception(self): + self.vol['replication_status'] = 'enabling_secondary' + self.mock_driver.replication_failover.side_effect = \ + exception.CinderException + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.failover_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_failover_replication_driver_raises_exception(self): + self.vol['replication_status'] = 'enabling_secondary' + self.mock_driver.replication_failover.side_effect = Exception + self.assertRaises(exception.VolumeBackendAPIException, + self.manager.failover_replication, + self.ctxt, + self.vol) + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'error'}) + + def test_failover_replication_success(self): + self.vol['replication_status'] = 'enabling_secondary' + self.manager.failover_replication(self.ctxt, self.vol) + # volume_update is called multiple times + self.mock_db.volume_update.side_effect = [self.vol, self.vol] + self.mock_db.volume_update.assert_called_with( + self.ctxt, + self.vol['id'], + {'replication_status': 'failed-over'}) diff --git a/cinder/volume/api.py b/cinder/volume/api.py index 830febb3f..fc161a9f4 100644 --- a/cinder/volume/api.py +++ b/cinder/volume/api.py @@ -1611,7 +1611,7 @@ class API(base.Base): # Replication V2 methods ## - # NOTE(jdg): It might be kinda silly to propogate the named + # NOTE(jdg): It might be kinda silly to propagate the named # args with defaults all the way down through rpc into manager # but for now the consistency is useful, and there may be # some usefulness in the future (direct calls in manager?) @@ -1641,7 +1641,7 @@ class API(base.Base): # call to driver regardless of replication_status, most likely # this indicates an issue with the driver, but might be useful # cases to consider modifying this for in the future. - valid_rep_status = ['disabled'] + valid_rep_status = ['disabled', 'failed-over', 'error'] rep_status = volume.get('replication_status', valid_rep_status[0]) if rep_status not in valid_rep_status: diff --git a/cinder/volume/driver.py b/cinder/volume/driver.py index b5ecd979c..210d978e4 100644 --- a/cinder/volume/driver.py +++ b/cinder/volume/driver.py @@ -1672,7 +1672,7 @@ class ReplicaV2VD(object): The replication_driver_data response is vendor unique, data returned/used by the driver. It is expected that - the reponse from the driver is in the appropriate db update + the response from the driver is in the appropriate db update format, in the form of a dict, where the vendor data is stored under the key 'replication_driver_data' @@ -1689,10 +1689,10 @@ class ReplicaV2VD(object): """Disable replication on the specified volume. If the specified volume is currently replication enabled, - this method can be used to disable the replciation process + this method can be used to disable the replication process on the backend. 
- Note that we still send this call to a driver whos volume + Note that we still send this call to a driver whose volume may report replication-disabled already. We do this as a safety mechanism to allow a driver to cleanup any mismatch in state between Cinder and itself. @@ -1708,7 +1708,7 @@ class ReplicaV2VD(object): The replication_driver_data response is vendor unique, data returned/used by the driver. It is expected that - the reponse from the driver is in the appropriate db update + the response from the driver is in the appropriate db update format, in the form of a dict, where the vendor data is stored under the key 'replication_driver_data' @@ -1734,7 +1734,7 @@ class ReplicaV2VD(object): the replication target is a configured cinder backend, we'll just update the host column for the volume. - Very important point here is that in the case of a succesful + Very important point here is that in the case of a successful failover, we want to update the replication_status of the volume to "failed-over". This way there's an indication that things worked as expected, and that it's evident that the volume @@ -1744,7 +1744,7 @@ class ReplicaV2VD(object): :param context: security context :param volume: volume object returned by DB :param secondary: Specifies rep target to fail over to - :response: dict of udpates + :response: dict of updates So the response would take the form: {host: , @@ -1780,7 +1780,7 @@ class ReplicaV2VD(object): Example response for replicating to an unmanaged backend: {'volume_id': volume['id'], - 'targets':[{'type': 'managed', + 'targets':[{'type': 'unmanaged', 'vendor-key-1': 'value-1'}...] NOTE: It's the responsibility of the driver to mask out any diff --git a/cinder/volume/manager.py b/cinder/volume/manager.py index b9fd6b980..7fb57d4be 100644 --- a/cinder/volume/manager.py +++ b/cinder/volume/manager.py @@ -3197,8 +3197,19 @@ class VolumeManager(manager.SchedulerDependentManager): except exception.CinderException: err_msg = (_("Enable replication for volume failed.")) LOG.exception(err_msg, resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) raise exception.VolumeBackendAPIException(data=err_msg) + except Exception: + msg = _('enable_replication caused exception in driver.') + LOG.exception(msg, resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) + raise exception.VolumeBackendAPIException(data=msg) + try: if rep_driver_data: volume = self.db.volume_update(context, @@ -3246,7 +3257,19 @@ class VolumeManager(manager.SchedulerDependentManager): except exception.CinderException: err_msg = (_("Disable replication for volume failed.")) LOG.exception(err_msg, resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) raise exception.VolumeBackendAPIException(data=err_msg) + + except Exception: + msg = _('disable_replication caused exception in driver.') + LOG.exception(msg, resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) + raise exception.VolumeBackendAPIException(msg) + try: if rep_driver_data: volume = self.db.volume_update(context, @@ -3255,6 +3278,9 @@ class VolumeManager(manager.SchedulerDependentManager): except exception.CinderException as ex: LOG.exception(_LE("Driver replication data update failed."), resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) raise exception.VolumeBackendAPIException(reason=ex) 
self.db.volume_update(context, volume['id'], @@ -3288,6 +3314,13 @@ class VolumeManager(manager.SchedulerDependentManager): # not being able to talk to the primary array that it's configured # to manage. + if volume['replication_status'] != 'enabling_secondary': + msg = (_("Unable to failover replication due to invalid " + "replication status: %(status)s.") % + {'status': volume['replication_status']}) + LOG.error(msg, resource=volume) + raise exception.InvalidVolume(reason=msg) + try: volume = self.db.volume_get(context, volume['id']) model_update = self.driver.replication_failover(context, @@ -3320,6 +3353,14 @@ class VolumeManager(manager.SchedulerDependentManager): {'replication_status': 'error'}) raise exception.VolumeBackendAPIException(data=err_msg) + except Exception: + msg = _('replication_failover caused exception in driver.') + LOG.exception(msg, resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) + raise exception.VolumeBackendAPIException(msg) + if model_update: try: volume = self.db.volume_update( @@ -3330,12 +3371,15 @@ class VolumeManager(manager.SchedulerDependentManager): except exception.CinderException as ex: LOG.exception(_LE("Driver replication data update failed."), resource=volume) + self.db.volume_update(context, + volume['id'], + {'replication_status': 'error'}) raise exception.VolumeBackendAPIException(reason=ex) # NOTE(jdg): We're setting replication status to failed-over - # which indicates the volume is ok, things went as epected but + # which indicates the volume is ok, things went as expected but # we're likely not replicating any longer because... well we - # did a fail-over. In the case of admin brining primary + # did a fail-over. In the case of admin bringing primary # back online he/she can use enable_replication to get this # state set back to enabled. diff --git a/doc/source/devref/replication.rst b/doc/source/devref/replication.rst index bd1371e67..14475891b 100644 --- a/doc/source/devref/replication.rst +++ b/doc/source/devref/replication.rst @@ -208,7 +208,7 @@ This may be used for triggering a fail-over manually or for testing purposes. Note that ideally drivers will know how to update the volume reference properly so that Cinder is now pointing to the secondary. Also, while it's not required, at this time; ideally the command would -act as a toggle, allowing to switch back and forth betweeen primary and secondary and back to primary. +act as a toggle, allowing to switch back and forth between primary and secondary and back to primary. **list_replication_targets** -- 2.45.2
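
For reference, the replication_status flow described in the commit message can be summarized as a small transition table. A minimal sketch follows (illustrative only; ALLOWED_TRANSITIONS and can_transition are hypothetical helpers, not Cinder code — they simply restate the flow this patch enforces in cinder/volume/api.py and cinder/volume/manager.py):

    # Hypothetical helper, for illustration only -- not part of Cinder or of
    # this patch. It restates the allowed replication_status transitions
    # listed in the commit message.
    ALLOWED_TRANSITIONS = {
        'enabled': {'failed-over', 'disabled', 'error'},
        'error': {'enabled', 'error'},
        'disabled': {'enabled', 'error'},
        'failed-over': {'enabled', 'error'},
    }

    def can_transition(current, target):
        """Return True if replication_status may move from current to target."""
        return target in ALLOWED_TRANSITIONS.get(current, set())

    # With this patch, enable_replication is accepted from 'disabled',
    # 'failed-over', and 'error', so an admin can recover a volume after
    # fixing the underlying problem or after a fail-over.
    assert can_transition('failed-over', 'enabled')
    assert can_transition('error', 'enabled')
    assert not can_transition('enabled', 'enabled')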
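
The ReplicaV2VD docstrings touched above also describe the shape of the dicts a driver is expected to return: replication_enable and replication_disable return database updates with the vendor payload under 'replication_driver_data', while replication_failover returns the updates needed to repoint the volume (for example a new host). A sketch of a hypothetical driver following that contract (the class name, pool, target, and pair values are invented for illustration and are not from any real backend):

    # Hypothetical backend driver, for illustration only. Method names match
    # the ReplicaV2VD interface in cinder/volume/driver.py; the return values
    # and backend details below are assumptions.
    class ExampleReplicationDriver(object):

        def replication_enable(self, context, volume):
            # Start mirroring on the backend, then hand Cinder a db update
            # with the vendor data stored under 'replication_driver_data'.
            return {'replication_status': 'enabled',
                    'replication_driver_data': '{"pair_id": "example-1"}'}

        def replication_disable(self, context, volume):
            # Tear mirroring down; this is called even if the backend already
            # reports replication disabled, so it should be safe to repeat.
            return {'replication_status': 'disabled',
                    'replication_driver_data': '{"pair_id": null}'}

        def replication_failover(self, context, volume, secondary):
            # Promote the secondary and return the db updates needed to
            # repoint the volume; the manager sets 'failed-over' on success.
            return {'host': 'hostname@backend_b#pool',
                    'provider_location': 'iqn.2015-09.org.example:target-1'}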