From fe538dc63de2c9bab21416a4b6c920049442b4e9 Mon Sep 17 00:00:00 2001
From: John Griffith
Date: Thu, 9 Jul 2015 21:11:54 +0000
Subject: [PATCH] Cinder replication V2

This adds a scaled-back replication implementation that leaves the
bulk of the work up to the driver. We just provide basic admin API
methods to do things like enable/disable and fail-over.

Replication targets for a specific backend are now set up and
specified in the cinder.conf file, in the associated driver section.
See the devref doc included in this commit for details on the format.

The next step in configuration is to create a volume-type with
replication info in the extra-specs:

  extra-specs = replication=enabled, volume_backend_name=foo

This instructs the driver to utilize replication. The default
behavior is up to the driver, but the suggestion is one-way
replication; in the case of multiple targets the driver can choose
one or define its own default. If the backend doesn't report
replication=enabled in its stats updates, the scheduler will fail to
place the volume due to invalid host or no hosts available.

Vendors can easily modify extra-specs or their own config settings to
adjust this behavior, and any vendor-unique adaptation can be
provided through the use of scoped keys. Suggested examples will be
published in docs.
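For example, an admin could build such a type with the standard
cinderclient type commands (the type name here is only illustrative):

  cinder type-create replicated-foo
  cinder type-key replicated-foo set replication=enabled \
      volume_backend_name=foo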
See doc/source/devref/replication.rst for more info

Implements BP: replication-v2
DocImpact

Change-Id: I406390e4d5f3c9947df1c4f2de68821e0fd7f75b
---
 cinder/api/contrib/admin_actions.py    |  79 +++++++++
 cinder/tests/unit/policy.json          |   5 +-
 cinder/tests/unit/test_volume.py       |  55 ++++++
 cinder/tests/unit/test_volume_utils.py |   8 +
 cinder/volume/api.py                   | 115 +++++++++++++
 cinder/volume/driver.py                | 197 +++++++++++++++++++++
 cinder/volume/manager.py               | 226 ++++++++++++++++++++++++-
 cinder/volume/rpcapi.py                |  29 +++-
 cinder/volume/utils.py                 |  25 +++
 doc/source/devref/index.rst            |   1 +
 doc/source/devref/replication.rst      | 166 ++++++++++++++++++
 etc/cinder/policy.json                 |   5 +
 12 files changed, 908 insertions(+), 3 deletions(-)
 create mode 100644 doc/source/devref/replication.rst

diff --git a/cinder/api/contrib/admin_actions.py b/cinder/api/contrib/admin_actions.py
index 46bb3fd8a..cb665d287 100644
--- a/cinder/api/contrib/admin_actions.py
+++ b/cinder/api/contrib/admin_actions.py
@@ -255,6 +255,85 @@ class VolumeAdminController(AdminController):
                                                  new_volume, error)
         return {'save_volume_id': ret}
 
+    @wsgi.action('os-enable_replication')
+    def _enable_replication(self, req, id, body):
+        """Enable/re-enable replication on a replication-capable volume.
+
+        Admin only method, used primarily for cases like disabling and
+        re-enabling the replication process on a replicated volume for
+        maintenance or testing.
+        """
+
+        context = req.environ['cinder.context']
+        self.authorize(context, 'enable_replication')
+        try:
+            volume = self._get(context, id)
+        except exception.VolumeNotFound as e:
+            raise exc.HTTPNotFound(explanation=e.msg)
+        self.volume_api.enable_replication(context, volume)
+        return webob.Response(status_int=202)
+
+    @wsgi.action('os-disable_replication')
+    def _disable_replication(self, req, id, body):
+        """Disable replication on a replication-capable volume.
+
+        Admin only method, used to instruct a backend to
+        disable the replication process on a replicated volume.
+        """
+
+        context = req.environ['cinder.context']
+        self.authorize(context, 'disable_replication')
+        try:
+            volume = self._get(context, id)
+        except exception.VolumeNotFound as e:
+            raise exc.HTTPNotFound(explanation=e.msg)
+        self.volume_api.disable_replication(context, volume)
+        return webob.Response(status_int=202)
+
+    @wsgi.action('os-failover_replication')
+    def _failover_replication(self, req, id, body):
+        """Fail over a replicating volume to its secondary.
+
+        Admin only method, used to force a fail-over to
+        a replication target. Optional secondary param to
+        indicate what device to promote in case of multiple
+        replication targets.
+        """
+
+        context = req.environ['cinder.context']
+        self.authorize(context, 'failover_replication')
+        try:
+            volume = self._get(context, id)
+        except exception.VolumeNotFound as e:
+            raise exc.HTTPNotFound(explanation=e.msg)
+        secondary = body['os-failover_replication'].get('secondary', None)
+        self.volume_api.failover_replication(context, volume, secondary)
+        return webob.Response(status_int=202)
+
+    @wsgi.action('os-list_replication_targets')
+    def _list_replication_targets(self, req, id, body):
+        """Show replication targets for the specified volume.
+
+        Admin only method, used to display configured
+        replication target devices for the specified volume.
+
+        """
+
+        # TODO(jdg): We'll want an equivalent type of command
+        # to query a backend host (show configuration for a
+        # specified backend), but priority here is for
+        # a volume as it's likely to be more useful.
+        context = req.environ['cinder.context']
+        self.authorize(context, 'list_replication_targets')
+        try:
+            volume = self._get(context, id)
+        except exception.VolumeNotFound as e:
+            raise exc.HTTPNotFound(explanation=e.msg)
+
+        # Expected response is a dict with unknown keys.
+        # Should be of the form:
+        # {'volume_id': xx, 'targets': [{k: v, k1: v1, ...}]}
+        return self.volume_api.list_replication_targets(context, volume)
+
 
 class SnapshotAdminController(AdminController):
     """AdminController for Snapshots."""
diff --git a/cinder/tests/unit/policy.json b/cinder/tests/unit/policy.json
index e656d0685..0948d3dd0 100644
--- a/cinder/tests/unit/policy.json
+++ b/cinder/tests/unit/policy.json
@@ -34,7 +34,10 @@
     "volume:update_readonly_flag": "",
     "volume:retype": "",
     "volume:copy_volume_to_image": "",
-
+    "volume:enable_replication": "rule:admin_api",
+    "volume:disable_replication": "rule:admin_api",
+    "volume:failover_replication": "rule:admin_api",
+    "volume:list_replication_targets": "rule:admin_api",
     "volume_extension:volume_admin_actions:reset_status": "rule:admin_api",
     "volume_extension:snapshot_admin_actions:reset_status": "rule:admin_api",
     "volume_extension:backup_admin_actions:reset_status": "rule:admin_api",
diff --git a/cinder/tests/unit/test_volume.py b/cinder/tests/unit/test_volume.py
index 07486b627..256c27209 100644
--- a/cinder/tests/unit/test_volume.py
+++ b/cinder/tests/unit/test_volume.py
@@ -5849,6 +5849,61 @@ class GenericVolumeDriverTestCase(DriverTestCase):
                                            volume_file)
             self.assertEqual(i, backup_service.restore.call_count)
 
+    def test_enable_replication_invalid_state(self):
+        volume_api = cinder.volume.api.API()
+        ctxt = context.get_admin_context()
+        volume = tests_utils.create_volume(ctxt,
+                                           size=1,
+                                           host=CONF.host,
+                                           replication_status='enabled')
+
+        self.assertRaises(exception.InvalidVolume,
+                          volume_api.enable_replication,
+                          ctxt, volume)
+
+    def test_enable_replication(self):
+        volume_api = cinder.volume.api.API()
+        ctxt = context.get_admin_context()
+
+        volume = tests_utils.create_volume(self.context,
+                                           size=1,
+                                           host=CONF.host,
+                                           replication_status='disabled')
+        with mock.patch.object(volume_rpcapi.VolumeAPI,
+                               'enable_replication') as mock_enable_rep:
+            volume_api.enable_replication(ctxt, volume)
+            self.assertTrue(mock_enable_rep.called)
+
+    def test_disable_replication_invalid_state(self):
+        volume_api = cinder.volume.api.API()
+        ctxt = context.get_admin_context()
+        volume = tests_utils.create_volume(ctxt,
+                                           size=1,
+                                           host=CONF.host,
+                                           replication_status='invalid-state')
+
+        self.assertRaises(exception.InvalidVolume,
+                          volume_api.disable_replication,
+                          ctxt, volume)
+
+    def test_disable_replication(self):
+        volume_api = cinder.volume.api.API()
+        ctxt = context.get_admin_context()
+
+        volume = tests_utils.create_volume(self.context,
+                                           size=1,
+                                           host=CONF.host,
+                                           replication_status='disabled')
+
+        with mock.patch.object(volume_rpcapi.VolumeAPI,
+                               'disable_replication') as mock_disable_rep:
+            volume_api.disable_replication(ctxt, volume)
+            self.assertTrue(mock_disable_rep.called)
+
+            volume['replication_status'] = 'enabled'
+            volume_api.disable_replication(ctxt, volume)
+            self.assertTrue(mock_disable_rep.called)
+
 
 class LVMISCSIVolumeDriverTestCase(DriverTestCase):
     """Test case for VolumeDriver"""
diff --git a/cinder/tests/unit/test_volume_utils.py b/cinder/tests/unit/test_volume_utils.py
index 5502c33ba..41b1a0add 100644
--- a/cinder/tests/unit/test_volume_utils.py
+++ b/cinder/tests/unit/test_volume_utils.py
@@ -794,3 +794,11 @@ class VolumeUtilsTestCase(test.TestCase):
                                                            mock_db,
                                                            'volume-d8cd1fe')
         self.assertFalse(result)
+
+    def test_convert_config_string_to_dict(self):
+        test_string = "{'key-1'='val-1' 'key-2'='val-2' 'key-3'='val-3'}"
+        expected_dict = {'key-1': 'val-1', 'key-2': 'val-2', 'key-3': 'val-3'}
+
+        self.assertEqual(
+            expected_dict,
+            volume_utils.convert_config_string_to_dict(test_string))
diff --git a/cinder/volume/api.py b/cinder/volume/api.py
index c06895aed..af64a6353 100644
--- a/cinder/volume/api.py
+++ b/cinder/volume/api.py
@@ -1505,6 +1505,121 @@ class API(base.Base):
                  resource=vol_ref)
         return vol_ref
 
+    # Replication V2 methods
+
+    # NOTE(jdg): It might be kinda silly to propagate the named
+    # args with defaults all the way down through rpc into manager
+    # but for now the consistency is useful, and there may be
+    # some usefulness in the future (direct calls in manager?)
+
+    # NOTE(jdg): Relying solely on the volume-type quota mechanism;
+    # need to consider looking at how we handle configured backends
+    # WRT quotas, do they count against normal quotas or not? For
+    # now they're a special resource, so no.
+
+    @wrap_check_policy
+    def enable_replication(self, ctxt, volume):
+
+        # NOTE(jdg): details like sync vs async
+        # and replica count are to be set via the
+        # volume-type and config files.
+
+        # Get a fresh ref from db and check status
+        volume = self.db.volume_get(ctxt, volume['id'])
+
+        # NOTE(jdg): Set the valid statuses as a var to minimize errors
+        # via typos; also use a list, we may want to add to it some day
+
+        # TODO(jdg): Move these up to a global list for each call and ban the
+        # free form typing of states and state checks going forward
+
+        # NOTE(jdg): There may be a need for some backends to allow this
+        # call to the driver regardless of replication_status; most likely
+        # that indicates an issue with the driver, but there might be
+        # cases worth modifying this for in the future.
+        valid_rep_status = ['disabled']
+        rep_status = volume.get('replication_status', valid_rep_status[0])
+
+        if rep_status not in valid_rep_status:
+            msg = (_("Invalid status to enable replication. "
+                     "Valid states are: %(valid_states)s, "
+                     "current replication-state is: %(curr_state)s.") %
+                   {'valid_states': valid_rep_status,
+                    'curr_state': rep_status})
+
+            raise exception.InvalidVolume(reason=msg)
+
+        vref = self.db.volume_update(ctxt,
+                                     volume['id'],
+                                     {'replication_status': 'enabling'})
+        self.volume_rpcapi.enable_replication(ctxt, vref)
+
+    @wrap_check_policy
+    def disable_replication(self, ctxt, volume):
+
+        valid_disable_status = ['disabled', 'enabled']
+
+        # NOTE(jdg): Just use disabled here (item 1 in the list); this
+        # way if someone says disable_rep on a volume that's not being
+        # replicated we just say "ok, done"
+        rep_status = volume.get('replication_status', valid_disable_status[0])
+
+        if rep_status not in valid_disable_status:
+            msg = (_("Invalid status to disable replication. "
+                     "Valid states are: %(valid_states)s, "
+                     "current replication-state is: %(curr_state)s.") %
+                   {'valid_states': valid_disable_status,
+                    'curr_state': rep_status})
+
+            raise exception.InvalidVolume(reason=msg)
+
+        vref = self.db.volume_update(ctxt,
+                                     volume['id'],
+                                     {'replication_status': 'disabling'})
+
+        self.volume_rpcapi.disable_replication(ctxt, vref)
+
+    @wrap_check_policy
+    def failover_replication(self,
+                             ctxt,
+                             volume,
+                             secondary=None):
+
+        # FIXME(jdg): What is the secondary argument?
+        # for managed secondaries that's easy; it's a host
+        # for others, it's tricky; will propose a format for
+        # secondaries that includes an ID/Name that can be
+        # used as a handle
+        valid_failover_status = ['enabled']
+        rep_status = volume.get('replication_status', 'na')
+
+        if rep_status not in valid_failover_status:
+            msg = (_("Invalid status to failover replication. "
" + "valid states are: %(valid_states)s, " + "current replication-state is: %(curr_state)s."), + {'valid_states': valid_failover_status, + 'curr_state': rep_status}) + + raise exception.InvalidVolume(reason=msg) + + vref = self.db.volume_update( + ctxt, + volume['id'], + {'replication_status': 'enabling_secondary'}) + + self.volume_rpcapi.failover_replication(ctxt, + vref, + secondary) + + @wrap_check_policy + def list_replication_targets(self, ctxt, volume): + + # NOTE(jdg): This collects info for the specified volume + # it is NOT an error if the volume is not being replicated + # also, would be worth having something at a backend/host + # level to show an admin how a backend is configured. + return self.volume_rpcapi.list_replication_targets(ctxt, volume) + class HostAPI(base.Base): def __init__(self): diff --git a/cinder/volume/driver.py b/cinder/volume/driver.py index 59983b3a5..7f6c8646e 100644 --- a/cinder/volume/driver.py +++ b/cinder/volume/driver.py @@ -221,6 +221,20 @@ volume_opts = [ help='List of options that control which trace info ' 'is written to the DEBUG log level to assist ' 'developers. Valid values are method and api.'), + cfg.BoolOpt('managed_replication_target', + default=True, + help='There are two types of target configurations ' + 'managed (replicate to another configured backend) ' + 'or unmanaged (replicate to a device not managed ' + 'by Cinder).'), + cfg.ListOpt('replication_devices', + default=None, + help="List of k/v pairs representing a replication target " + "for this backend device. For unmanaged the format " + "is: {'key-1'='val1' 'key-2'='val2'...},{...} " + "and for managed devices its simply a list of valid " + "configured backend_names that the driver supports " + "replicating to: backend-a,bakcend-b...") ] # for backward compatibility @@ -291,6 +305,7 @@ class BaseVD(object): self.configuration.append_config_values(volume_opts) self.configuration.append_config_values(iser_opts) utils.setup_tracing(self.configuration.safe_get('trace_flags')) + self.set_execute(execute) self._stats = {} @@ -1384,6 +1399,187 @@ class ManageableVD(object): pass +@six.add_metaclass(abc.ABCMeta) +class ReplicaV2VD(object): + """Cinder replication functionality. + + The Cinder replication functionality is set up primarily through + the use of volume-types in conjunction with the filter scheduler. + This requires: + 1. The driver reports "replication = True" in it's capabilities + 2. The cinder.conf file includes the valid_replication_devices section + + The driver configuration is expected to take one of the following two + forms, see devref replication docs for details. + + Note we provide cinder.volume.utils.convert_config_string_to_dict + to parse this out into a usable proper dictionary. + + """ + + @abc.abstractmethod + def replication_enable(self, context, volume): + """Enable replication on a replication capable volume. + + If the volume was created on a replication_enabled host this method + is used to re-enable replication for the volume. + + Primarily we only want this for testing/admin purposes. The idea + being that the bulk of the replication details are handled by the + type definition and the driver; however disable/enable(re-enable) is + provided for admins to test or do maintenance which is a + requirement by some cloud-providers. + + NOTE: This is intended as an ADMIN only call and is not + intended to be used by end-user to enable replication. We're + leaving that to volume-type info, this is for things like + maintenance or testing. 
+
+        :param context: security context
+        :param volume: volume object returned by DB
+        :response: {replication_driver_data: vendor-data} DB update
+
+        The replication_driver_data response is vendor-unique
+        data returned/used by the driver. It is expected that
+        the response from the driver is in the appropriate db update
+        format, in the form of a dict, where the vendor data is
+        stored under the key 'replication_driver_data'.
+
+        """
+
+        # TODO(jdg): Put a check in at API layer to verify the host is
+        # replication capable before even issuing this call (can just
+        # check against the volume-type for said volume as well)
+
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def replication_disable(self, context, volume):
+        """Disable replication on the specified volume.
+
+        If the specified volume is currently replication enabled,
+        this method can be used to disable the replication process
+        on the backend.
+
+        Note that we still send this call to a driver whose volume
+        may report replication-disabled already. We do this as a
+        safety mechanism to allow a driver to clean up any mismatch
+        in state between Cinder and itself.
+
+        This is intended as an ADMIN only call to allow for
+        maintenance and testing. If a driver receives this call
+        and the process fails for some reason, the driver should
+        return a status update of "replication_status=disable_failed".
+
+        :param context: security context
+        :param volume: volume object returned by DB
+        :response: {replication_driver_data: vendor-data} DB update
+
+        The replication_driver_data response is vendor-unique
+        data returned/used by the driver. It is expected that
+        the response from the driver is in the appropriate db update
+        format, in the form of a dict, where the vendor data is
+        stored under the key 'replication_driver_data'.
+
+        """
+
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def replication_failover(self, context, volume, secondary):
+        """Force failover to a secondary replication target.
+
+        Forces the failover action of a replicated volume to one of its
+        secondary/target devices. By default the choice of target devices
+        is left up to the driver. In particular we expect one-way
+        replication here, but are providing a mechanism for 'n' way
+        if supported/configured.
+
+        Currently we leave it up to the driver to figure out how/what
+        to do here. Rather than doing things like ID swaps, we instead
+        just let the driver figure out how/where to route things.
+
+        In cases where we might want to drop a volume-service node and
+        the replication target is a configured cinder backend, we'll
+        just update the host column for the volume.
+
+        A very important point here is that in the case of a successful
+        failover, we want to update the replication_status of the
+        volume to "failed-over". This way there's an indication that
+        things worked as expected, and it's evident that the volume
+        may no longer be replicating to another backend (primary burst
+        in to flames). This status will be set by the manager.
+
+        :param context: security context
+        :param volume: volume object returned by DB
+        :param secondary: Specifies rep target to fail over to
+        :response: dict of updates
+
+        So the response would take the form:
+            {host: <valid cinder host string>,
+             model_update: {standard_model_update_KVs},
+             replication_driver_data: xxxxxxx}
+
+        It is expected that these responses are in a format that can
+        be used directly in a db.update call.
+
+        Additionally we utilize exception catching to report back to the
+        manager when things went wrong and to inform the caller on how
+        to proceed.
+
+        """
+
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def list_replication_targets(self, context, vref):
+        """Provide a means to obtain replication targets for a volume.
+
+        This method is used to query a backend to get the current
+        replication config info for the specified volume.
+
+        In the case of a volume that isn't being replicated,
+        the driver should return an empty list.
+
+        Example response for replicating to a managed backend:
+            {'volume_id': volume['id'],
+             'targets':[{'type': 'managed',
+                         'backend_name': 'backend_name'}...]}
+
+        Example response for replicating to an unmanaged backend:
+            {'volume_id': volume['id'],
+             'targets':[{'type': 'unmanaged',
+                         'vendor-key-1': 'value-1'}...]}
+
+        NOTE: It's the responsibility of the driver to mask out any
+        passwords or sensitive information. Also the format of the
+        response allows mixed (managed/unmanaged) targets, even though
+        the first iteration does not support configuring the driver in
+        such a manner.
+
+        """
+
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_replication_updates(self, context):
+        """Provide a means to obtain status updates from backend.
+
+        Provides a concise update for backends to report any errors
+        or problems with replicating volumes. The intent is we only
+        return something here if there's an error or a problem, and to
+        notify where the backend thinks the volume is.
+
+        :param context: context of caller (probably don't need)
+        :returns: [{volid: n, status: ok|error,...}]
+        """
+        # NOTE(jdg): flesh this out with implementations so we all
+        # have something usable here
+        raise NotImplementedError()
+
+
 @six.add_metaclass(abc.ABCMeta)
 class ReplicaVD(object):
     @abc.abstractmethod
@@ -1928,6 +2124,7 @@ class ISCSIDriver(VolumeDriver):
         data["driver_version"] = '1.0'
         data["storage_protocol"] = 'iSCSI'
         data["pools"] = []
+        data["replication_enabled"] = False
 
         self._update_pools_and_stats(data)
 
diff --git a/cinder/volume/manager.py b/cinder/volume/manager.py
index 0edb0d410..fa5a2d1ee 100644
--- a/cinder/volume/manager.py
+++ b/cinder/volume/manager.py
@@ -189,7 +189,7 @@ def locked_snapshot_operation(f):
 class VolumeManager(manager.SchedulerDependentManager):
     """Manages attachable block storage devices."""
 
-    RPC_API_VERSION = '1.26'
+    RPC_API_VERSION = '1.27'
 
     target = messaging.Target(version=RPC_API_VERSION)
 
@@ -405,6 +405,10 @@ class VolumeManager(manager.SchedulerDependentManager):
         self.publish_service_capabilities(ctxt)
 
         # conditionally run replication status task
+
+        # FIXME(jdg): This should go away or be handled differently
+        # if/when we're ready for V2 replication
+
         stats = self.driver.get_volume_stats(refresh=True)
         if stats and stats.get('replication', False):
 
@@ -413,6 +417,7 @@ class VolumeManager(manager.SchedulerDependentManager):
                 self._update_replication_relationship_status(ctxt)
 
             self.add_periodic_task(run_replication_task)
+
         LOG.info(_LI("Driver initialization completed successfully."),
                  resource={'type': 'driver',
                            'id': self.driver.__class__.__name__})
@@ -1538,6 +1543,24 @@ class VolumeManager(manager.SchedulerDependentManager):
             # queue it to be sent to the Schedulers.
             self.update_service_capabilities(volume_stats)
 
+            if volume_stats.get('replication_enabled', False):
+                # replication_status provides a concise update of
+                # replicating volumes and any error conditions
+                # detected by the driver.
+                # The intent is that we don't
+                # expect/worry about updates so long as nothing
+                # changes, but if something goes wrong this is a
+                # handy mechanism to update the manager and the db,
+                # and possibly let the admin/user be notified.
+
+                # TODO(jdg): Refactor the check/update pieces to a
+                # helper method we can share.
+                # We want to leverage some of the same update model
+                # that we have in the targets update call.
+
+                replication_updates = (
+                    self.driver.get_replication_updates(context))
+                for update in replication_updates:
+                    pass
+
     def _append_volume_stats(self, vol_stats):
         pools = vol_stats.get('pools', None)
         if pools and isinstance(pools, list):
@@ -2706,3 +2729,204 @@ class VolumeManager(manager.SchedulerDependentManager):
                                 for key in model_update.iterkeys()}
             self.db.volume_update(ctxt.elevated(),
                                   new_volume['id'],
                                   model_update_new)
+
+    # Replication V2 methods
+    def enable_replication(self, context, volume):
+        """Enable replication on a replication-capable volume.
+
+        If the volume was created on a replication_enabled host, this
+        method is used to enable replication for the volume. Primarily
+        used for testing and maintenance.
+
+        :param context: security context
+        :param volume: volume object returned by DB
+        """
+
+        # NOTE(jdg): We're going to do a fresh get from the DB and verify
+        # that we are in an expected state ('enabling')
+        volume = self.db.volume_get(context, volume['id'])
+        if volume['replication_status'] != 'enabling':
+            raise exception.InvalidVolume()
+
+        try:
+            rep_driver_data = self.driver.replication_enable(context,
+                                                             volume)
+        except exception.CinderException:
+            err_msg = (_("Enable replication for volume failed."))
+            LOG.exception(err_msg, resource=volume)
+            raise exception.VolumeBackendAPIException(data=err_msg)
+        try:
+            if rep_driver_data:
+                volume = self.db.volume_update(context,
+                                               volume['id'],
+                                               rep_driver_data)
+        except exception.CinderException as ex:
+            LOG.exception(_LE("Driver replication data update failed."),
+                          resource=volume)
+            raise exception.VolumeBackendAPIException(reason=ex)
+        self.db.volume_update(context, volume['id'],
+                              {'replication_status': 'enabled'})
+
+    def disable_replication(self, context, volume):
+        """Disable replication on the specified volume.
+
+        If the specified volume is currently replication enabled,
+        this method can be used to disable the replication process
+        on the backend. This method assumes that we checked
+        replication status in the API layer to ensure we should
+        send this call to the driver.
+
+        :param context: security context
+        :param volume: volume object returned by DB
+        """
+
+        volume = self.db.volume_get(context, volume['id'])
+        if volume['replication_status'] != 'disabling':
+            raise exception.InvalidVolume()
+
+        try:
+            rep_driver_data = self.driver.replication_disable(context,
+                                                              volume)
+        except exception.CinderException:
+            err_msg = (_("Disable replication for volume failed."))
+            LOG.exception(err_msg, resource=volume)
+            raise exception.VolumeBackendAPIException(data=err_msg)
+        try:
+            if rep_driver_data:
+                volume = self.db.volume_update(context,
+                                               volume['id'],
+                                               rep_driver_data)
+        except exception.CinderException as ex:
+            LOG.exception(_LE("Driver replication data update failed."),
+                          resource=volume)
+            raise exception.VolumeBackendAPIException(reason=ex)
+        self.db.volume_update(context,
+                              volume['id'],
+                              {'replication_status': 'disabled'})
+
+    def failover_replication(self, context, volume, secondary=None):
+        """Force failover to a secondary replication target.
+
+        Forces the failover action of a replicated volume to one of its
+        secondary/target devices. By default the choice of target devices
+        is left up to the driver. In particular we expect one-way
+        replication here, but are providing a mechanism for 'n' way
+        if supported/configured.
+
+        Currently we leave it up to the driver to figure out how/what
+        to do here. Rather than doing things like ID swaps, we instead
+        just let the driver figure out how/where to route things.
+
+        In cases where we might want to drop a volume-service node and
+        the replication target is a configured cinder backend, we'll
+        just update the host column for the volume.
+
+        :param context: security context
+        :param volume: volume object returned by DB
+        :param secondary: Specifies rep target to fail over to
+        """
+        try:
+            volume_updates = self.driver.replication_failover(context,
+                                                              volume,
+                                                              secondary)
+
+            # volume_updates is a dict containing a report of relevant
+            # items based on the backend and how it operates or what it
+            # needs:
+            #     {'host': 'secondary-configured-cinder-backend',
+            #      'model_update': {'update-all-the-provider-info-etc'},
+            #      'replication_driver_data': 'driver-specific-stuff-for-db'}
+            # Where 'host' is a valid cinder host string like
+            # 'foo@bar#baz'; model_update and replication_driver_data
+            # are required.
+
+        except exception.CinderException:
+
+            # FIXME(jdg): We need to create a few different exceptions here
+            # and handle each differently:
+            # 1. I couldn't fail over, but the original setup is ok, so
+            #    proceed as if this were never called
+            # 2. I ran into a problem and I have no idea what state things
+            #    are in, so set volume to error
+            # 3. I ran into a problem and a human needs to come fix me up
+
+            err_msg = (_("Replication failover for volume failed."))
+            LOG.exception(err_msg, resource=volume)
+            self.db.volume_update(context,
+                                  volume['id'],
+                                  {'replication_status': 'error'})
+            raise exception.VolumeBackendAPIException(data=err_msg)
+
+        # TODO(jdg): Come back and condense these into a single update
+        update = {}
+        model_update = volume_updates.get('model_update', None)
+        driver_update = volume_updates.get('replication_driver_data', None)
+        host_update = volume_updates.get('host', None)
+
+        if model_update:
+            update['model'] = model_update
+        if driver_update:
+            update['replication_driver_data'] = driver_update
+        if host_update:
+            update['host'] = host_update
+
+        if update:
+            try:
+                volume = self.db.volume_update(
+                    context,
+                    volume['id'],
+                    update)
+
+            except exception.CinderException as ex:
+                LOG.exception(_LE("Driver replication data update failed."),
+                              resource=volume)
+                raise exception.VolumeBackendAPIException(reason=ex)
+
+        # NOTE(jdg): We're setting replication status to failed-over,
+        # which indicates the volume is ok and things went as expected,
+        # but we're likely no longer replicating because... well, we
+        # did a fail-over. In the case of an admin bringing the primary
+        # back online, he/she can use enable_replication to get this
+        # state set back to enabled.
+
+        # Also, in the case of multiple targets, the driver can update
+        # status in the rep-status checks if it still has valid replication
+        # targets that the volume is being replicated to.
+
+        self.db.volume_update(context,
+                              volume['id'],
+                              {'replication_status': 'failed-over'})
+
+    def list_replication_targets(self, context, volume):
+        """Provide a means to obtain replication targets for a volume.
+
+        This method is used to query a backend to get the current
+        replication config info for the specified volume.
+
+        In the case of a volume that isn't being replicated,
+        the driver should return an empty list.
+
+        Example response for replicating to a managed backend:
+            {'volume_id': volume['id'],
+             'targets':[{'managed_host': 'backend_name'}...]}
+
+        Example response for replicating to an unmanaged backend:
+            {'volume_id': volume['id'],
+             'targets':[{'san_ip': '1.1.1.1',
+                         'san_login': 'admin'},
+                        ....]}
+
+        NOTE: It's the responsibility of the driver to mask out any
+        passwords or sensitive information.
+
+        """
+
+        try:
+            replication_targets = self.driver.list_replication_targets(
+                context, volume)
+
+        except exception.CinderException:
+            err_msg = (_("Get replication targets failed."))
+            LOG.exception(err_msg)
+            raise exception.VolumeBackendAPIException(data=err_msg)
+
+        return replication_targets
diff --git a/cinder/volume/rpcapi.py b/cinder/volume/rpcapi.py
index e980b46a4..ab3c17180 100644
--- a/cinder/volume/rpcapi.py
+++ b/cinder/volume/rpcapi.py
@@ -72,6 +72,7 @@ class VolumeAPI(object):
         1.26 - Adds support for sending objects over RPC in
                create_consistencygroup(),
                create_consistencygroup_from_src(),
                update_consistencygroup() and delete_consistencygroup().
+        1.27 - Adds support for replication V2
     """
 
     BASE_RPC_API_VERSION = '1.0'
@@ -81,7 +82,7 @@ class VolumeAPI(object):
         target = messaging.Target(topic=CONF.volume_topic,
                                   version=self.BASE_RPC_API_VERSION)
         serializer = objects_base.CinderObjectSerializer()
-        self.client = rpc.get_client(target, '1.26', serializer=serializer)
+        self.client = rpc.get_client(target, '1.27', serializer=serializer)
 
     def create_consistencygroup(self, ctxt, group, host):
         new_host = utils.extract_host(host)
@@ -260,3 +261,29 @@ class VolumeAPI(object):
                   volume=volume,
                   new_volume=new_volume,
                   volume_status=original_volume_status)
+
+    def enable_replication(self, ctxt, volume):
+        new_host = utils.extract_host(volume['host'])
+        cctxt = self.client.prepare(server=new_host, version='1.27')
+        cctxt.cast(ctxt, 'enable_replication', volume=volume)
+
+    def disable_replication(self, ctxt, volume):
+        new_host = utils.extract_host(volume['host'])
+        cctxt = self.client.prepare(server=new_host, version='1.27')
+        cctxt.cast(ctxt, 'disable_replication',
+                   volume=volume)
+
+    def failover_replication(self,
+                             ctxt,
+                             volume,
+                             secondary=None):
+        new_host = utils.extract_host(volume['host'])
+        cctxt = self.client.prepare(server=new_host, version='1.27')
+        cctxt.cast(ctxt, 'failover_replication',
+                   volume=volume,
+                   secondary=secondary)
+
+    def list_replication_targets(self, ctxt, volume):
+        new_host = utils.extract_host(volume['host'])
+        cctxt = self.client.prepare(server=new_host, version='1.27')
+        return cctxt.call(ctxt, 'list_replication_targets', volume=volume)
diff --git a/cinder/volume/utils.py b/cinder/volume/utils.py
index c2d297bbd..893cf3bef 100644
--- a/cinder/volume/utils.py
+++ b/cinder/volume/utils.py
@@ -15,6 +15,7 @@
 
 """Volume-related Utilities and helpers."""
 
+import ast
 import math
 import re
 import uuid
@@ -569,3 +570,27 @@ def check_already_managed_volume(db, vol_name):
     except (exception.VolumeNotFound, ValueError):
         return False
     return False
+
+
+def convert_config_string_to_dict(config_string):
+    """Convert config file replication string to a dict.
+
+    The only supported form is as follows:
+        "{'key-1'='val-1' 'key-2'='val-2'...}"
+
+    :param config_string: Properly formatted string to convert to dict.
+    :response: dict of string values
+    """
+
+    resultant_dict = {}
+
+    try:
+        st = config_string.replace("=", ":")
+        st = st.replace(" ", ", ")
+        resultant_dict = ast.literal_eval(st)
+    except Exception:
+        LOG.warning(_LW("Error encountered translating config_string: "
+                        "%(config_string)s to dict"),
+                    {'config_string': config_string})
+
+    return resultant_dict
diff --git a/doc/source/devref/index.rst b/doc/source/devref/index.rst
index 00509ebae..86e64b578 100644
--- a/doc/source/devref/index.rst
+++ b/doc/source/devref/index.rst
@@ -31,6 +31,7 @@ Programming HowTos and Tutorials
    addmethod.openstackapi
    drivers
    gmr
+   replication
 
 
 Background Concepts for Cinder
diff --git a/doc/source/devref/replication.rst b/doc/source/devref/replication.rst
new file mode 100644
index 000000000..fd7fc41e3
--- /dev/null
+++ b/doc/source/devref/replication.rst
@@ -0,0 +1,166 @@
+Replication
+===========
+
+How to implement replication features in a backend driver.
+
+For backend devices that offer replication features, Cinder
+provides a common mechanism for exposing that functionality
+on a per-volume basis while still trying to allow
+flexibility for the varying implementations and requirements
+of all the different backend devices.
+
+Most of the configuration is done via the cinder.conf file
+under the driver section and through the use of volume types.
+
+Config file examples
+--------------------
+
+The cinder.conf file is used to specify replication target
+devices for a specific driver. There are two types of target
+devices that can be configured:
+
+   1. Cinder Managed (represented by the volume-backend name)
+   2. External devices (require vendor-specific data to configure)
+
+Note that it is an error to have both managed and unmanaged replication
+config variables set for a single driver.
+
+Cinder managed target device
+-----------------------------
+
+In the case of a Cinder managed target device, we simply
+use another Cinder configured backend as the replication
+target.
+
+For example, if we have two backend devices foo and biz that
+can replicate to each other, we can set up backend biz as
+a replication target for device foo using the following
+config entries::
+
+    .....
+    [driver-biz]
+    volume_driver=xxxx
+    volume_backend_name=biz
+
+    [driver-foo]
+    volume_driver=xxxx
+    volume_backend_name=foo
+    managed_replication_target=True
+    replication_devices=volume_backend_name-1,volume_backend_name-2....
+
+Notice that the only change from the usual driver configuration
+section here is the addition of the replication_devices option.
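+
+For the example to be complete, both sections also have to be enabled
+as backends in the usual way; a minimal sketch, reusing the section
+names from the example above::
+
+    [DEFAULT]
+    enabled_backends=driver-biz,driver-foo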
+
+
+Unmanaged target device
+------------------------
+
+In some cases the replication target device may not be a
+configured Cinder backend. In this case it's the configured
+driver's responsibility to route commands to the active device
+and to update provider info to ensure the proper iSCSI targets
+are being used.
+
+This type of config changes only slightly, and instead of using
+a backend_name, it takes the vendor-unique config options::
+
+    .....
+    [driver-foo]
+    volume_driver=xxxx
+    volume_backend_name=foo
+    managed_replication_target=False
+    replication_devices={'key1'='val1' 'key2'='val2' ...},
+                        {'key7'='val7'....},...
+
+Note the key/value entries can be whatever the device requires; we treat
+the actual variable in the config parser as a comma-delimited list, the
+{} and = notations are convenient/common parser delimiters, and the K/V
+entries are space-separated.
+
+We provide a literal evaluator to convert these entries into a proper
+dict, so the format is extremely important here.
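+
+For instance, the first entry in the unmanaged example above should come
+out of cinder.volume.utils.convert_config_string_to_dict roughly as
+follows (a sketch; the keys are just the placeholders from the example)::
+
+    from cinder.volume import utils as volume_utils
+
+    device = volume_utils.convert_config_string_to_dict(
+        "{'key1'='val1' 'key2'='val2'}")
+    # device == {'key1': 'val1', 'key2': 'val2'}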
+
+
+Volume Types / Extra Specs
+---------------------------
+In order for a user to specify they'd like a replicated volume, there needs
+to be a corresponding Volume Type created by the Cloud Administrator.
+
+There's a good deal of flexibility by using volume types. The scheduler can
+send the create request to a backend that provides replication by simply
+providing the replication=enabled key to the extra-specs of the volume type.
+
+For example, if the type should simply create the volume on any backend
+that supports replication (or if you only have one), the extra-specs entry
+would be::
+
+    {replication: enabled}
+
+If you need to specify a particular backend device (multiple backends
+supporting replication)::
+
+    {replication: enabled, volume_backend_name: foo}
+
+Additionally you could provide further details using scoped keys::
+
+    {replication: enabled, volume_backend_name: foo,
+     replication:replication_type: async}
+
+Again, it's up to the driver to parse the volume-type info on create and
+set things up as requested. While the scoping key can be anything, it's
+strongly recommended that all backends utilize the same key (replication)
+for consistency and to make things easier for the Cloud Administrator.
+
+Capabilities reporting
+----------------------
+The following entries are expected to be added to the stats/capabilities
+update for replication configured devices::
+
+    stats["replication_enabled"] = True|False
+    stats["replication_type"] = ['async', 'sync'...]
+    stats["replication_count"] = len(self.cluster_pairs)
+
+Required methods
+-----------------
+The number of API methods associated with replication is intentionally
+very limited, and they are Admin only methods.
+
+They include::
+
+    replication_enable(self, context, volume)
+    replication_disable(self, context, volume)
+    replication_failover(self, context, volume, secondary)
+    list_replication_targets(self, context, volume)
+
+**replication_enable**
+
+Used to notify the driver that we would like to enable replication on a
+replication-capable volume. NOTE this is NOT used as the initial create
+replication command; that's handled by the volume-type at create time.
+This is provided as a method for an Admin that may have needed to disable
+replication on a volume for maintenance or some other reason, to signify
+that they'd like to "resume" replication on the given volume.
+
+**replication_disable**
+
+Used to notify the driver that we would like to disable replication on a
+replication-capable volume. This again would be used by a Cloud
+Administrator for things like maintenance etc.
+
+**replication_failover**
+
+Used to instruct the backend to fail over to the secondary/target device
+on a replication-capable volume. This may be used for triggering a
+fail-over manually or for testing purposes.
+
+Note that ideally drivers will know how to update the volume reference
+properly so that Cinder now points to the secondary. Also, while it's not
+required at this time, ideally the command would act as a toggle, allowing
+the admin to switch back and forth between primary and secondary and back
+to primary.
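+
+As a sketch only (the keys mirror the response documented in the manager
+code; all values here are placeholders), a driver's failover might return
+something like::
+
+    def replication_failover(self, context, volume, secondary):
+        # ... vendor-specific promotion of the secondary device ...
+        return {'host': 'cinder@backend-biz#pool',
+                'model_update': {'provider_location': new_location},
+                'replication_driver_data': new_driver_data}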
+
+**list_replication_targets**
+
+Used by the admin to query a volume for a list of configured replication
+targets. The return for this call is expected to mimic the form used in
+the config file.
+
+For a volume replicating to managed replication targets::
+
+    {'volume_id': volume['id'], 'targets':[{'type': 'managed',
+                                            'backend_name': 'backend_name'}...]}
+
+For a volume replicating to external/unmanaged targets::
+
+    {'volume_id': volume['id'], 'targets':[{'type': 'unmanaged',
+                                            'san_ip': '127.0.0.1',
+                                            'san_login': 'admin'...}...]}
+
diff --git a/etc/cinder/policy.json b/etc/cinder/policy.json
index 5a520c528..7bbe49753 100644
--- a/etc/cinder/policy.json
+++ b/etc/cinder/policy.json
@@ -64,6 +64,11 @@
     "volume_extension:replication:promote": "rule:admin_api",
     "volume_extension:replication:reenable": "rule:admin_api",
 
+    "volume:enable_replication": "rule:admin_api",
+    "volume:disable_replication": "rule:admin_api",
+    "volume:failover_replication": "rule:admin_api",
+    "volume:list_replication_targets": "rule:admin_api",
+
     "backup:create" : "",
     "backup:delete": "",
     "backup:get": "",
-- 
2.45.2