From 05a516da01225bed8b99ca49e558d40d71df3fe1 Mon Sep 17 00:00:00 2001
From: LisaLi <xiaoyan.li@intel.com>
Date: Fri, 19 Feb 2016 09:28:36 +0100
Subject: [PATCH] Scalable backup service - Liberty compatibility

To support rolling upgrades we need to make sure that Mitaka's services
are running fine with Liberty's. It gets complicated with backups as
we've strongly reworked them. Main difference is that Mitaka c-bak can
handle backup/restore of any volume and Liberty was restricted to
operate only on volumes placed on the same node.

Now when running in version heterogeneous environment we need to use old
way of backup jobs scheduling and switch to new one (round robin) only
when everything is running Mitaka.

This commit implements that by adding a dummy backup RPC API version
(1.3) that marks the beginning of scalable backups era. Jobs are
scheduled the new way only if every c-bak reports that (or higher)
version.

There are also small changes to volume.rpcapi - to fail fast if some
c-vol services aren't supporting new calls required by scalable backups
feature. This allows us to error out backups with proper message when
upgrade was done in an improper way (in Mitaka we require c-vols to be
upgraded before c-baks).

This commit also includes small changes to CinderObjectSerializer to
block tries to "forwardport" an object when sending it over RPC. If a
service receives an older object it should handle it explicitly.

Related-Blueprint: scalable-backup-service
Co-Authored-By: Michal Dulko <michal.dulko@intel.com>
Change-Id: I45324336ba00726d53cfa012e8bd498868919a8c
---
 cinder/backup/api.py                          | 46 +++++++++++++++++--
 cinder/backup/manager.py                      |  2 +-
 cinder/backup/rpcapi.py                       |  5 +-
 cinder/exception.py                           |  4 ++
 cinder/objects/base.py                        | 17 ++++++-
 cinder/tests/unit/test_backup.py              | 45 ++++++++++++++++++
 cinder/tests/unit/test_volume_rpcapi.py       | 18 +++++++-
 cinder/volume/rpcapi.py                       | 12 +++++
 ...aling-backup-service-7e5058802d2fb3dc.yaml |  8 ++++
 9 files changed, 146 insertions(+), 11 deletions(-)
 create mode 100644 releasenotes/notes/scaling-backup-service-7e5058802d2fb3dc.yaml

diff --git a/cinder/backup/api.py b/cinder/backup/api.py
index 8dfd078cc..9ac21ebab 100644
--- a/cinder/backup/api.py
+++ b/cinder/backup/api.py
@@ -25,6 +25,7 @@ from oslo_config import cfg
 from oslo_log import log as logging
 from oslo_utils import excutils
 from oslo_utils import strutils
+from oslo_utils import versionutils
 from pytz import timezone
 import random
 
@@ -39,6 +40,7 @@ import cinder.policy
 from cinder import quota
 from cinder import utils
 import cinder.volume
+from cinder.volume import utils as volume_utils
 
 backup_api_opts = [
     cfg.BoolOpt('backup_use_same_backend',
@@ -135,6 +137,24 @@ class API(base.Base):
 
         return backups
 
+    def _is_scalable_only(self):
+        """True if we're running in deployment where all c-bak are scalable.
+
+        We need this method to decide if we can assume that all of our c-bak
+        services are decoupled from c-vol.
+
+        FIXME(dulek): This shouldn't be needed in Newton.
+        """
+        cap = self.backup_rpcapi.client.version_cap
+        if cap:
+            cap = versionutils.convert_version_to_tuple(cap)
+            return cap >= (1, 3)  # Mitaka is marked by c-bak 1.3+.
+        else:
+            # NOTE(dulek): No version cap means we're running in an environment
+            # without c-bak services. Letting it pass as Mitaka, request will
+            # just fail anyway so it doesn't really matter.
+            return True
+
     def _az_matched(self, service, availability_zone):
         return ((not availability_zone) or
                 service.availability_zone == availability_zone)
@@ -170,14 +190,29 @@ class API(base.Base):
             idx = idx + 1
         return None
 
-    def _get_available_backup_service_host(self, host, availability_zone):
+    def _get_available_backup_service_host(self, host, az, volume_host=None):
         """Return an appropriate backup service host."""
+
+        # FIXME(dulek): We need to keep compatibility with Liberty, where c-bak
+        # were coupled with c-vol. If we're running in mixed Liberty-Mitaka
+        # environment we will be scheduling backup jobs the old way.
+        #
+        # This snippet should go away in Newton. Note that volume_host
+        # parameter will also be unnecessary then.
+        if not self._is_scalable_only():
+            if volume_host and self._is_backup_service_enabled(az,
+                                                               volume_host):
+                return volume_host
+            elif host and self._is_backup_service_enabled(az, host):
+                return host
+            else:
+                raise exception.ServiceNotFound(service_id='cinder-backup')
+
         backup_host = None
-        if host and self._is_backup_service_enabled(availability_zone, host):
+        if host and self._is_backup_service_enabled(az, host):
             backup_host = host
         if not backup_host and (not host or CONF.backup_use_same_backend):
-            backup_host = self._get_any_available_backup_service(
-                availability_zone)
+            backup_host = self._get_any_available_backup_service(az)
         if not backup_host:
             raise exception.ServiceNotFound(service_id='cinder-backup')
         return backup_host
@@ -225,7 +260,8 @@ class API(base.Base):
 
         previous_status = volume['status']
         host = self._get_available_backup_service_host(
-            None, volume.availability_zone)
+            None, volume.availability_zone,
+            volume_utils.extract_host(volume.host, 'host'))
 
         # Reserve a quota before setting volume status and backup status
         try:
diff --git a/cinder/backup/manager.py b/cinder/backup/manager.py
index 3aa486d8b..abca503aa 100644
--- a/cinder/backup/manager.py
+++ b/cinder/backup/manager.py
@@ -81,7 +81,7 @@ QUOTAS = quota.QUOTAS
 class BackupManager(manager.SchedulerDependentManager):
     """Manages backup of block storage devices."""
 
-    RPC_API_VERSION = '1.2'
+    RPC_API_VERSION = '1.3'
 
     target = messaging.Target(version=RPC_API_VERSION)
 
diff --git a/cinder/backup/rpcapi.py b/cinder/backup/rpcapi.py
index 9ed3ccacc..245cc07b8 100644
--- a/cinder/backup/rpcapi.py
+++ b/cinder/backup/rpcapi.py
@@ -35,9 +35,12 @@ class BackupAPI(rpc.RPCAPI):
 
         1.0 - Initial version.
         1.1 - Changed methods to accept backup objects instead of IDs.
+        1.2 - A version that got in by mistake (without breaking anything).
+        1.3 - Dummy version bump to mark start of having cinder-backup service
+              decoupled from cinder-volume.
     """
 
-    RPC_API_VERSION = '1.1'
+    RPC_API_VERSION = '1.3'
     TOPIC = CONF.backup_topic
     BINARY = 'cinder-backup'
 
diff --git a/cinder/exception.py b/cinder/exception.py
index 88b3c3da9..db01e8276 100644
--- a/cinder/exception.py
+++ b/cinder/exception.py
@@ -351,6 +351,10 @@ class ServiceNotFound(NotFound):
     message = _("Service %(service_id)s could not be found.")
 
 
+class ServiceTooOld(Invalid):
+    message = _("Service is too old to fulfil this request.")
+
+
 class HostNotFound(NotFound):
     message = _("Host %(host)s could not be found.")
 
diff --git a/cinder/objects/base.py b/cinder/objects/base.py
index 90174fb6d..78d5b8002 100644
--- a/cinder/objects/base.py
+++ b/cinder/objects/base.py
@@ -18,6 +18,7 @@ import contextlib
 import datetime
 
 from oslo_log import log as logging
+from oslo_utils import versionutils
 from oslo_versionedobjects import base
 from oslo_versionedobjects import fields
 
@@ -394,8 +395,20 @@ class CinderObjectSerializer(base.VersionedObjectSerializer):
 
     def _get_capped_obj_version(self, obj):
         objname = obj.obj_name()
-        objver = OBJ_VERSIONS.get(self.version_cap, {})
-        return objver.get(objname, None)
+        version_dict = OBJ_VERSIONS.get(self.version_cap, {})
+        version_cap = version_dict.get(objname, None)
+
+        if version_cap:
+            cap_tuple = versionutils.convert_version_to_tuple(version_cap)
+            obj_tuple = versionutils.convert_version_to_tuple(obj.VERSION)
+            if cap_tuple > obj_tuple:
+                # NOTE(dulek): Do not set version cap to be higher than actual
+                # object version as we don't support "forwardporting" of
+                # objects. If service will receive an object that's too old it
+                # should handle it explicitly.
+                version_cap = None
+
+        return version_cap
 
     def serialize_entity(self, context, entity):
         if isinstance(entity, (tuple, list, set, dict)):
diff --git a/cinder/tests/unit/test_backup.py b/cinder/tests/unit/test_backup.py
index 0796911ad..bbbf9a798 100644
--- a/cinder/tests/unit/test_backup.py
+++ b/cinder/tests/unit/test_backup.py
@@ -508,6 +508,26 @@ class BackupTestCase(BaseBackupTest):
         self.assertEqual(fields.BackupStatus.ERROR, backup['status'])
         self.assertTrue(mock_run_backup.called)
 
+    @mock.patch('cinder.utils.brick_get_connector_properties')
+    @mock.patch('cinder.utils.temporary_chown')
+    @mock.patch('six.moves.builtins.open')
+    def test_create_backup_old_volume_service(self, mock_open,
+                                              mock_temporary_chown,
+                                              mock_get_backup_device):
+        """Test error handling when there's too old volume service in env."""
+        vol_id = self._create_volume_db_entry(size=1)
+        backup = self._create_backup_db_entry(volume_id=vol_id)
+
+        with mock.patch.object(self.backup_mgr.volume_rpcapi.client,
+                               'version_cap', '1.37'):
+            self.assertRaises(exception.ServiceTooOld,
+                              self.backup_mgr.create_backup, self.ctxt, backup)
+            vol = db.volume_get(self.ctxt, vol_id)
+            self.assertEqual('available', vol['status'])
+            self.assertEqual('error_backing-up', vol['previous_status'])
+            backup = db.backup_get(self.ctxt, backup.id)
+            self.assertEqual(fields.BackupStatus.ERROR, backup['status'])
+
     @mock.patch('cinder.utils.brick_get_connector_properties')
     @mock.patch('cinder.volume.rpcapi.VolumeAPI.get_backup_device')
     @mock.patch('cinder.utils.temporary_chown')
@@ -619,6 +639,31 @@ class BackupTestCase(BaseBackupTest):
         self.assertEqual(fields.BackupStatus.AVAILABLE, backup['status'])
         self.assertTrue(mock_run_restore.called)
 
+    @mock.patch('cinder.utils.brick_get_connector_properties')
+    def test_restore_backup_with_old_volume_service(self, mock_get_conn):
+        """Test error handling when an error occurs during backup restore."""
+        vol_id = self._create_volume_db_entry(status='restoring-backup',
+                                              size=1)
+        backup = self._create_backup_db_entry(
+            status=fields.BackupStatus.RESTORING, volume_id=vol_id)
+
+        # Unmock secure_file_operations_enabled
+        self.volume_patches['secure_file_operations_enabled'].stop()
+
+        with mock.patch.object(self.backup_mgr.volume_rpcapi.client,
+                               'version_cap', '1.37'):
+            self.assertRaises(exception.ServiceTooOld,
+                              self.backup_mgr.restore_backup,
+                              self.ctxt,
+                              backup,
+                              vol_id)
+            vol = db.volume_get(self.ctxt, vol_id)
+            self.assertEqual('error_restoring', vol['status'])
+            backup = db.backup_get(self.ctxt, backup.id)
+            self.assertEqual(fields.BackupStatus.AVAILABLE, backup['status'])
+
+        self.volume_patches['secure_file_operations_enabled'].start()
+
     def test_restore_backup_with_bad_service(self):
         """Test error handling.
 
diff --git a/cinder/tests/unit/test_volume_rpcapi.py b/cinder/tests/unit/test_volume_rpcapi.py
index 04119477e..aa34d7eb1 100644
--- a/cinder/tests/unit/test_volume_rpcapi.py
+++ b/cinder/tests/unit/test_volume_rpcapi.py
@@ -24,6 +24,7 @@ from oslo_serialization import jsonutils
 
 from cinder import context
 from cinder import db
+from cinder import exception
 from cinder import objects
 from cinder import test
 from cinder.tests.unit import fake_backup
@@ -599,15 +600,28 @@ class VolumeRpcAPITestCase(test.TestCase):
                               volume=self.fake_volume,
                               version='1.30')
 
-    def test_get_backup_device(self):
+    @mock.patch('oslo_messaging.RPCClient.can_send_version', return_value=True)
+    def test_get_backup_device(self, mock_can_send_version):
         self._test_volume_api('get_backup_device',
                               rpc_method='call',
                               backup=self.fake_backup_obj,
                               volume=self.fake_volume_obj,
                               version='1.38')
 
-    def test_secure_file_operations_enabled(self):
+        mock_can_send_version.return_value = False
+        self.assertRaises(exception.ServiceTooOld, self._test_volume_api,
+                          'get_backup_device', rpc_method='call',
+                          backup=self.fake_backup_obj,
+                          volume=self.fake_volume_obj, version='1.38')
+
+    @mock.patch('oslo_messaging.RPCClient.can_send_version', return_value=True)
+    def test_secure_file_operations_enabled(self, mock_can_send_version):
         self._test_volume_api('secure_file_operations_enabled',
                               rpc_method='call',
                               volume=self.fake_volume_obj,
                               version='1.38')
+
+        mock_can_send_version.return_value = False
+        self.assertRaises(exception.ServiceTooOld, self._test_volume_api,
+                          'secure_file_operations_enabled', rpc_method='call',
+                          volume=self.fake_volume_obj, version='1.38')
diff --git a/cinder/volume/rpcapi.py b/cinder/volume/rpcapi.py
index 238216cac..8bf81eae5 100644
--- a/cinder/volume/rpcapi.py
+++ b/cinder/volume/rpcapi.py
@@ -19,6 +19,8 @@ Client side of the volume RPC API.
 from oslo_config import cfg
 from oslo_serialization import jsonutils
 
+from cinder import exception
+from cinder.i18n import _
 from cinder import quota
 from cinder import rpc
 from cinder.volume import utils
@@ -334,12 +336,22 @@ class VolumeAPI(rpc.RPCAPI):
         return cctxt.call(ctxt, 'get_capabilities', discover=discover)
 
     def get_backup_device(self, ctxt, backup, volume):
+        if not self.client.can_send_version('1.38'):
+            msg = _('One of cinder-volume services is too old to accept such '
+                    'request. Are you running mixed Liberty-Mitaka '
+                    'cinder-volumes?')
+            raise exception.ServiceTooOld(msg)
         new_host = utils.extract_host(volume.host)
         cctxt = self.client.prepare(server=new_host, version='1.38')
         return cctxt.call(ctxt, 'get_backup_device',
                           backup=backup)
 
     def secure_file_operations_enabled(self, ctxt, volume):
+        if not self.client.can_send_version('1.38'):
+            msg = _('One of cinder-volume services is too old to accept such '
+                    'request. Are you running mixed Liberty-Mitaka '
+                    'cinder-volumes?')
+            raise exception.ServiceTooOld(msg)
         new_host = utils.extract_host(volume.host)
         cctxt = self.client.prepare(server=new_host, version='1.38')
         return cctxt.call(ctxt, 'secure_file_operations_enabled',
diff --git a/releasenotes/notes/scaling-backup-service-7e5058802d2fb3dc.yaml b/releasenotes/notes/scaling-backup-service-7e5058802d2fb3dc.yaml
new file mode 100644
index 000000000..8a8e49b2b
--- /dev/null
+++ b/releasenotes/notes/scaling-backup-service-7e5058802d2fb3dc.yaml
@@ -0,0 +1,8 @@
+---
+features:
+  - cinder-backup service is now decoupled from
+    cinder-volume, which allows more flexible scaling.
+upgrade:
+  - As cinder-backup was strongly reworked in this
+    release, the recommended upgrade order when executing
+    live (rolling) upgrade is c-api->c-sch->c-vol->c-bak.
-- 
2.45.2