From: Zhiteng Huang
Date: Fri, 25 Jan 2013 18:18:03 +0000 (+0800)
Subject: Allow create_volume() to retry when an exception happens
X-Git-Url: https://review.fuel-infra.org/gitweb?a=commitdiff_plain;h=d17cc23c648f98d0764748e9007ff4b8742e6572;p=openstack-build%2Fcinder-build.git

Allow create_volume() to retry when an exception happens

Certain volume back-ends cannot easily report a simple
total_capacity_gb/free_capacity_gb because of their internal implementation
complexity, so the scheduler lets back-ends that report unclear capacity pass
the capacity filter. As a result, a create_volume() request may still fail on
the chosen back-end. More generally, whenever a back-end fails to serve a
create_volume() request for any reason, it is useful to have a mechanism to
retry the request elsewhere.

The idea is that when the volume manager catches an exception from the
driver.create_volume() call, it checks whether the request is allowed to be
rescheduled (requests that are not: clone volume, and create volume from
snapshot while the 'snapshot_same_host' option is true). If so, it composes a
new request back to the scheduler with additional information marking that
this specific back-end has already been tried, so the scheduler can skip it
if needed. The scheduler (filter scheduler only; the simple and chance
schedulers are not supported) is updated so that it retries at most
scheduler_max_attempts times. To rule out previously tried back-ends in the
next scheduling pass, a new RetryFilter is added.

Changes:
1) The volume RPC API create_volume() gains new parameters that carry the
   original request information in case rescheduling is needed. This bumps
   the volume RPC API to 1.4.
2) The create_volume() method in the volume API is refactored to distinguish
   whether a request is allowed to reschedule (currently, create volume from
   a source volume bypasses the scheduler, so no rescheduling is allowed).
3) Reschedule support is added to create_volume() in the volume manager so
   that it can send the request back to the scheduler.
4) A scheduler_max_attempts config option is added in scheduler/driver.py.
5) A RetryFilter is added.
6) The scheduler_driver default option is changed to FilterScheduler.

Change-Id: Ia46b5eb4dc033d73734b6aea82ada34ba5731075
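To make the retry bookkeeping concrete before reading the patch, the following
is a minimal standalone Python sketch of the flow it implements. It is
illustrative only: populate_retry(), host_passes() and MAX_SCHEDULE_ATTEMPTS
are hypothetical stand-ins for the patch's _populate_retry(),
RetryFilter.host_passes() and the scheduler_max_attempts option, and
RuntimeError stands in for exception.NoValidHost.

    # retry_sketch.py -- illustrative only; not the actual Cinder classes.
    MAX_SCHEDULE_ATTEMPTS = 3   # counterpart of the new scheduler_max_attempts option


    def populate_retry(filter_properties, volume_id,
                       max_attempts=MAX_SCHEDULE_ATTEMPTS):
        """Track how many times this request has been scheduled."""
        if max_attempts == 1:
            return  # re-scheduling disabled; leave filter_properties untouched
        retry = filter_properties.pop('retry', {})
        if retry:
            retry['num_attempts'] += 1
        else:
            retry = {'num_attempts': 1, 'hosts': []}  # hosts already tried
        filter_properties['retry'] = retry
        if retry['num_attempts'] > max_attempts:
            # the real scheduler raises exception.NoValidHost here
            raise RuntimeError('Exceeded max scheduling attempts for volume %s'
                               % volume_id)


    def host_passes(host, filter_properties):
        """RetryFilter-style check: skip hosts recorded by earlier attempts."""
        retry = filter_properties.get('retry')
        if not retry:
            return True  # no retry info means re-scheduling is disabled
        return host not in retry.get('hosts', [])


    # Attempt 1: the scheduler picks host1 and records it; the back-end then
    # fails, and the volume manager sends the request back to the scheduler
    # with the same filter_properties.
    props = {}
    populate_retry(props, 'vol-1')
    assert host_passes('host1', props)
    props['retry']['hosts'].append('host1')   # what _add_retry_host() does

    # Attempt 2: host1 is now filtered out; other back-ends remain eligible.
    populate_retry(props, 'vol-1')
    assert not host_passes('host1', props)
    assert host_passes('host2', props)

The same retry dict travels with the request over RPC, which is why the filter
scheduler below strips the non-serializable context from filter_properties
before casting create_volume() to the volume service.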
---
diff --git a/cinder/scheduler/driver.py b/cinder/scheduler/driver.py index f7d72983f..a5329ff0e 100644 --- a/cinder/scheduler/driver.py +++ b/cinder/scheduler/driver.py @@ -33,7 +33,11 @@ from cinder.volume import rpcapi as volume_rpcapi scheduler_driver_opts = [ cfg.StrOpt('scheduler_host_manager', default='cinder.scheduler.host_manager.HostManager', - help='The scheduler host manager class to use'), ] + help='The scheduler host manager class to use'), + cfg.IntOpt('scheduler_max_attempts', + default=3, + help='Maximum number of attempts to schedule a volume'), +] FLAGS = flags.FLAGS FLAGS.register_opts(scheduler_driver_opts) diff --git a/cinder/scheduler/filter_scheduler.py b/cinder/scheduler/filter_scheduler.py index ea8dc4ed7..24feebb4c 100644 --- a/cinder/scheduler/filter_scheduler.py +++ b/cinder/scheduler/filter_scheduler.py @@ -40,6 +40,7 @@ class FilterScheduler(driver.Scheduler): super(FilterScheduler, self).__init__(*args, **kwargs) self.cost_function_cache = None self.options = scheduler_options.SchedulerOptions() + self.max_attempts = self._max_attempts() def schedule(self, context, topic, method, *args, **kwargs): """The schedule() contract requires we return the one @@ -74,8 +75,91 @@ class FilterScheduler(driver.Scheduler): image_id = request_spec['image_id'] updated_volume = driver.volume_update_db(context, volume_id, host) + self._post_select_populate_filter_properties(filter_properties, + weighed_host.obj) + + # context is not serializable + filter_properties.pop('context', None) + self.volume_rpcapi.create_volume(context, updated_volume, host, - snapshot_id, image_id) + request_spec=request_spec, + filter_properties=filter_properties, + allow_reschedule=True, + snapshot_id=snapshot_id, + image_id=image_id) + + def _post_select_populate_filter_properties(self, filter_properties, + host_state): + """Add additional information to the filter properties after a host has + been selected by the scheduling process. + """ + # Add a retry entry for the selected volume backend: + self._add_retry_host(filter_properties, host_state.host) + + def _add_retry_host(self, filter_properties, host): + """Add a retry entry for the selected volume backend. In the event that + the request gets re-scheduled, this entry will signal that the given + backend has already been tried. 
+ """ + retry = filter_properties.get('retry', None) + if not retry: + return + hosts = retry['hosts'] + hosts.append(host) + + def _max_attempts(self): + max_attempts = FLAGS.scheduler_max_attempts + if max_attempts < 1: + msg = _("Invalid value for 'scheduler_max_attempts', " + "must be >=1") + raise exception.InvalidParameterValue(err=msg) + return max_attempts + + def _log_volume_error(self, volume_id, retry): + """If the request contained an exception from a previous volume + create operation, log it to aid debugging + """ + exc = retry.pop('exc', None) # string-ified exception from volume + if not exc: + return # no exception info from a previous attempt, skip + + hosts = retry.get('hosts', None) + if not hosts: + return # no previously attempted hosts, skip + + last_host = hosts[-1] + msg = _("Error from last vol-service: %(last_host)s : " + "%(exc)s") % locals() + LOG.error(msg, volume_id=volume_id) + + def _populate_retry(self, filter_properties, properties): + """Populate filter properties with history of retries for this + request. If maximum retries is exceeded, raise NoValidHost. + """ + max_attempts = self.max_attempts + retry = filter_properties.pop('retry', {}) + + if max_attempts == 1: + # re-scheduling is disabled. + return + + # retry is enabled, update attempt count: + if retry: + retry['num_attempts'] += 1 + else: + retry = { + 'num_attempts': 1, + 'hosts': [] # list of volume service hosts tried + } + filter_properties['retry'] = retry + + volume_id = properties.get('volume_id') + self._log_volume_error(volume_id, retry) + + if retry['num_attempts'] > max_attempts: + msg = _("Exceeded max scheduling attempts %(max_attempts)d for " + "volume %(volume_id)s") % locals() + raise exception.NoValidHost(reason=msg) def _schedule(self, context, request_spec, filter_properties=None): """Returns a list of hosts that meet the required specs, @@ -84,9 +168,9 @@ class FilterScheduler(driver.Scheduler): elevated = context.elevated() volume_properties = request_spec['volume_properties'] - # Since Nova is using mixed filters from Oslo and it's own, which - # takes 'resource_XX' and 'instance_XX' as input respectively, copying - # 'instance_XX' to 'resource_XX' will make both filters happy. + # Since Cinder is using mixed filters from Oslo and it's own, which + # takes 'resource_XX' and 'volume_XX' as input respectively, copying + # 'volume_XX' to 'resource_XX' will make both filters happy. resource_properties = volume_properties.copy() volume_type = request_spec.get("volume_type", None) resource_type = request_spec.get("volume_type", None) @@ -96,6 +180,8 @@ class FilterScheduler(driver.Scheduler): if filter_properties is None: filter_properties = {} + self._populate_retry(filter_properties, resource_properties) + filter_properties.update({'context': context, 'request_spec': request_spec, 'config_options': config_options, diff --git a/cinder/scheduler/filters/retry_filter.py b/cinder/scheduler/filters/retry_filter.py new file mode 100644 index 000000000..ae84a4e27 --- /dev/null +++ b/cinder/scheduler/filters/retry_filter.py @@ -0,0 +1,45 @@ +# Copyright (c) 2012 OpenStack, LLC. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from cinder.openstack.common import log as logging +from cinder.openstack.common.scheduler import filters + +LOG = logging.getLogger(__name__) + + +class RetryFilter(filters.BaseHostFilter): + """Filter out nodes that have already been attempted for scheduling + purposes + """ + + def host_passes(self, host_state, filter_properties): + """Skip nodes that have already been attempted.""" + retry = filter_properties.get('retry', None) + if not retry: + # Re-scheduling is disabled + LOG.debug("Re-scheduling is disabled") + return True + + hosts = retry.get('hosts', []) + host = host_state.host + + passes = host not in hosts + pass_msg = "passes" if passes else "fails" + + LOG.debug(_("Host %(host)s %(pass_msg)s. Previously tried hosts: " + "%(hosts)s") % locals()) + + # Host passes if it's not in the list of previously attempted hosts: + return passes diff --git a/cinder/scheduler/manager.py b/cinder/scheduler/manager.py index f3f170dc6..92247a89c 100644 --- a/cinder/scheduler/manager.py +++ b/cinder/scheduler/manager.py @@ -37,8 +37,8 @@ from cinder.volume import rpcapi as volume_rpcapi LOG = logging.getLogger(__name__) scheduler_driver_opt = cfg.StrOpt('scheduler_driver', - default='cinder.scheduler.simple.' - 'SimpleScheduler', + default='cinder.scheduler.filter_scheduler.' + 'FilterScheduler', help='Default scheduler driver to use') FLAGS = flags.FLAGS diff --git a/cinder/tests/scheduler/test_filter_scheduler.py b/cinder/tests/scheduler/test_filter_scheduler.py index 064c27461..6579d6cb9 100644 --- a/cinder/tests/scheduler/test_filter_scheduler.py +++ b/cinder/tests/scheduler/test_filter_scheduler.py @@ -22,6 +22,7 @@ from cinder import test from cinder.openstack.common.scheduler import weights from cinder.scheduler import filter_scheduler +from cinder.scheduler import host_manager from cinder.tests.scheduler import fakes from cinder.tests.scheduler import test_scheduler from cinder.tests import utils as test_utils @@ -53,7 +54,7 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase): 'volume_type': {'name': 'LVM_iSCSI'}, 'volume_id': ['fake-id1']} self.assertRaises(exception.NoValidHost, sched.schedule_create_volume, - fake_context, request_spec, None) + fake_context, request_spec, {}) @test.skip_if(not test_utils.is_cinder_installed(), 'Test requires Cinder installed (try setup.py develop') @@ -78,7 +79,7 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase): 'volume_type': {'name': 'LVM_iSCSI'}, 'volume_id': ['fake-id1']} self.assertRaises(exception.NoValidHost, sched.schedule_create_volume, - fake_context, request_spec, None) + fake_context, request_spec, {}) self.assertTrue(self.was_admin) @test.skip_if(not test_utils.is_cinder_installed(), @@ -110,3 +111,108 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase): self.mox.ReplayAll() weighed_host = sched._schedule(fake_context, request_spec, {}) self.assertTrue(weighed_host.obj is not None) + + def test_max_attempts(self): + self.flags(scheduler_max_attempts=4) + + sched = fakes.FakeFilterScheduler() + self.assertEqual(4, sched._max_attempts()) + + def test_invalid_max_attempts(self): + 
self.flags(scheduler_max_attempts=0) + + self.assertRaises(exception.InvalidParameterValue, + fakes.FakeFilterScheduler) + + def test_retry_disabled(self): + # Retry info should not get populated when re-scheduling is off. + self.flags(scheduler_max_attempts=1) + sched = fakes.FakeFilterScheduler() + + request_spec = {'volume_type': {'name': 'LVM_iSCSI'}, + 'volume_properties': {'project_id': 1, + 'size': 1}} + filter_properties = {} + + sched._schedule(self.context, request_spec, + filter_properties=filter_properties) + + # should not have retry info in the populated filter properties: + self.assertFalse("retry" in filter_properties) + + def test_retry_attempt_one(self): + # Test retry logic on initial scheduling attempt. + self.flags(scheduler_max_attempts=2) + sched = fakes.FakeFilterScheduler() + + request_spec = {'volume_type': {'name': 'LVM_iSCSI'}, + 'volume_properties': {'project_id': 1, + 'size': 1}} + filter_properties = {} + + sched._schedule(self.context, request_spec, + filter_properties=filter_properties) + + num_attempts = filter_properties['retry']['num_attempts'] + self.assertEqual(1, num_attempts) + + def test_retry_attempt_two(self): + # Test retry logic when re-scheduling. + self.flags(scheduler_max_attempts=2) + sched = fakes.FakeFilterScheduler() + + request_spec = {'volume_type': {'name': 'LVM_iSCSI'}, + 'volume_properties': {'project_id': 1, + 'size': 1}} + + retry = dict(num_attempts=1) + filter_properties = dict(retry=retry) + + sched._schedule(self.context, request_spec, + filter_properties=filter_properties) + + num_attempts = filter_properties['retry']['num_attempts'] + self.assertEqual(2, num_attempts) + + def test_retry_exceeded_max_attempts(self): + # Test for necessary explosion when max retries is exceeded. + self.flags(scheduler_max_attempts=2) + sched = fakes.FakeFilterScheduler() + + request_spec = {'volume_type': {'name': 'LVM_iSCSI'}, + 'volume_properties': {'project_id': 1, + 'size': 1}} + + retry = dict(num_attempts=2) + filter_properties = dict(retry=retry) + + self.assertRaises(exception.NoValidHost, sched._schedule, self.context, + request_spec, filter_properties=filter_properties) + + def test_add_retry_host(self): + retry = dict(num_attempts=1, hosts=[]) + filter_properties = dict(retry=retry) + host = "fakehost" + + sched = fakes.FakeFilterScheduler() + sched._add_retry_host(filter_properties, host) + + hosts = filter_properties['retry']['hosts'] + self.assertEqual(1, len(hosts)) + self.assertEqual(host, hosts[0]) + + def test_post_select_populate(self): + # Test addition of certain filter props after a node is selected. 
+ retry = {'hosts': [], 'num_attempts': 1} + filter_properties = {'retry': retry} + sched = fakes.FakeFilterScheduler() + + host_state = host_manager.HostState('host') + host_state.total_capacity_gb = 1024 + sched._post_select_populate_filter_properties(filter_properties, + host_state) + + self.assertEqual('host', + filter_properties['retry']['hosts'][0]) + + self.assertEqual(1024, host_state.total_capacity_gb) diff --git a/cinder/tests/scheduler/test_host_filters.py b/cinder/tests/scheduler/test_host_filters.py index a6fb856cb..810056fd1 100644 --- a/cinder/tests/scheduler/test_host_filters.py +++ b/cinder/tests/scheduler/test_host_filters.py @@ -128,3 +128,32 @@ class HostFiltersTestCase(test.TestCase): 'updated_at': None, 'service': service}) self.assertTrue(filt_cls.host_passes(host, filter_properties)) + + @test.skip_if(not test_utils.is_cinder_installed(), + 'Test requires Cinder installed') + def test_retry_filter_disabled(self): + # Test case where retry/re-scheduling is disabled. + filt_cls = self.class_map['RetryFilter']() + host = fakes.FakeHostState('host1', {}) + filter_properties = {} + self.assertTrue(filt_cls.host_passes(host, filter_properties)) + + @test.skip_if(not test_utils.is_cinder_installed(), + 'Test requires Cinder installed') + def test_retry_filter_pass(self): + # Node not previously tried. + filt_cls = self.class_map['RetryFilter']() + host = fakes.FakeHostState('host1', {}) + retry = dict(num_attempts=2, hosts=['host2']) + filter_properties = dict(retry=retry) + self.assertTrue(filt_cls.host_passes(host, filter_properties)) + + @test.skip_if(not test_utils.is_cinder_installed(), + 'Test requires Cinder installed') + def test_retry_filter_fail(self): + # Node was already tried. + filt_cls = self.class_map['RetryFilter']() + host = fakes.FakeHostState('host1', {}) + retry = dict(num_attempts=1, hosts=['host1']) + filter_properties = dict(retry=retry) + self.assertFalse(filt_cls.host_passes(host, filter_properties)) diff --git a/cinder/tests/test_volume.py b/cinder/tests/test_volume.py index 6170da2e9..1b661f340 100644 --- a/cinder/tests/test_volume.py +++ b/cinder/tests/test_volume.py @@ -508,7 +508,8 @@ class VolumeTestCase(test.TestCase): def fake_local_path(volume): return dst_path - def fake_copy_image_to_volume(context, volume, image_id): + def fake_copy_image_to_volume(context, volume, + image_service, image_id): pass def fake_fetch_to_raw(context, image_service, image_id, vol_path): @@ -545,11 +546,6 @@ class VolumeTestCase(test.TestCase): db.volume_destroy(self.context, volume_id) os.unlink(dst_path) - def test_create_volume_from_image_status_downloading(self): - """Verify that before copying image to volume, it is in downloading - state.""" - self._create_volume_from_image('downloading', True) - def test_create_volume_from_image_status_available(self): """Verify that before copying image to volume, it is in available state.""" @@ -577,7 +573,7 @@ class VolumeTestCase(test.TestCase): self.assertRaises(exception.ImageNotFound, self.volume.create_volume, self.context, - volume_id, + volume_id, None, None, None, None, image_id) volume = db.volume_get(self.context, volume_id) diff --git a/cinder/tests/test_volume_rpcapi.py b/cinder/tests/test_volume_rpcapi.py index 5fdc8092a..7a75224b8 100644 --- a/cinder/tests/test_volume_rpcapi.py +++ b/cinder/tests/test_volume_rpcapi.py @@ -112,10 +112,13 @@ class VolumeRpcAPITestCase(test.TestCase): rpc_method='cast', volume=self.fake_volume, host='fake_host1', + request_spec='fake_request_spec', + 
filter_properties='fake_properties', + allow_reschedule=True, snapshot_id='fake_snapshot_id', image_id='fake_image_id', source_volid='fake_src_id', - version='1.1') + version='1.4') def test_delete_volume(self): self._test_volume_api('delete_volume', diff --git a/cinder/volume/api.py b/cinder/volume/api.py index eadced454..ffd9d8bc1 100644 --- a/cinder/volume/api.py +++ b/cinder/volume/api.py @@ -242,8 +242,12 @@ class API(base.Base): self.volume_rpcapi.create_volume(context, volume_ref, volume_ref['host'], - snapshot_id, - image_id) + request_spec=request_spec, + filter_properties= + filter_properties, + allow_reschedule=False, + snapshot_id=snapshot_id, + image_id=image_id) elif source_volid: source_volume_ref = self.db.volume_get(context, source_volid) @@ -255,18 +259,22 @@ class API(base.Base): self.volume_rpcapi.create_volume(context, volume_ref, volume_ref['host'], - snapshot_id, - image_id, - source_volid) + request_spec=request_spec, + filter_properties= + filter_properties, + allow_reschedule=False, + snapshot_id=snapshot_id, + image_id=image_id, + source_volid=source_volid) else: - self.scheduler_rpcapi.create_volume( - context, - FLAGS.volume_topic, - volume_id, - snapshot_id, - image_id, - request_spec=request_spec, - filter_properties=filter_properties) + self.scheduler_rpcapi.create_volume(context, + FLAGS.volume_topic, + volume_id, + snapshot_id, + image_id, + request_spec=request_spec, + filter_properties= + filter_properties) @wrap_check_policy def delete(self, context, volume, force=False): diff --git a/cinder/volume/manager.py b/cinder/volume/manager.py index 2b88d9a61..f194f624c 100644 --- a/cinder/volume/manager.py +++ b/cinder/volume/manager.py @@ -37,6 +37,10 @@ intact. """ + +import sys +import traceback + from cinder import context from cinder import exception from cinder import flags @@ -103,7 +107,7 @@ MAPPING = { class VolumeManager(manager.SchedulerDependentManager): """Manages attachable block storage devices.""" - RPC_API_VERSION = '1.3' + RPC_API_VERSION = '1.4' def __init__(self, volume_driver=None, *args, **kwargs): """Load the driver from the one specified in args, or from flags.""" @@ -146,10 +150,44 @@ class VolumeManager(manager.SchedulerDependentManager): # collect and publish service capabilities self.publish_service_capabilities(ctxt) - def create_volume(self, context, volume_id, snapshot_id=None, - image_id=None, source_volid=None): + def _create_volume(self, context, volume_ref, snapshot_ref, + srcvol_ref, image_service, image_id, image_location): + cloned = None + model_update = False + + if all(x is None for x in(snapshot_ref, image_location, srcvol_ref)): + model_update = self.driver.create_volume(volume_ref) + elif snapshot_ref is not None: + model_update = self.driver.create_volume_from_snapshot( + volume_ref, + snapshot_ref) + elif srcvol_ref is not None: + model_update = self.driver.create_cloned_volume(volume_ref, + srcvol_ref) + else: + # create the volume from an image + cloned = self.driver.clone_image(volume_ref, image_location) + if not cloned: + model_update = self.driver.create_volume(volume_ref) + #copy the image onto the volume. 
+ status = 'downloading' + self.db.volume_update(context, + volume_ref['id'], + {'status': status}) + self._copy_image_to_volume(context, + volume_ref, + image_service, + image_id) + + return model_update, cloned + + def create_volume(self, context, volume_id, request_spec=None, + filter_properties=None, allow_reschedule=True, + snapshot_id=None, image_id=None, source_volid=None): """Creates and exports the volume.""" context = context.elevated() + if filter_properties is None: + filter_properties = {} volume_ref = self.db.volume_get(context, volume_id) self._notify_about_volume_usage(context, volume_ref, "create.start") LOG.info(_("volume %s: creating"), volume_ref['name']) @@ -167,36 +205,52 @@ class VolumeManager(manager.SchedulerDependentManager): vol_size = volume_ref['size'] LOG.debug(_("volume %(vol_name)s: creating lv of" " size %(vol_size)sG") % locals()) - if all(x is None for x in(snapshot_id, image_id, source_volid)): - model_update = self.driver.create_volume(volume_ref) - elif snapshot_id is not None: + snapshot_ref = None + sourcevol_ref = None + image_service = None + image_location = None + image_meta = None + + if snapshot_id is not None: snapshot_ref = self.db.snapshot_get(context, snapshot_id) - model_update = self.driver.create_volume_from_snapshot( - volume_ref, - snapshot_ref) elif source_volid is not None: - src_vref = self.db.volume_get(context, source_volid) - model_update = self.driver.create_cloned_volume(volume_ref, - src_vref) - self.db.volume_glance_metadata_copy_from_volume_to_volume( - context, - source_volid, - volume_id) - else: + sourcevol_ref = self.db.volume_get(context, source_volid) + elif image_id is not None: # create the volume from an image image_service, image_id = \ glance.get_remote_image_service(context, image_id) image_location = image_service.get_location(context, image_id) image_meta = image_service.show(context, image_id) - cloned = self.driver.clone_image(volume_ref, image_location) - if not cloned: - model_update = self.driver.create_volume(volume_ref) - status = 'downloading' + + try: + model_update, cloned = self._create_volume(context, + volume_ref, + snapshot_ref, + sourcevol_ref, + image_service, + image_id, + image_location) + except Exception: + # restore source volume status before reschedule + if sourcevol_ref is not None: + self.db.volume_update(context, sourcevol_ref['id'], + {'status': sourcevol_ref['status']}) + exc_info = sys.exc_info() + # try to re-schedule volume: + self._reschedule_or_reraise(context, volume_id, exc_info, + snapshot_id, image_id, + request_spec, filter_properties, + allow_reschedule) if model_update: volume_ref = self.db.volume_update( context, volume_ref['id'], model_update) + if sourcevol_ref is not None: + self.db.volume_glance_metadata_copy_from_volume_to_volume( + context, + source_volid, + volume_id) LOG.debug(_("volume %s: creating export"), volume_ref['name']) model_update = self.driver.create_export(context, volume_ref) @@ -214,13 +268,6 @@ class VolumeManager(manager.SchedulerDependentManager): volume_ref['id'], snapshot_id) - now = timeutils.utcnow() - self.db.volume_update(context, - volume_ref['id'], {'status': status, - 'launched_at': now}) - LOG.debug(_("volume %s: created successfully"), volume_ref['name']) - self._reset_stats() - if image_id and not cloned: if image_meta: # Copy all of the Glance image properties to the @@ -239,11 +286,91 @@ class VolumeManager(manager.SchedulerDependentManager): volume_ref['id'], key, value) - # Copy the image onto the volume. 
- self._copy_image_to_volume(context, volume_ref, image_id) + now = timeutils.utcnow() + self.db.volume_update(context, + volume_ref['id'], {'status': status, + 'launched_at': now}) + LOG.debug(_("volume %s: created successfully"), volume_ref['name']) + self._reset_stats() + self._notify_about_volume_usage(context, volume_ref, "create.end") return volume_ref['id'] + def _log_original_error(self, exc_info, volume_id): + type_, value, tb = exc_info + LOG.error(_('Error: %s') % + traceback.format_exception(type_, value, tb), + volume_id=volume_id) + + def _reschedule_or_reraise(self, context, volume_id, exc_info, + snapshot_id, image_id, request_spec, + filter_properties, allow_reschedule): + """Try to re-schedule the create or re-raise the original error to + error out the volume. + """ + if not allow_reschedule: + raise exc_info[0], exc_info[1], exc_info[2] + + rescheduled = False + + try: + method_args = (FLAGS.volume_topic, volume_id, snapshot_id, + image_id, request_spec, filter_properties) + + rescheduled = self._reschedule(context, request_spec, + filter_properties, volume_id, + self.scheduler_rpcapi.create_volume, + method_args, + exc_info) + + except Exception: + rescheduled = False + LOG.exception(_("Error trying to reschedule"), + volume_id=volume_id) + + if rescheduled: + # log the original build error + self._log_original_error(exc_info, volume_id) + else: + # not re-scheduling + raise exc_info[0], exc_info[1], exc_info[2] + + def _reschedule(self, context, request_spec, filter_properties, + volume_id, scheduler_method, method_args, + exc_info=None): + """Attempt to re-schedule a volume operation.""" + + retry = filter_properties.get('retry', None) + if not retry: + # no retry information, do not reschedule. + LOG.debug(_("Retry info not present, will not reschedule"), + volume_id=volume_id) + return + + if not request_spec: + LOG.debug(_("No request spec, will not reschedule"), + volume_id=volume_id) + return + + request_spec['volume_id'] = [volume_id] + + LOG.debug(_("Re-scheduling %(method)s: attempt %(num)d") % + {'method': scheduler_method.func_name, + 'num': retry['num_attempts']}, volume_id=volume_id) + + # reset the volume state: + now = timeutils.utcnow() + self.db.volume_update(context, volume_id, + {'status': 'creating', + 'scheduled_at': now}) + + if exc_info: + # stringify to avoid circular ref problem in json serialization: + retry['exc'] = traceback.format_exception(*exc_info) + + scheduler_method(context, *method_args) + return True + def delete_volume(self, context, volume_id): """Deletes and unexports volume.""" context = context.elevated() @@ -408,23 +535,14 @@ class VolumeManager(manager.SchedulerDependentManager): volume_ref['name'] not in volume_ref['provider_location']): self.driver.ensure_export(context, volume_ref) - def _copy_image_to_volume(self, context, volume, image_id): + def _copy_image_to_volume(self, context, volume, image_service, image_id): """Downloads Glance image to the specified volume. 
""" volume_id = volume['id'] - payload = {'volume_id': volume_id, 'image_id': image_id} - try: - image_service, image_id = glance.get_remote_image_service(context, - image_id) - self.driver.copy_image_to_volume(context, volume, image_service, - image_id) - LOG.debug(_("Downloaded image %(image_id)s to %(volume_id)s " - "successfully") % locals()) - self.db.volume_update(context, volume_id, - {'status': 'available'}) - except Exception, error: - with excutils.save_and_reraise_exception(): - payload['message'] = unicode(error) - self.db.volume_update(context, volume_id, {'status': 'error'}) + self.driver.copy_image_to_volume(context, volume, + image_service, + image_id) + LOG.debug(_("Downloaded image %(image_id)s to %(volume_id)s " + "successfully") % locals()) def copy_volume_to_image(self, context, volume_id, image_meta): """Uploads the specified volume to Glance. diff --git a/cinder/volume/rpcapi.py b/cinder/volume/rpcapi.py index d9f72f0a0..b4098c8b8 100644 --- a/cinder/volume/rpcapi.py +++ b/cinder/volume/rpcapi.py @@ -36,6 +36,8 @@ class VolumeAPI(cinder.openstack.common.rpc.proxy.RpcProxy): 1.1 - Adds clone volume option to create_volume. 1.2 - Add publish_service_capabilities() method. 1.3 - Pass all image metadata (not just ID) in copy_volume_to_image + 1.4 - Add request_spec, filter_properties and + allow_reschedule arguments to create_volume(). ''' BASE_RPC_API_VERSION = '1.0' @@ -46,18 +48,23 @@ class VolumeAPI(cinder.openstack.common.rpc.proxy.RpcProxy): default_version=self.BASE_RPC_API_VERSION) def create_volume(self, ctxt, volume, host, + request_spec, filter_properties, + allow_reschedule=True, snapshot_id=None, image_id=None, source_volid=None): self.cast(ctxt, self.make_msg('create_volume', volume_id=volume['id'], + request_spec=request_spec, + filter_properties=filter_properties, + allow_reschedule=allow_reschedule, snapshot_id=snapshot_id, image_id=image_id, source_volid=source_volid), topic=rpc.queue_get_for(ctxt, self.topic, host), - version='1.1') + version='1.4') def delete_volume(self, ctxt, volume): self.cast(ctxt, diff --git a/setup.py b/setup.py index 2770d554f..691bc388b 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,8 @@ filters = [ "cinder.scheduler.filters.capacity_filter:CapacityFilter", "JsonFilter = " "cinder.openstack.common.scheduler.filters.json_filter:JsonFilter", + "RetryFilter = " + "cinder.scheduler.filters.retry_filter:RetryFilter", ] weights = [