Accela Zhao <accelazh@gmail.com>
Adam C. Emerson <aemerson@linuxbox.com>
Adam Crume <adamcrume@gmail.com>
+Adam Kupczyk <akupczyk@mirantis.com>
Adam Manzanares <nmtadam@gmail.com>
Adam Spiers <aspiers@suse.com>
Adam Twardowski <adam.twardowski@gmail.com>
Anton Aksola <anton.aksola@nebula.fi>
Anton Blanchard <anton@samba.org>
apovzner <apovzner@29311d96-e01e-0410-9327-a35deaab8ce9>
+Aran85 <zhangzengran@h3c.com>
Ariela <Dell@ARIELA.(none)>
Aristoteles Neto <aristoteles.neto@webdrive.co.nz>
Armando Segnini <armaseg@gmail.com>
BJ Lougee <almightybeeij@gmail.com>
Bjørnar Ness <bjornar.ness@gmail.com>
Blaine Gardner <blaine.gardner@hp.com>
+blinke <Burkhard.Linke@computational.bio.uni-giessen.de>
Bo Cai <cai.bo@h3c.com>
Boris Ranto <branto@redhat.com>
Brad Hubbard <bhubbard@redhat.com>
Dan Chai <tengweicai@gmail.com>
Daniel Gollub <d.gollub@telekom.de>
Daniel Gryniewicz <dang@fprintf.net>
+Daniel Gryniewicz <dang@redhat.com>
Daniel J. Hofmann <daniel@trvx.org>
Dan Mick <dan.mick@inktank.com>
Dan Mick <dmick@redhat.com>
Dongsu Park <dpark1978@gmail.com>
Dong Yuan <yuandong1222@gmail.com>
Douglas Fuller <dfuller@redhat.com>
+Dunrong Huang <riegamaths@gmail.com>
dwj192 <duanweijun@h3c.com>
Eleanor Cawthon <eleanor.cawthon@inktank.com>
Emily Popper <emily.popper@dreamhost.com>
Guang Yang <yguang@yahoo-inc.com>
guce <guce@h3c.com>
Guilhem Lettron <guilhem@lettron.fr>
+Gu Zhongyan <guzhongyan@360.cn>
Haifeng Liu <haifeng@yahoo-inc.com>
Hannes Reinecke <hare@suse.de>
Hannu Valtonen <hannu.valtonen@ormod.com>
Igor Fedotov <ifedotov@mirantis.com>
Ilya Dryomov <idryomov@redhat.com>
Ilya Dryomov <ilya.dryomov@inktank.com>
+Ira Cooper <ira@redhat.com>
islepnev <islepnev@gmail.com>
James Page <james.page@ubuntu.com>
James Ryan Cresawn <jrcresawn@gmail.com>
Javier M. Mellid <jmunhoz@igalia.com>
Jeff Weber <jweber@cofront.net>
Jenkins Build Slave User <jenkins-build@jenkins-slave-wheezy.localdomain>
+Jenkins Build Slave User <jenkins-build@trusty-huge--349f4788-0128-42d9-aca5-78c0aad008e6.localdomain>
+Jenkins Build Slave User <jenkins-build@trusty-huge--d6d4765c-3c95-4fe1-a8cb-776094e0d416.localdomain>
+Jenkins Build Slave User <jenkins-build@trusty-small--296081f3-e7b1-46b1-96c2-d6b655cf71d3.localdomain>
Jenkins <jenkins@ceph.com>
Jenkins <jenkins@inktank.com>
Jens-Christian Fischer <jens-christian.fischer@switch.ch>
Martin Ettl <ettl.martin@gmx.de>
Matt Benjamin <matt@cohortfs.com>
Matt Benjamin <matt@linuxbox.com>
+Matt Benjamin <mbenjamin@redhat.com>
Matthew Roy <matthew@royhousehold.net>
Matthew Wodrich <matthew.wodrich@dreamhost.com>
Matt Richards <mattjrichards@gmail.com>
Mouad Benchchaoui <m.benchchaoui@x-ion.de>
Mykola Golub <mgolub@mirantis.com>
Mykola Golub <mgolub@zhuzha.mirantis.lviv.net>
+Mykola Golub <to.my.trociny@gmail.com>
nairolf21 <fcoste21@gmail.com>
Nathan Cutler <ncutler@suse.com>
Nathan Cutler <ncutler@suse.cz>
+Nathan Cutler <presnypreklad@gmail.com>
Neil Horman <nhorman@tuxdriver.com>
Neil Levine <neil.levine@inktank.com>
Nikola Kotur <kotnick@gmail.com>
renhwztetecs <rhwlyw@163.com>
riccardo80 <riccardo80@29311d96-e01e-0410-9327-a35deaab8ce9>
Riccardo Ferretti <rferrett@soe.ucsc.edu>
+Richard W.M. Jones <rjones@redhat.com>
ritz303 <ritz_303@yahoo.com>
Roald J. van Loon <roald@roaldvanloon.nl>
RobertJansen1 <r.jansen86@gmail.com>
Roman Haritonov <reclosedev@gmail.com>
Ron Allred <rallred@itrefined.com>
Rongze Zhu <zrzhit@gmail.com>
+root <root@ceph-node1.homeoffice.wal-mart.com>
root <root@phenom.dyweni.com>
Ross Turk <ross.turk@inktank.com>
Ross Turk <rturk@redhat.com>
Sebastien Han <sebastien.han@enovance.com>
Sebastien Ponce <sebastien.ponce@cern.ch>
Sharif Olorin <sio@tesser.org>
+shawn <cxwshawn@gmail.com>
Shawn Edwards <lesser.evil@gmail.com>
shishir gowda <shishir.gowda@sandisk.com>
+shun-s <songshun134@126.com>
+shun-s <song.shun3@zte.com.cn>
Shu, Xinxin <xinxin.shu@intel.com>
Shylesh Kumar <shmohan@redhat.com>
Simone Gotti <simone.gotti@gmail.com>
Vicente Cheng <freeze.bilsted@gmail.com>
Vikhyat Umrao <vumrao@redhat.com>
Viktor Suprun <popsul1993@gmail.com>
+Vitja Makarov <vitja.makarov@gmail.com>
Volker Assmann <volker@twisted-nerve.de>
VRan Liu <gliuwr@gmail.com>
Vu Pham <vu@mellanox.com>
Xiaowei Chen <cxwshawn@gmail.com>
Xiaoxi Chen <xiaoxi.chen@intel.com>
xiexingguo <258156334@qq.com>
+xie xingguo <xie.xingguo@zte.com.cn>
Xihui He <xihuihe@gmail.com>
Xing Lin <xinglin@cs.utah.edu>
Xinze Chi <xinze@xksy.com>
+Xinze Chi <xinze@xsky.com>
Xinze Chi <xmdxcxz@gmail.com>
Xiong Yiliang <xiongyiliang@xunlei.com>
yangruifeng <yangruifeng.09209@h3c.com>
Yuri Weinstein <yuri.weinstein@inktank.com>
Zhe Zhang <zzxuanyuan@gmail.com>
Zhiqiang Wang <zhiqiang.wang@intel.com>
+Zhi Zhang <willzzhang@tencent.com>
Zhi Zhang <zhangz.david@outlook.com>
zqkkqz <zheng.qiankun@h3c.com>
-e832001 (HEAD, tag: v0.94.6, origin/hammer) 0.94.6
+fe6d859 (HEAD, tag: v0.94.9, origin/hammer) 0.94.9
+27d8055 Revert "moved to use boost uuid implementation, based on commit 4fe89a7b14c97b2ed7f357132901beb2bdcec551"
+21f6f1d Revert "uuid: use boost::random:random_device"
+a219cf5 doc: release-notes.rst: add missing line to v0.94.8
+a6ba101 doc: add missing changelogs up to 0.94.8
+f3dad33 doc: release-notes: add missing hammer releases
+838cd35 (tag: v0.94.8) 0.94.8
+5248929 rocksdb: disable tcmalloc if disabled
+fdfcd9b ceph.spec: respect CEPH_EXTRA_CONFIGURE_ARGS
+d5274a3 rgw: fix subuser rm command failure
+f963774 rgw: add a method to purge all associate keys when removing a subuser
+0d4b601 doc: fix by-parttypeuuid in ceph-disk(8) nroff
+a3003f6 rgw: reset return code when iterating over the bucket objects
+64211fa rgw: fix compilation
+3e45c6b rgw: add bucket_quota to RGWRegionMap::encode_json
+699b7c8 rgw: add a flavor of bucket deletion that bypasses GC and triggers object deletions asynchronously
+81aef60 rgw: remove bucket index objects when deleting the bucket
+23498a9 mon/OSDMonitor: avoid potential expensive grace calculation
+1b6f6f2 mon/OSDMonitor: improve reweight_by_utilization() logic
+474abb8 OSDMonitor: drop pg temp from sources other than the current primary
+b31ac2d osd: reset session->osdmap if session is not waiting for a map anymore
+3a30ffc qa: Add test for #13829
+f70e4ad common: Allow config set with negative value
+0498969 log: do not repeat errors to stderr
+2633ec3 mds: disallow 'open truncate' non-regular inode
+3f0fb20 mds: only open non-regular inode with mode FILE_MODE_PIN
+2c18015 rgw: fix multi-delete query param parsing.
+8a39e5e configure: Add -D_LARGEFILE64_SOURCE to Linux build.
+3bb248b ReplicatedBackend: delete one useless op->mark_started as there are two in ReplicatedBackend::sub_op_modify_impl Fixes: http://tracker.ceph.com/issues/16572
+ed4ca7c rgw: Set Access-Control-Allow-Origin to an asterisk if allowed in a rule
+b78a1be mon: Monitor: validate prefix on handle_command()
+850881c rgw: fix subuser rm command failure
+055427c Pipe: take a ref to existing while we are waiting
+24cc4f9 rgw: check for -ERR_NOT_MODIFIED in rgw_rest_s3.cc
+7dbace5 erasure-code: s/unreocvery/unfound/
+e726f21 test: add test case for repairing an unrecoverable ec pg.
+40b1c2b osd: Remove the duplicated func MissingLoc::get_all_missing.
+47d5dfc osd: Fix endless ec pg repair when an object cannot be recovered.
+187d308 uuid: use boost::random:random_device
+174de7f moved to use boost uuid implementation, based on commit 4fe89a7b14c97b2ed7f357132901beb2bdcec551
+5cd922c qa/workunits/rbd: respect RBD_CREATE_ARGS environment variable
+1ac920b rgw: fix identification of canceled operation
+a38f157 rgw: identify racing writes when using copy-if-newer
+02f6d8a rgw: translate http error 304 to appropriate status
+7319d76 rgw: fix if-modified-since boundary
+5e4de5a rgw: add rgwx-copy-if-newer http header
+006ea56 Revert "hammer: Scrub error: 0/1 pinned"
+c294bd3 ReplicatedPG: adjust num_pinned in _delete_oid
+43d1b92 test: Fix grouping of mon_cephdf_commands by moving to MON_TESTS
+300c111 rgw: convert plain object to versioned (with null version) when removing
+4eea92b rgw: handle stripe transition when flushing final pending_data_bl
+f6076dd mds: fix wrongly treating symlink inode as normal file/dir when symlink inode is stale on kcephfs
+ce313cd rgw: handle errors properly during GET on Swift's DLO.
+410ff15 osdc/Objecter: upper bound watch_check result
+d3eae0a osd: fix omap digest compare when scrub
+dd29310 rgw: keep track of written_objs correctly
+c2ea6db osd: remove all stale osdmaps in handle_osd_map()
+ac0340a osd: populate the trim_thru epoch using MOSDMap.oldest_map
+bb5e015 osd: dump full map bl at 20 when crc doesn't match
+5057c34 obj_bencher: cosmetic display fixes
+6d8ad0e common: Add space between timestamp and "min lat:" in bench output
+3184998 [MON] Fixed calculation of %USED. Now it shows (space used by all replicas)/(raw space available on OSDs). Before it was (size of pool)/(raw space available on OSDs).
+fed256e mon: add a column 'RAW USED' for ceph df detail
+139691c src/test/objectstore/store_test.cc: fix shards for new tests
+221efb0 doc: s/by-parttype-uuid/by-parttypeuuid/ in ceph-disk
+d56bdf9 (tag: v0.94.7) 0.94.7
+62f4fbe store_test: improve synthetic coverage of the ec stash case
+b6bc9cb store_test: improve synthetic test coverage of long object names
+ec74c12 TestRados: make half the objects have long names
+9d1ee7c LFNIndex::lfn_translate: consider alt attr as well
+6b821cc LFNIndex::created: fix return value with failure injection
+f500435 store_test: add reproducer for #14766
+cbd5aaf osd/PG: update info.stats.* mappings on split
+d1ab71f hammer: rgw: S3: set EncodingType in ListBucketResult
+df4eadc rados: Add units to rados bench output
+76c33de OSDMonitor: avoid underflow in reweight-by-utilization if max_change=1
+d96086a PGLog::rewind_divergent_log: fix rollback_info_trimmed_to before index()
+e79162d TestPGLog: add test for 13965
+fb1b40f osd/Replicated: For CEPH_OSD_OP_WRITE, set data digest.
+f024259 osd/ReplicatedPG: an object with omap may have omap data, an omap header, or both.
+7b3f1da mon/MonClient: fix shutdown race
+ec02d8b PG: set epoch_created and parent_split_bits for child pg
+049bc8a ceph-fuse: double decreased the count to trim caps
+e20df80 osd/ReplicatedPG: make handle_watch_timeout no-op if !active
+3cb72dd mon/OSDMonitor.cc: fix UNINTENDED_INTEGER_DIVISION
+aab3a40 hammer: rbd snap rollback: restore the link to parent
+3c03eee rgw: bucket link now sets the bucket.instance acl
+488a787 ECBackend: send subop writes and reads at normal priority
+a2e7ca1 common/Cycles: Do not initialize Cycles globally.
+ca0beef osd/OSD: fix build_past_intervals_parallel
+fce7902 osd: When generating past intervals due to an import, end at pg epoch
+2c97cb3 rgw: fix compiling error
+2aa1ea6 rgw: Multipart ListPartsResult ETag quotes
+365f21b tests: be more generous with test timeout
+c722d00 rgw: user quota may not adjust on bucket removal
+77a4ed0 ceph.spec.in: disable lttng and babeltrace explicitly
+97f474f cls_rbd: protect against excessively large object maps
+ac3569c hammer: monclient: avoid key renew storm on clock skew
+20f300e rgw: Do not send a Content-Length header on a 304 response
+e53751d rgw: Do not send a Content-Type on a '304 Not Modified' response
+19dbc25 rgw: dump_status() uses integer
+c79b481 rgw: move status_num initialization into constructor
+ceb8e19 rgw: Do not send a Content-Length header on status 204
+3ecdedd mds: fix stray purging in 'stripe_count > 1' case
+f28477c rgw: do not abort when user passed bad parameters to set quota
+9786394 rgw: do not abort when user passed bad parameters to set metadata
+f8d2abd osd/osd_types: encode pg_pool_t the old way
+720a090 mon: disable gmt_hitset if not supported
+7aec079 osd: do not let OSD_HITSET_GMT reuse the feature bit
+3704341 osd: Decode use_gmt_hitset with a unique version
+64bca2a mon: print use_gmt_hitset in "ceph osd pool get"
+87df212 mon: add "ceph osd pool set $pool use_gmt_hitset true" cmd
+0392404 osd: use GMT time for the object name of hitsets
+744e9f8 test/bufferlist: do not expect !is_page_aligned() after unaligned rebuild
+0830275 osd/PG: fix generate_past_intervals
+7eae05e osd/ReplicatedPG: do not proxy read *and* process op locally
+be4a9fe osd/OSDMap: fix typo in summarize_mapping_stats
+2072a53 qa/workunits: remove 'mds setmap' from workunits
+01672b4 mon: Monitor: get rid of weighted clock skew reports
+f90b8bc mon: Monitor: adaptative clock skew detection interval
+57fd7f8 test/librados/test.cc: clean up EC pools' crush rules too
+d4cf190 keyring permissions for mon daemon
+1b922e5 test/pybind/test_ceph_argparse: fix reweight-by-utilization tests
+06a2a75 man/8/ceph.rst: remove invalid option for reweight-by-*
+241f762 mon: remove range=100 from reweight-by-* commands
+55ad2c7 mon: make max_osds an optional arg
+f13cdea mon: make reweight max_change default configurable
+f4b4ef7 mon/OSDMonitor: fix indentation
+76eb3c8 qa/workunits/cephtool/test.sh: test reweight-by-x commands
+9a9d147 osd/MonCommand: add/fix up 'osd [test-]reweight-by-{pg,utilization}'
+6ec676d mon: add 'osd utilization' command
+94134d9 osd/OSDMap: add summarize_mapping_stats
+932f75d mon: make reweight-by-* max_change an argument
+d8372ce osd: add mon_reweight_max_osds to limit reweight-by-* commands
+6a422b2 osd: add mon_reweight_max_change option which limits reweight-by-*
+d3635b7 test: add simple test for new reweight-by-* options
+e993851 osd: add sure and no-increasing options to reweight-by-*
+281d63d librbd: complete cache reads on cache's dedicate thread
+621e3ae test: reproducer for writeback CoW deadlock
+38b9c0b osdc/Objecter: call notify completion only once
+f794ada tests: Add TEST_no_segfault_for_bad_keyring to test/mon/misc.sh
+94da46b tests: make sure no segfault occurs when using some bad keyring
+a371c0f auth: fix a crash when CryptoHandler::create() fails
+af5da4f auth: fix double PK11_DestroyContext() if PK11_DigestFinal() failed
+c3f031a ceph-objectstore-tool, osd: Fix import handling
+647723e tools, test: Add ceph-objectstore-tool to operate on the meta collection
+d875620 common/obj_bencher.cc: make verify error fatal
+04fe951 test/test_rados_tool.sh: force rados bench rand and seq
+6a6754f hammer: tools: fix race condition in seq/rand bench (part 2)
+3a5b102 hammer: tools: fix race condition in seq/rand bench (part 1)
+c4ba93a client: use fuse_req_getgroups() to get group list
+a84ed87 client: use thread local data to track fuse request
+e7f299a client/Client.cc: remove only once used variable
+16e3e2f client/Client.cc: fix realloc memory leak
+b13ddc0 client: added permission check based on getgrouplist
+562c0a9 configure.ac: added autoconf check for getgrouplist
+e014ea8 init-ceph: check if /lib/lsb/init-functions exists
+5726463 packaging: lsb_release build and runtime dependency
+c63baeb global: do not start two daemons with a single pid-file (part 2)
+9282c1d ceph-objectstore-tool: Add dry-run checking to ops missing it
+efc2183 test: Remove redundant test output
+3226615 test: osd-scrub-snaps.sh uses ceph-helpers.sh and added to make check
+995a004 test: Verify replicated PG beyond just data after vstart
+6afb5d3 osd: Use boost::optional instead of snap 0 for "all_clones"
+750f817 test: Fix verify() used after import to also check xattr and omap
+b8c9507 osd, test: When head missing a snapset, clones not an error
+59fee8a test: Add test cases for xattr and omap ceph-objectstore-tool operations
+0988b12 osd, test: Keep missing count and log number of missing clones
+37be959 rados: Minor output changes for consistency across operations
+6c51e48 test: Eliminate check for bogus "obj13/head snaps empty" error
+e92505b ceph-objectstore-tool: Add new remove-clone-metadata object op
+8f88b44 osd: Fix trim_object() to not crash on corrupt snapset
+78b13f5 ceph-objectstore-tool: Improve object spec error handling
+7b800b7 ceph-objectstore-tool: Add undocumented clear-snapset command for testing
+7f398bd ceph-objectstore-tool: Add set-size command for objects
+53dc87f ceph-objectstore-tool: Enhanced dump command replaces dump-info
+a46fc66 test: Add some clones to ceph-objectstore-tool test
+fd518e7 ceph-objectstore-tool: For corrupt objectstores, don't abort listing on errors
+ad7825a ceph-objectstore-tool: Improve some error messages
+26cbf14 ceph-objectstore-tool: White space fixes
+0f78564 tools/rados: Improve xattr import handling so future internal xattrs ignored
+c8e2772 test: Test scrubbing of snapshot problems
+113d5c7 osd: Don't crash if OI_ATTR attribute is missing or corrupt
+3af8f9e osd: Additional _scrub() check for snapset inconsistency
+7103e74 osd: Better SnapSet scrub checking (find issues instead of asserting)
+18af852 osd: Make the _scrub routine produce good output and detect errors properly
+3a1b588 osd: Fix log message name of ceph-objectstore-tool
+0fe3dfe ceph-objectstore-tool: add mark-complete operation
+1bc8882 test: Fix failure test to find message anywhere in stderr
+6ff4217 test: add test for {get,set}-inc-osdmap commands.
+de80bbf test: Add debug argument to the ceph-objectstore-tool test
+0643797 rados: Fix usage for "notify" command
+5ba8649 test: add test for {get,set}-osdmap commands
+3276258 tools, test: Some ceph-objectstore-tool error handling fixes
+cfabcc1 tools/ceph-objectstore-tool: add get-inc-osdmap command
+c7d0fda tools: Check for valid --op earlier so we can get a better error message
+be24c50 tools/ceph-objectstore-tool: add set-inc-osdmap command
+06dcf74 tools: Fix newlines in output of --op list
+e44c042 tools/ceph-objectstore-tool: add "get-osdmap" command
+3f9e467 tools: Fix dump-super which doesn't require pgid
+c60eee1 tools/ceph-objectstore-tool: add "set-osdmap" command
+cfe7d47 tools: Check and specify commands that require the pgid specification
+df0e11e hobject_t: modify operator<<
+6c8884b test, tools: Improve ceph-objectstore-tool import error handling and add tests
+87a7f99 tools: For ec pools list objects in all shards if the pgid doesn't specify
+9ca2f35 tools: clean up errors in ceph-objectstore-tool
+78a59f8 test/ceph-objectstore-tool: Don't need stderr noise
+eab0f24 osd: Show number of divergent_priors in log message
+d58793d osd, tools: Always filter temp objects since not being exported
+efc402e test/ceph-objectstore-tool: Show command that should have failed
+88ac519 test: Add config changes to all tests to avoid order dependency
+3d99ecd tools: Don't export temporary objects until we have persistent-temp objects
+13360d3 test/ceph_objectstore_tool: Improve dump-journal testing
+444ce0a ceph-objectstore-tool: Allow --pgid specified on import (for post split)
+aed1c49 ceph-objectstore-tool: Invalidate pg stats when objects were skipped during pg import
+af3f8ae ceph-objectstore-tool: Add dump-super to show OSDSuperblock in format specified
+4dcf15b mds, include: Fix dump() numeric char array to include additional alpha chars
+feecacf ceph-objectstore-tool: Add dump-journal as not requiring --pgid in usage
+5e8fbb1 test: ceph_test_filejournal: Conform to test infrastructure requirements
+c161cbf test: ceph_test_filejournal need to force aio because testing with a file
+06d3f51 test: ceph_test_filejournal fix missing argument to FileJournal constructor
+2078f63 test: ceph_test_filejournal Add check of journalq in WriteTrim test
+ab893d7 test: Fix ceph-objectstore-tool test missing fd.close()
+b5f2ccd test: Fix ceph-objectstore-tool test error message
+848822d test: ceph-objectstore-tool: Remove duplicate debug messages, keep cmd/log/call together
+771dcd9 test: ceph-objectstore-tool import after split testing
+4f387b1 test: Use CEPH_DIR where appropriate
+b337d67 test: Limit how long ceph-objectstore-tool test will wait for health
+09cb8a4 test: Add optional arg to vstart() to provide additional args to vstart
+b4ac42b test: Test ceph-objectstore-tool --op dump-journal output
+729abf5 test: Pep8 fixes for ceph-objectstore-tool test
+33813b6 test: Fix ceph-objectstore-tool test, overwrite OTHERFILE so second check is meaningful
+f7ab316 osd: FileJournal: Add _fdump() that takes Formatter instead of ostream
+99d3e17 osd: Add simple_dump() to FileJournal for unit testing
+80fc57f osd: FileJournal clean-up
+b8f4ea1 osd: Dump header in FileJournal::dump()
+21c3c18 osd: FileJournal::read_entry() can't use a zero seq to check for corruption
+288902f osd: Fix flushing in FileJournal::dump()
+a935ce5 osd: Add admin socket feature set_recovery_delay
+4ae3f88 ceph-objectstore-tool: For import/export --debug dump the log
+cc5fa68 ceph-objectstore-tool: If object re-appears after removal, just skip it
+d8ae1a9 ceph-objectstore-tool: Add --no-overwrite flag for import-rados
+2dbf843 ceph-objectstore-tool: Remove list-lost because now we have --dry-run flag
+3599174 ceph-objectstore-tool: Add --dry-run option
+05d3b73 ceph-objectstore-tool: Add dump-info command to show object info
+2d764c5 ceph-objectstore-tool: Use empty string for <object> to specify pgmeta object
+3a533d7 ceph-objectstore-tool: Add a couple of strategically placed prints
+7947f4f ceph-objectstore-tool: Clean up error handling
+83de86e ceph-objectstore-tool: Create section around log/missing/divergent_priors of --op log
+ddfaa70 ceph-objectstore-tool: Add divergent_priors handling
+add937c ceph-objectstore-tool: Add --force option which is used for import only
+f332748 ceph-objectstore-tool: Fix pgid scan to skip snapdirs
+3e68825 ceph-objectstore-tool: Add dump-journal op
+aaff4d7 ceph-objectstore-tool: On any exit release CephContext so logging can flush
+7445cf5 ceph-objectstore-tool: Check for keyvaluestore experimental feature
+9da6c01 ceph-objectstore-tool: Eliminate obscure "Invalid params" error
+c5ac7ce ceph-objectstore-tool: Check pgid validity earlier like we did before
+18c49b6 Backport the merge commit of branch 'wip-journal-header' of git://github.com/XinzeChi/ceph
+cf433ba global/pidfile: do not start two daemons with a single pid-file
+b43c5b2 unittest_crypto: benchmark 100,000 CryptoKey::encrypt() calls
+e832001 (tag: v0.94.6) 0.94.6
+a1fc101 crushtool: send --tree to stdout
+4fb688d osd: force-write journal header when journal is closed
+31a2fc4 common/bit_vector: use hard-coded value for block size
+3352b14 ceph.in: Notify user that 'tell' can't be used in interactive mode
+14b5fea mon/LogMonitor: use the configured facility if log to syslog
+10d29da os/LevelDBStore:fix bug when compact_on_mount
+d5ba063 OSDMap: reset osd_primary_affinity shared_ptr when deepish_copy_from
+9e0a165 OSD::consume_map: correctly remove pg shards which are no longer acting
+5a450e6 mon: add mon_config_key prefix when sync full
+b9a4ad9 Fixed the ceph get mdsmap assertion.
9ab5fd9 rgw-admin: document orphans commands in usage
0e1378e [backport] rgw: fix wrong etag calculation during POST on S3 bucket.
5c8d1d7 [backport] rgw: Make RGW_MAX_PUT_SIZE configurable
f2ca42b doc: add orphans commands to radosgw-admin(8)
e42ed6d man: rebuild manpages
a8fc6a9 fsx: checkout old version until it compiles properly on miras
+eb048a3 qa/workunits/post-file.sh: sudo
+e9039f4 qa/workunits/post-file: pick a dir that's readable by world
+902abe7 qa/workunits/post-file.sh: use /etc/default
+1c8c708 librbd: ensure librados callbacks are flushed prior to destroying image
+f892566 librbd: simplify IO flush handling
+e5dfd3d WorkQueue: PointerWQ drain no longer waits for other queues
+edf60b4 test: new librbd flatten test case
+88ffcc2 ceph-disk: use blkid instead of sgdisk -i
1b02859 qa/fsstress.sh: fix 'cp not writing through dangling symlink'
+f209819 [ceph-fuse] fix ceph-fuse writing to stale log file after log rotation
9109304 mon: compact full epochs also
2817ffc Check for full before changing the cached obc
ae56de0 osd: recency should look at newest (not oldest) hitsets
a5e4f70 man: document listwatchers cmd in "rados" manpage
46d626d rbd: remove canceled tasks from timer thread
24c0b27 rbd-replay: handle EOF gracefully
+3d84420 PG::activate(): handle unexpected cached_removed_snaps more gracefully
+ad4df3b rgw: warn on suspicious civetweb frontend parameters
70f1ba3 tools: monstore: add 'show-versions' command.
9260171 tools: ceph_monstore_tool: add inflate-pgmap command
a1d5728 tools: support printing the crushmap in readable fashion.
53742bd ceph_osd: Add required feature bits related to this branch to osd_required mask
3066231 osd: CEPH_FEATURE_CHUNKY_SCRUB feature now required
6379ff1 configure.ac: no need to add "+" before ac_ext=c
+5c92d1d rgw: Add default quota config
f96c812 rgw: fix reload on non Debian systems.
cbb5c1f Fixing NULL pointer dereference
+17d1b0d rgw: radosgw-admin bucket check --fix does not work
b2961ce rbd: fix bench-write
9cee89b Check that delta_sum.stats.sum.num_object_copies and delta_sum.stats.sum.num_object are greater than zero
1ab2b48 ReplicatedPG: fix sparse-read result code checking logic
86f5cf6 osd: clear pg_stat_queue after stopping pgs
b0856ee osd: Test osd_find_best_info_ignore_history_les config in another assert
b2f1e76 Compare parted output with the dereferenced path
+df3f971 auth: return error code from encrypt/decrypt; make error string optional
+224bb39 auth: optimize crypto++ key context
+f11718d auth/Crypto: optimize libnss key
+d1b6096 auth: refactor crypto key context
+3249f48 auth/cephx: optimize signature check
+51eaf98 auth/cephx: move signature calc into helper
+c240da9 auth/Crypto: avoid memcpy on libnss crypto operation
+86cc0f0 auth: make CryptoHandler implementations totally private
5264bc6 mon: OSDMonitor: do not assume a session exists in send_incremental()
4d0b9a1 log: Log.cc: Assign LOG_DEBUG priority to syslog calls
26e832e librbd: fix merge-diff for >2GB diff-files
+f04e007 osd: log inconsistent shard sizes
a9d3f07 osd/osd_types: skip encoding newer object_info_t fields if they are unused
1548a3f osd/ReplicatedPG: do not set local_mtime on non-tiered pool
98bdb09 osd/PGBackend: use mtime for digest decision if local_mtime is empty
e53d66e packaging: add new tracepoint probe shared libraries
bb7c0f5 ceph.spec.in: add new tracepoint probe shared libraries
e1da271 lttng: move tracepoint probes to dynamic libraries
+b2393dc client: add InodeRef.h to make dist
8358fb8 revert: osd: use GMT time for hitsets
4420929 rgw: fix modification to index attrs when setting acls
8378aaf build/ops: rbd-replay moved from ceph-test-dbg to ceph-common-dbg
6a40e4f ceph.spec.in: lttng in SLES12 only
e508a44 ceph.spec.in: fix lttng/babeltrace conditionals
19c9546 packaging: move rbd-replay* to ceph-common
+fa79bd9 client: use smart pointer to track 'cwd' and 'root_parents'
+fcafc65 client: convert Inode::snapdir_parent to smart pointer
+78cca4e client: use smart pointer to track temporary inode reference
+bad6f33 client: convert CapSnap::in to smart pointer
+4bb48ee client: convert Fh::inode to smart pointer
+5bebb3a client: use smart pointers in MetaRequest
+e18f1ae client: convert Dentry::inode to smart pointer
+a7f6d2f client: hold reference for returned inode
3d3595f krbd: remove deprecated --quiet param from udevadm
4d81cd1 run_cmd: close parent process console file descriptors
2052187 init-rbdmap: Rewrite to use logger + clean-up
19be358 PG::actingset should be used when checking the number of acting OSDs for a given PG.
8253ead osdc/Objecter: use SafeTimer; make callbacks race-tolerant
6c37984 mailmap: Yehuda Sadeh name normalization
-beff616 ceph-disk: set guid if reusing a journal partition
+beff616f ceph-disk: set guid if reusing a journal partition
50e8040 tools: rados put /dev/null should write() and not create()
0b0a373 mailmap: update email address
188370a doc/release-notes: fix attributions for 8702 fix
2e3302c doc: Updated the example configuration.
5a31df2 doc: Updated doc for more recent versions.
2eab1c1 Update RBD doc for OpenStack
-a290d34 test_librbd_fsx: fix sign-compare gcc warning
+a290d349 test_librbd_fsx: fix sign-compare gcc warning
40c48bc qa: add script to test krbd setting ro/rw ioctl
b2542f8 rgw: set a default data extra pool name
94c8f70 doc: Made mention of "incomplete" status.
cfc1f2e rgw: modify RGWBucketInfo::creation_time encoding / decoding
4089001 rgw: Fix return value for swift user not found
c73a24b rgw: end-marker serves as last value, not as upperbound
-c414030 rgw: relax marker requirements for log apis
+c4140303 rgw: relax marker requirements for log apis
b21a41a rgw: update log APIs to use markers
ce7d816 rgw: cls_log_entry has id field
064e92f Makefile.am: some more makefile rules fixes
5f3ef77 mon: make pool snap creation ops idempotent
53aa959 objecter: return ENOENT/EEXIST on pool snap delete/create
507f99e librados: make snap create/destroy handle client-side errors
-3715d20 mon: check for invalid pool snap creates in preprocess_op, too
+3715d205 mon: check for invalid pool snap creates in preprocess_op, too
640e5fd qa: simple tests for 'ceph osd create|rm' commands
6f7837a mon: make 'osd rm ...' idempotent
4788567 qa: simple test for pool create/delete commands
e07b956 rgw: implement some missing swift api, other swift fixes
5790a36 rgw: encoded swift key contains full user name
f883e63 rgw: multiple swift keys per user
-9b18e55 PG,OSD: clean up xlist::items on pg removal
+9b18e55e PG,OSD: clean up xlist::items on pg removal
b43b864 osd: fix race between op requeueing and _dispatch
f7ec9f7 thread: whitespace
fc3aac6 filestore: clean up error output
3a3ccd8 kclient: checkpatch cleanups
522f570 mds: fix default layout settings
38dbaa5 (tag: v0.16) v0.16
-e678fbc msgr: authorizer get/verify callbacks
+e678fbc1 msgr: authorizer get/verify callbacks
faa5fb5 msgr: get_authorizer hook?
56f45b4 objecter: Session type
8b04f9a auth: authorizer for osd
%bcond_with ocf
%bcond_without cephfs_java
+# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315
+%bcond_without lttng
+%endif
+
%if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
%{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
-# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
-%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
-%global _with_lttng 1
-%endif
-
Name: ceph
-Version: 0.94.6
+Version: 0.94.9
Release: 0%{?dist}
Epoch: 1
Summary: User space components of the Ceph file system
BuildRequires: mozilla-nss-devel
BuildRequires: keyutils-devel
BuildRequires: libatomic-ops-devel
-%else
+Requires: lsb-release
+BuildRequires: lsb-release
+%endif
+%if 0%{?fedora} || 0%{?rhel}
Requires: gdisk
BuildRequires: nss-devel
BuildRequires: keyutils-libs-devel
Requires(preun):initscripts
BuildRequires: gperftools-devel
Requires: python-flask
+Requires: redhat-lsb-core
+BuildRequires: redhat-lsb-core
%endif
# lttng and babeltrace for rbd-replay-prep
-%if 0%{?_with_lttng}
+%if %{with lttng}
%if 0%{?fedora} || 0%{?rhel}
BuildRequires: lttng-ust-devel
BuildRequires: libbabeltrace-devel
%endif
./autogen.sh
-MY_CONF_OPT=""
+MY_CONF_OPT="$CEPH_EXTRA_CONFIGURE_ARGS"
MY_CONF_OPT="$MY_CONF_OPT --with-radosgw"
%endif
--with-librocksdb-static=check \
$MY_CONF_OPT \
+%if %{without lttng}
+ --without-lttng \
+ --without-babeltrace \
+%endif
%{?_with_ocf} \
CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
%{_libdir}/rados-classes/libcls_version.so*
%dir %{_libdir}/ceph/erasure-code
%{_libdir}/ceph/erasure-code/libec_*.so*
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/libos_tp.so*
%{_libdir}/libosd_tp.so*
%endif
%{_bindir}/rbd
%{_bindir}/rbd-replay
%{_bindir}/rbd-replay-many
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_bindir}/rbd-replay-prep
%endif
%{_bindir}/ceph-post-file
%files -n librados2
%defattr(-,root,root,-)
%{_libdir}/librados.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librados_tp.so.*
%endif
%{_includedir}/rados/rados_types.hpp
%{_includedir}/rados/memory.h
%{_libdir}/librados.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librados_tp.so
%endif
%files -n librbd1
%defattr(-,root,root,-)
%{_libdir}/librbd.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librbd_tp.so.*
%endif
%{_includedir}/rbd/librbd.hpp
%{_includedir}/rbd/features.h
%{_libdir}/librbd.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librbd_tp.so
%endif
%bcond_with ocf
%bcond_without cephfs_java
+# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315
+%bcond_without lttng
+%endif
+
%if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
%{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
-# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
-%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
-%global _with_lttng 1
-%endif
-
Name: ceph
Version: @VERSION@
Release: @RPM_RELEASE@%{?dist}
BuildRequires: mozilla-nss-devel
BuildRequires: keyutils-devel
BuildRequires: libatomic-ops-devel
-%else
+Requires: lsb-release
+BuildRequires: lsb-release
+%endif
+%if 0%{?fedora} || 0%{?rhel}
Requires: gdisk
BuildRequires: nss-devel
BuildRequires: keyutils-libs-devel
Requires(preun):initscripts
BuildRequires: gperftools-devel
Requires: python-flask
+Requires: redhat-lsb-core
+BuildRequires: redhat-lsb-core
%endif
# lttng and babeltrace for rbd-replay-prep
-%if 0%{?_with_lttng}
+%if %{with lttng}
%if 0%{?fedora} || 0%{?rhel}
BuildRequires: lttng-ust-devel
BuildRequires: libbabeltrace-devel
%endif
./autogen.sh
-MY_CONF_OPT=""
+MY_CONF_OPT="$CEPH_EXTRA_CONFIGURE_ARGS"
MY_CONF_OPT="$MY_CONF_OPT --with-radosgw"
%endif
--with-librocksdb-static=check \
$MY_CONF_OPT \
+%if %{without lttng}
+ --without-lttng \
+ --without-babeltrace \
+%endif
%{?_with_ocf} \
CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
%{_libdir}/rados-classes/libcls_version.so*
%dir %{_libdir}/ceph/erasure-code
%{_libdir}/ceph/erasure-code/libec_*.so*
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/libos_tp.so*
%{_libdir}/libosd_tp.so*
%endif
%{_bindir}/rbd
%{_bindir}/rbd-replay
%{_bindir}/rbd-replay-many
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_bindir}/rbd-replay-prep
%endif
%{_bindir}/ceph-post-file
%files -n librados2
%defattr(-,root,root,-)
%{_libdir}/librados.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librados_tp.so.*
%endif
%{_includedir}/rados/rados_types.hpp
%{_includedir}/rados/memory.h
%{_libdir}/librados.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librados_tp.so
%endif
%files -n librbd1
%defattr(-,root,root,-)
%{_libdir}/librbd.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librbd_tp.so.*
%endif
%{_includedir}/rbd/librbd.hpp
%{_includedir}/rbd/features.h
%{_libdir}/librbd.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librbd_tp.so
%endif
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ceph 0.94.6.
+# Generated by GNU Autoconf 2.69 for ceph 0.94.9.
#
# Report bugs to <ceph-devel@vger.kernel.org>.
#
# Identity of this package.
PACKAGE_NAME='ceph'
PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='0.94.6'
-PACKAGE_STRING='ceph 0.94.6'
+PACKAGE_VERSION='0.94.9'
+PACKAGE_STRING='ceph 0.94.9'
PACKAGE_BUGREPORT='ceph-devel@vger.kernel.org'
PACKAGE_URL=''
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures ceph 0.94.6 to adapt to many kinds of systems.
+\`configure' configures ceph 0.94.9 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of ceph 0.94.6:";;
+ short | recursive ) echo "Configuration of ceph 0.94.9:";;
esac
cat <<\_ACEOF
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-ceph configure 0.94.6
+ceph configure 0.94.9
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by ceph $as_me 0.94.6, which was
+It was created by ceph $as_me 0.94.9, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
# Define the identity of the package.
PACKAGE='ceph'
- VERSION='0.94.6'
+ VERSION='0.94.9'
cat >>confdefs.h <<_ACEOF
# Define the identity of the package.
PACKAGE='ceph'
- VERSION='0.94.6'
+ VERSION='0.94.9'
cat >>confdefs.h <<_ACEOF
;;
linux*)
linux="yes"
+ CFLAGS="-D_LARGEFILE64_SOURCE ${CFLAGS}"
;;
freebsd*)
freebsd="yes"
JAVA_TEST=Test.java
CLASS_TEST=Test.class
cat << \EOF > $JAVA_TEST
-/* #line 20031 "configure" */
+/* #line 20032 "configure" */
public class Test {
}
EOF
fi
+# getgrouplist
+for ac_func in getgrouplist
+do :
+ ac_fn_c_check_func "$LINENO" "getgrouplist" "ac_cv_func_getgrouplist"
+if test "x$ac_cv_func_getgrouplist" = xyes; then :
+ cat >>confdefs.h <<_ACEOF
+#define HAVE_GETGROUPLIST 1
+_ACEOF
+
+fi
+done
+
+
#
# Test for time-related `struct stat` members.
#
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by ceph $as_me 0.94.6, which was
+This file was extended by ceph $as_me 0.94.9, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-ceph config.status 0.94.6
+ceph config.status 0.94.9
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
# VERSION define is not used by the code. It gets a version string
# from 'git describe'; see src/ceph_ver.[ch]
-AC_INIT([ceph], [0.94.6], [ceph-devel@vger.kernel.org])
+AC_INIT([ceph], [0.94.9], [ceph-devel@vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
RPM_RELEASE=0
;;
linux*)
linux="yes"
+ CFLAGS="-D_LARGEFILE64_SOURCE ${CFLAGS}"
;;
freebsd*)
freebsd="yes"
[AC_DEFINE([CEPH_HAVE_FALLOCATE], [], [fallocate(2) is supported])],
[])
+# getgrouplist
+AC_CHECK_FUNCS([getgrouplist])
+
#
# Test for time-related `struct stat` members.
#
.SS activate\-all
.sp
Activate all tagged OSD partitions. \fBactivate\-all\fP relies on
-\fB/dev/disk/by\-parttype\-uuid/$typeuuid.$uuid\fP to find all partitions. Special
+\fB/dev/disk/by\-parttypeuuid/$typeuuid.$uuid\fP to find all partitions. Special
\fBudev\fP rules are installed to create these links. It is triggered on ceph
service start or run directly.
.sp
-e832001feaf8c176593e0325c8298e3f16dfb403
-v0.94.6
+fe6d859066244b97b24f09d46552afc2071e6f90
+v0.94.9
@ENABLE_CLIENT_TRUE@ client/SyntheticClient.h \
@ENABLE_CLIENT_TRUE@ client/Trace.h \
@ENABLE_CLIENT_TRUE@ client/ioctl.h \
-@ENABLE_CLIENT_TRUE@ client/ObjecterWriteback.h
+@ENABLE_CLIENT_TRUE@ client/ObjecterWriteback.h \
+@ENABLE_CLIENT_TRUE@ client/InodeRef.h
@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@am__append_61 = libclient_fuse.la
@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@am__append_62 = client/fuse_ll.h
client/Dir.h client/Fh.h client/Inode.h client/MetaRequest.h \
client/MetaSession.h client/ClientSnapRealm.h \
client/SyntheticClient.h client/Trace.h client/ioctl.h \
- client/ObjecterWriteback.h client/fuse_ll.h global/pidfile.h \
- global/global_init.h global/global_context.h \
+ client/ObjecterWriteback.h client/InodeRef.h client/fuse_ll.h \
+ global/pidfile.h global/global_init.h global/global_context.h \
global/signal_handler.h json_spirit/json_spirit.h \
json_spirit/json_spirit_error_position.h \
json_spirit/json_spirit_reader.h \
test/mon/osd-pool-create.sh test/mon/misc.sh \
test/mon/osd-crush.sh test/mon/osd-erasure-code-profile.sh \
test/mon/mkfs.sh test/osd/osd-scrub-repair.sh \
- test/osd/osd-config.sh test/osd/osd-bench.sh \
- test/osd/osd-copy-from.sh test/mon/mon-handle-forward.sh \
- $(am__append_181) $(am__append_182) \
- test/pybind/test_ceph_argparse.py
+ test/osd/osd-scrub-snaps.sh test/osd/osd-config.sh \
+ test/osd/osd-bench.sh test/osd/osd-copy-from.sh \
+ test/mon/mon-handle-forward.sh $(am__append_181) \
+ $(am__append_182) test/pybind/test_ceph_argparse.py
##################################
AM_COMMON_CPPFLAGS = \
--log-file $$b.log --trs-file $$b.trs \
$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/osd/osd-scrub-snaps.sh.log: test/osd/osd-scrub-snaps.sh
+ @p='test/osd/osd-scrub-snaps.sh'; \
+ b='test/osd/osd-scrub-snaps.sh'; \
+ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+ --log-file $$b.log --trs-file $$b.trs \
+ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+ "$$tst" $(AM_TESTS_FD_REDIRECT)
test/osd/osd-config.sh.log: test/osd/osd-config.sh
@p='test/osd/osd-config.sh'; \
b='test/osd/osd-config.sh'; \
/* Define to 1 if you have the `fuse_getgroups' function. */
#undef HAVE_FUSE_GETGROUPS
+/* Define to 1 if you have the `getgrouplist' function. */
+#undef HAVE_GETGROUPLIST
+
/* we have a recent yasm and are x86_64 */
#undef HAVE_GOOD_YASM_ELF64
return r;
}
+
// ---------------------------------------------------
-int CryptoNone::create(bufferptr& secret)
-{
- return 0;
-}
+class CryptoNoneKeyHandler : public CryptoKeyHandler {
+public:
+ int encrypt(const bufferlist& in,
+ bufferlist& out, std::string *error) const {
+ out = in;
+ return 0;
+ }
+ int decrypt(const bufferlist& in,
+ bufferlist& out, std::string *error) const {
+ out = in;
+ return 0;
+ }
+};
+
+class CryptoNone : public CryptoHandler {
+public:
+ CryptoNone() { }
+ ~CryptoNone() {}
+ int get_type() const {
+ return CEPH_CRYPTO_NONE;
+ }
+ int create(bufferptr& secret) {
+ return 0;
+ }
+ int validate_secret(const bufferptr& secret) {
+ return 0;
+ }
+ CryptoKeyHandler *get_key_handler(const bufferptr& secret, string& error) {
+ return new CryptoNoneKeyHandler;
+ }
+};
-int CryptoNone::validate_secret(bufferptr& secret)
-{
- return 0;
-}
-void CryptoNone::encrypt(const bufferptr& secret, const bufferlist& in,
- bufferlist& out, std::string &error) const
-{
- out = in;
-}
+// ---------------------------------------------------
-void CryptoNone::decrypt(const bufferptr& secret, const bufferlist& in,
- bufferlist& out, std::string &error) const
-{
- out = in;
-}
+class CryptoAES : public CryptoHandler {
+public:
+ CryptoAES() { }
+ ~CryptoAES() {}
+ int get_type() const {
+ return CEPH_CRYPTO_AES;
+ }
+ int create(bufferptr& secret);
+ int validate_secret(const bufferptr& secret);
+ CryptoKeyHandler *get_key_handler(const bufferptr& secret, string& error);
+};
-// ---------------------------------------------------
#ifdef USE_CRYPTOPP
# define AES_KEY_LEN ((size_t)CryptoPP::AES::DEFAULT_KEYLENGTH)
# define AES_BLOCK_LEN ((size_t)CryptoPP::AES::BLOCKSIZE)
-#elif USE_NSS
-// when we say AES, we mean AES-128
-# define AES_KEY_LEN 16
-# define AES_BLOCK_LEN 16
-static void nss_aes_operation(CK_ATTRIBUTE_TYPE op, const bufferptr& secret,
- const bufferlist& in, bufferlist& out, std::string &error)
-{
- const CK_MECHANISM_TYPE mechanism = CKM_AES_CBC_PAD;
+class CryptoAESKeyHandler : public CryptoKeyHandler {
+public:
+ CryptoPP::AES::Encryption *enc_key;
+ CryptoPP::AES::Decryption *dec_key;
+
+ CryptoAESKeyHandler()
+ : enc_key(NULL),
+ dec_key(NULL) {}
+ ~CryptoAESKeyHandler() {
+ delete enc_key;
+ delete dec_key;
+ }
- // sample source said this has to be at least size of input + 8,
- // but i see 15 still fail with SEC_ERROR_OUTPUT_LEN
- bufferptr out_tmp(in.length()+16);
+ int init(const bufferptr& s, ostringstream& err) {
+ secret = s;
- PK11SlotInfo *slot;
+ enc_key = new CryptoPP::AES::Encryption(
+ (byte*)secret.c_str(), CryptoPP::AES::DEFAULT_KEYLENGTH);
+ dec_key = new CryptoPP::AES::Decryption(
+ (byte*)secret.c_str(), CryptoPP::AES::DEFAULT_KEYLENGTH);
- slot = PK11_GetBestSlot(mechanism, NULL);
- if (!slot) {
- ostringstream oss;
- oss << "cannot find NSS slot to use: " << PR_GetError();
- error = oss.str();
- goto err;
+ return 0;
}
- SECItem keyItem;
-
- keyItem.type = siBuffer;
- keyItem.data = (unsigned char*)secret.c_str();
- keyItem.len = secret.length();
-
- PK11SymKey *key;
+ int encrypt(const bufferlist& in,
+ bufferlist& out, std::string *error) const {
+ string ciphertext;
+ CryptoPP::StringSink *sink = new CryptoPP::StringSink(ciphertext);
+ CryptoPP::CBC_Mode_ExternalCipher::Encryption cbc(
+ *enc_key, (const byte*)CEPH_AES_IV);
+ CryptoPP::StreamTransformationFilter stfEncryptor(cbc, sink);
- key = PK11_ImportSymKey(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT,
- &keyItem, NULL);
- if (!key) {
- ostringstream oss;
- oss << "cannot convert AES key for NSS: " << PR_GetError();
- error = oss.str();
- goto err_slot;
+ for (std::list<bufferptr>::const_iterator it = in.buffers().begin();
+ it != in.buffers().end(); ++it) {
+ const unsigned char *in_buf = (const unsigned char *)it->c_str();
+ stfEncryptor.Put(in_buf, it->length());
+ }
+ try {
+ stfEncryptor.MessageEnd();
+ } catch (CryptoPP::Exception& e) {
+ if (error) {
+ ostringstream oss;
+ oss << "encryptor.MessageEnd::Exception: " << e.GetWhat();
+ *error = oss.str();
+ }
+ return -1;
+ }
+ out.append((const char *)ciphertext.c_str(), ciphertext.length());
+ return 0;
}
- SECItem ivItem;
-
- ivItem.type = siBuffer;
- // losing constness due to SECItem.data; IV should never be
- // modified, regardless
- ivItem.data = (unsigned char*)CEPH_AES_IV;
- ivItem.len = sizeof(CEPH_AES_IV);
+ int decrypt(const bufferlist& in,
+ bufferlist& out, std::string *error) const {
+ string decryptedtext;
+ CryptoPP::StringSink *sink = new CryptoPP::StringSink(decryptedtext);
+ CryptoPP::CBC_Mode_ExternalCipher::Decryption cbc(
+ *dec_key, (const byte*)CEPH_AES_IV );
+ CryptoPP::StreamTransformationFilter stfDecryptor(cbc, sink);
+ for (std::list<bufferptr>::const_iterator it = in.buffers().begin();
+ it != in.buffers().end(); ++it) {
+ const unsigned char *in_buf = (const unsigned char *)it->c_str();
+ stfDecryptor.Put(in_buf, it->length());
+ }
- SECItem *param;
+ try {
+ stfDecryptor.MessageEnd();
+ } catch (CryptoPP::Exception& e) {
+ if (error) {
+ ostringstream oss;
+ oss << "decryptor.MessageEnd::Exception: " << e.GetWhat();
+ *error = oss.str();
+ }
+ return -1;
+ }
- param = PK11_ParamFromIV(mechanism, &ivItem);
- if (!param) {
- ostringstream oss;
- oss << "cannot set NSS IV param: " << PR_GetError();
- error = oss.str();
- goto err_key;
+ out.append((const char *)decryptedtext.c_str(), decryptedtext.length());
+ return 0;
}
+};
- PK11Context *ctx;
+#elif USE_NSS
+// when we say AES, we mean AES-128
+# define AES_KEY_LEN 16
+# define AES_BLOCK_LEN 16
- ctx = PK11_CreateContextBySymKey(mechanism, op, key, param);
- if (!ctx) {
- ostringstream oss;
- oss << "cannot create NSS context: " << PR_GetError();
- error = oss.str();
- goto err_param;
- }
+static int nss_aes_operation(CK_ATTRIBUTE_TYPE op,
+ CK_MECHANISM_TYPE mechanism,
+ PK11SymKey *key,
+ SECItem *param,
+ const bufferlist& in, bufferlist& out,
+ std::string *error)
+{
+ // sample source said this has to be at least size of input + 8,
+ // but i see 15 still fail with SEC_ERROR_OUTPUT_LEN
+ bufferptr out_tmp(in.length()+16);
+ bufferlist incopy;
SECStatus ret;
int written;
- // in is const, and PK11_CipherOp is not; C++ makes this hard to cheat,
- // so just copy it to a temp buffer, at least for now
- unsigned in_len;
unsigned char *in_buf;
- in_len = in.length();
- in_buf = (unsigned char*)malloc(in_len);
- if (!in_buf)
- throw std::bad_alloc();
- in.copy(0, in_len, (char*)in_buf);
- ret = PK11_CipherOp(ctx, (unsigned char*)out_tmp.c_str(), &written, out_tmp.length(),
+
+ PK11Context *ectx;
+ ectx = PK11_CreateContextBySymKey(mechanism, op, key, param);
+ assert(ectx);
+
+ incopy = in; // it's a shallow copy!
+ in_buf = (unsigned char*)incopy.c_str();
+ ret = PK11_CipherOp(ectx,
+ (unsigned char*)out_tmp.c_str(), &written, out_tmp.length(),
in_buf, in.length());
- free(in_buf);
if (ret != SECSuccess) {
- ostringstream oss;
- oss << "NSS AES failed: " << PR_GetError();
- error = oss.str();
- goto err_op;
+ PK11_DestroyContext(ectx, PR_TRUE);
+ if (error) {
+ ostringstream oss;
+ oss << "NSS AES failed: " << PR_GetError();
+ *error = oss.str();
+ }
+ return -1;
}
unsigned int written2;
- ret = PK11_DigestFinal(ctx, (unsigned char*)out_tmp.c_str()+written, &written2,
+ ret = PK11_DigestFinal(ectx,
+ (unsigned char*)out_tmp.c_str()+written, &written2,
out_tmp.length()-written);
+ PK11_DestroyContext(ectx, PR_TRUE);
if (ret != SECSuccess) {
- ostringstream oss;
- oss << "NSS AES final round failed: " << PR_GetError();
- error = oss.str();
- goto err_op;
+ if (error) {
+ ostringstream oss;
+ oss << "NSS AES final round failed: " << PR_GetError();
+ *error = oss.str();
+ }
+ return -1;
}
out_tmp.set_length(written + written2);
out.append(out_tmp);
-
- PK11_DestroyContext(ctx, PR_TRUE);
- SECITEM_FreeItem(param, PR_TRUE);
- PK11_FreeSymKey(key);
- PK11_FreeSlot(slot);
- return;
-
- err_op:
- PK11_DestroyContext(ctx, PR_TRUE);
- err_param:
- SECITEM_FreeItem(param, PR_TRUE);
- err_key:
- PK11_FreeSymKey(key);
- err_slot:
- PK11_FreeSlot(slot);
- err:
- ;
+ return 0;
}
+class CryptoAESKeyHandler : public CryptoKeyHandler {
+ CK_MECHANISM_TYPE mechanism;
+ PK11SlotInfo *slot;
+ PK11SymKey *key;
+ SECItem *param;
+
+public:
+ CryptoAESKeyHandler()
+ : mechanism(CKM_AES_CBC_PAD),
+ slot(NULL),
+ key(NULL),
+ param(NULL) {}
+ ~CryptoAESKeyHandler() {
+ SECITEM_FreeItem(param, PR_TRUE);
+ PK11_FreeSymKey(key);
+ PK11_FreeSlot(slot);
+ }
+
+ int init(const bufferptr& s, ostringstream& err) {
+ secret = s;
+
+ slot = PK11_GetBestSlot(mechanism, NULL);
+ if (!slot) {
+ err << "cannot find NSS slot to use: " << PR_GetError();
+ return -1;
+ }
+
+ SECItem keyItem;
+ keyItem.type = siBuffer;
+ keyItem.data = (unsigned char*)secret.c_str();
+ keyItem.len = secret.length();
+ key = PK11_ImportSymKey(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT,
+ &keyItem, NULL);
+ if (!key) {
+ err << "cannot convert AES key for NSS: " << PR_GetError();
+ return -1;
+ }
+
+ SECItem ivItem;
+ ivItem.type = siBuffer;
+ // losing constness due to SECItem.data; IV should never be
+ // modified, regardless
+ ivItem.data = (unsigned char*)CEPH_AES_IV;
+ ivItem.len = sizeof(CEPH_AES_IV);
+
+ param = PK11_ParamFromIV(mechanism, &ivItem);
+ if (!param) {
+ err << "cannot set NSS IV param: " << PR_GetError();
+ return -1;
+ }
+
+ return 0;
+ }
+
+ int encrypt(const bufferlist& in,
+ bufferlist& out, std::string *error) const {
+ return nss_aes_operation(CKA_ENCRYPT, mechanism, key, param, in, out, error);
+ }
+ int decrypt(const bufferlist& in,
+ bufferlist& out, std::string *error) const {
+ return nss_aes_operation(CKA_DECRYPT, mechanism, key, param, in, out, error);
+ }
+};
+
#else
# error "No supported crypto implementation found."
#endif
+
+
+// ------------------------------------------------------------
+
int CryptoAES::create(bufferptr& secret)
{
bufferlist bl;
return 0;
}
-int CryptoAES::validate_secret(bufferptr& secret)
+int CryptoAES::validate_secret(const bufferptr& secret)
{
if (secret.length() < (size_t)AES_KEY_LEN) {
return -EINVAL;
return 0;
}
-void CryptoAES::encrypt(const bufferptr& secret, const bufferlist& in, bufferlist& out,
- std::string &error) const
+CryptoKeyHandler *CryptoAES::get_key_handler(const bufferptr& secret,
+ string& error)
{
- if (secret.length() < AES_KEY_LEN) {
- error = "key is too short";
- return;
- }
-#ifdef USE_CRYPTOPP
- {
- const unsigned char *key = (const unsigned char *)secret.c_str();
-
- string ciphertext;
- CryptoPP::AES::Encryption aesEncryption(key, CryptoPP::AES::DEFAULT_KEYLENGTH);
- CryptoPP::CBC_Mode_ExternalCipher::Encryption cbcEncryption( aesEncryption, (const byte*)CEPH_AES_IV );
- CryptoPP::StringSink *sink = new CryptoPP::StringSink(ciphertext);
- CryptoPP::StreamTransformationFilter stfEncryptor(cbcEncryption, sink);
-
- for (std::list<bufferptr>::const_iterator it = in.buffers().begin();
- it != in.buffers().end(); ++it) {
- const unsigned char *in_buf = (const unsigned char *)it->c_str();
- stfEncryptor.Put(in_buf, it->length());
- }
- try {
- stfEncryptor.MessageEnd();
- } catch (CryptoPP::Exception& e) {
- ostringstream oss;
- oss << "encryptor.MessageEnd::Exception: " << e.GetWhat();
- error = oss.str();
- return;
- }
- out.append((const char *)ciphertext.c_str(), ciphertext.length());
+ CryptoAESKeyHandler *ckh = new CryptoAESKeyHandler;
+ ostringstream oss;
+ if (ckh->init(secret, oss) < 0) {
+ error = oss.str();
+ return NULL;
}
-#elif USE_NSS
- nss_aes_operation(CKA_ENCRYPT, secret, in, out, error);
-#else
-# error "No supported crypto implementation found."
-#endif
+ return ckh;
}
-void CryptoAES::decrypt(const bufferptr& secret, const bufferlist& in,
- bufferlist& out, std::string &error) const
-{
-#ifdef USE_CRYPTOPP
- const unsigned char *key = (const unsigned char *)secret.c_str();
- CryptoPP::AES::Decryption aesDecryption(key, CryptoPP::AES::DEFAULT_KEYLENGTH);
- CryptoPP::CBC_Mode_ExternalCipher::Decryption cbcDecryption( aesDecryption, (const byte*)CEPH_AES_IV );
- string decryptedtext;
- CryptoPP::StringSink *sink = new CryptoPP::StringSink(decryptedtext);
- CryptoPP::StreamTransformationFilter stfDecryptor(cbcDecryption, sink);
- for (std::list<bufferptr>::const_iterator it = in.buffers().begin();
- it != in.buffers().end(); ++it) {
- const unsigned char *in_buf = (const unsigned char *)it->c_str();
- stfDecryptor.Put(in_buf, it->length());
- }
- try {
- stfDecryptor.MessageEnd();
- } catch (CryptoPP::Exception& e) {
- ostringstream oss;
- oss << "decryptor.MessageEnd::Exception: " << e.GetWhat();
- error = oss.str();
- return;
- }
-
- out.append((const char *)decryptedtext.c_str(), decryptedtext.length());
-#elif USE_NSS
- nss_aes_operation(CKA_DECRYPT, secret, in, out, error);
-#else
-# error "No supported crypto implementation found."
-#endif
-}
+// --
// ---------------------------------------------------
-int CryptoKey::set_secret(CephContext *cct, int type, bufferptr& s)
-{
- this->type = type;
- created = ceph_clock_now(cct);
- CryptoHandler *h = cct->get_crypto_handler(type);
- if (!h) {
- lderr(cct) << "ERROR: cct->get_crypto_handler(type=" << type << ") returned NULL" << dendl;
- return -EOPNOTSUPP;
- }
- int ret = h->validate_secret(s);
-
- if (ret < 0)
- return ret;
+void CryptoKey::encode(bufferlist& bl) const
+{
+ ::encode(type, bl);
+ ::encode(created, bl);
+ __u16 len = secret.length();
+ ::encode(len, bl);
+ bl.append(secret);
+}
- secret = s;
+void CryptoKey::decode(bufferlist::iterator& bl)
+{
+ ::decode(type, bl);
+ ::decode(created, bl);
+ __u16 len;
+ ::decode(len, bl);
+ bufferptr tmp;
+ bl.copy(len, tmp);
+ if (_set_secret(type, tmp) < 0)
+ throw buffer::malformed_input("malformed secret");
+}
+int CryptoKey::set_secret(int type, const bufferptr& s, utime_t c)
+{
+ int r = _set_secret(type, s);
+ if (r < 0)
+ return r;
+ this->created = c;
return 0;
}
-int CryptoKey::create(CephContext *cct, int t)
+int CryptoKey::_set_secret(int t, const bufferptr& s)
{
- type = t;
- created = ceph_clock_now(cct);
-
- CryptoHandler *h = cct->get_crypto_handler(type);
- if (!h) {
- lderr(cct) << "ERROR: cct->get_crypto_handler(type=" << type << ") returned NULL" << dendl;
- return -EOPNOTSUPP;
+ if (s.length() == 0) {
+ secret = s;
+ ckh.reset();
+ return 0;
}
- return h->create(secret);
-}
-void CryptoKey::encrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const
-{
- if (!ch || ch->get_type() != type) {
- ch = cct->get_crypto_handler(type);
- if (!ch) {
- ostringstream oss;
- oss << "CryptoKey::encrypt: key type " << type << " not supported.";
- return;
+ CryptoHandler *ch = CryptoHandler::create(t);
+ if (ch) {
+ int ret = ch->validate_secret(s);
+ if (ret < 0) {
+ delete ch;
+ return ret;
+ }
+ string error;
+ ckh.reset(ch->get_key_handler(s, error));
+ delete ch;
+ if (error.length()) {
+ return -EIO;
}
+ } else {
+ return -EOPNOTSUPP;
}
- ch->encrypt(this->secret, in, out, error);
+ type = t;
+ secret = s;
+ return 0;
}
-void CryptoKey::decrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const
+int CryptoKey::create(CephContext *cct, int t)
{
- if (!ch || ch->get_type() != type) {
- ch = cct->get_crypto_handler(type);
- if (!ch) {
- ostringstream oss;
- oss << "CryptoKey::decrypt: key type " << type << " not supported.";
- return;
- }
+ CryptoHandler *ch = CryptoHandler::create(t);
+ if (!ch) {
+ if (cct)
+      lderr(cct) << "ERROR: CryptoHandler::create(type=" << t << ") returned NULL" << dendl;
+ return -EOPNOTSUPP;
}
- ch->decrypt(this->secret, in, out, error);
+ bufferptr s;
+ int r = ch->create(s);
+ delete ch;
+ if (r < 0)
+ return r;
+
+ r = _set_secret(t, s);
+ if (r < 0)
+ return r;
+ created = ceph_clock_now(cct);
+ return r;
}
void CryptoKey::print(std::ostream &out) const
{
bl.append(encode_base64());
}
+
+
+// ------------------
+
+CryptoHandler *CryptoHandler::create(int type)
+{
+ switch (type) {
+ case CEPH_CRYPTO_NONE:
+ return new CryptoNone;
+ case CEPH_CRYPTO_AES:
+ return new CryptoAES;
+ default:
+ return NULL;
+ }
+}
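
For context, a minimal sketch of how a caller might exercise the refactored interface above; the CephContext cct and the buffer contents are illustrative, and error handling is elided:

    // Sketch only -- assumes the CryptoKey/CryptoHandler API from this patch.
    CryptoKey key;
    int r = key.create(cct, CEPH_CRYPTO_AES); // generates a secret and caches
                                              // a CryptoKeyHandler in ckh
    assert(r == 0);

    bufferlist in, out, back;
    in.append("attack at dawn");
    std::string error;
    r = key.encrypt(cct, in, out, &error);    // delegates to ckh->encrypt()
    assert(r == 0 && error.empty());
    r = key.decrypt(cct, out, back, &error);  // delegates to ckh->decrypt()
    assert(r == 0 && back.contents_equal(in));
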
#include "include/types.h"
#include "include/utime.h"
+#include "include/memory.h"
#include "common/Formatter.h"
#include "include/buffer.h"
class CephContext;
class CryptoHandler;
+class CryptoKeyContext;
+
+/*
+ * some per-key context that is specific to a particular crypto backend
+ */
+class CryptoKeyHandler {
+public:
+ bufferptr secret;
+
+ virtual ~CryptoKeyHandler() {}
+
+ virtual int encrypt(const bufferlist& in,
+ bufferlist& out, std::string *error) const = 0;
+ virtual int decrypt(const bufferlist& in,
+ bufferlist& out, std::string *error) const = 0;
+};
/*
* match encoding of struct ceph_secret
protected:
__u16 type;
utime_t created;
- bufferptr secret;
+ bufferptr secret; // must set this via set_secret()!
- // cache a pointer to the handler, so we don't have to look it up
- // for each crypto operation
- mutable CryptoHandler *ch;
+ // cache a pointer to the implementation-specific key handler, so we
+ // don't have to create it for every crypto operation.
+ mutable ceph::shared_ptr<CryptoKeyHandler> ckh;
+
+ int _set_secret(int type, const bufferptr& s);
public:
- CryptoKey() : type(0), ch(NULL) { }
- CryptoKey(int t, utime_t c, bufferptr& s) : type(t), created(c), secret(s), ch(NULL) { }
-
- void encode(bufferlist& bl) const {
- ::encode(type, bl);
- ::encode(created, bl);
- __u16 len = secret.length();
- ::encode(len, bl);
- bl.append(secret);
+ CryptoKey() : type(0) { }
+ CryptoKey(int t, utime_t c, bufferptr& s)
+ : created(c) {
+ _set_secret(t, s);
}
- void decode(bufferlist::iterator& bl) {
- ::decode(type, bl);
- ::decode(created, bl);
- __u16 len;
- ::decode(len, bl);
- bl.copy(len, secret);
- secret.c_str(); // make sure it's a single buffer!
+ ~CryptoKey() {
}
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+
int get_type() const { return type; }
utime_t get_created() const { return created; }
void print(std::ostream& out) const;
- int set_secret(CephContext *cct, int type, bufferptr& s);
- bufferptr& get_secret() { return secret; }
+ int set_secret(int type, const bufferptr& s, utime_t created);
+ const bufferptr& get_secret() { return secret; }
const bufferptr& get_secret() const { return secret; }
void encode_base64(string& s) const {
// --
int create(CephContext *cct, int type);
- void encrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const;
- void decrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const;
+ int encrypt(CephContext *cct, const bufferlist& in, bufferlist& out,
+ std::string *error) const {
+ return ckh->encrypt(in, out, error);
+ }
+ int decrypt(CephContext *cct, const bufferlist& in, bufferlist& out,
+ std::string *error) const {
+ return ckh->decrypt(in, out, error);
+ }
void to_str(std::string& s) const;
};
virtual ~CryptoHandler() {}
virtual int get_type() const = 0;
virtual int create(bufferptr& secret) = 0;
- virtual int validate_secret(bufferptr& secret) = 0;
- virtual void encrypt(const bufferptr& secret, const bufferlist& in,
- bufferlist& out, std::string &error) const = 0;
- virtual void decrypt(const bufferptr& secret, const bufferlist& in,
- bufferlist& out, std::string &error) const = 0;
+ virtual int validate_secret(const bufferptr& secret) = 0;
+ virtual CryptoKeyHandler *get_key_handler(const bufferptr& secret,
+ string& error) = 0;
+
+ static CryptoHandler *create(int type);
};
extern int get_random_bytes(char *buf, int len);
extern uint64_t get_random(uint64_t min_val, uint64_t max_val);
-class CryptoNone : public CryptoHandler {
-public:
- CryptoNone() { }
- ~CryptoNone() {}
- int get_type() const {
- return CEPH_CRYPTO_NONE;
- }
- int create(bufferptr& secret);
- int validate_secret(bufferptr& secret);
- void encrypt(const bufferptr& secret, const bufferlist& in,
- bufferlist& out, std::string &error) const;
- void decrypt(const bufferptr& secret, const bufferlist& in,
- bufferlist& out, std::string &error) const;
-};
-
-class CryptoAES : public CryptoHandler {
-public:
- CryptoAES() { }
- ~CryptoAES() {}
- int get_type() const {
- return CEPH_CRYPTO_AES;
- }
- int create(bufferptr& secret);
- int validate_secret(bufferptr& secret);
- void encrypt(const bufferptr& secret, const bufferlist& in,
- bufferlist& out, std::string &error) const;
- void decrypt(const bufferptr& secret, const bufferlist& in,
- bufferlist& out, std::string &error) const;
-};
-
#endif
if (crypto->create(bp) < 0)
return false;
- secret.set_secret(cct, CEPH_CRYPTO_AES, bp);
+ secret.set_secret(CEPH_CRYPTO_AES, bp, ceph_clock_now(NULL));
return true;
}
uint64_t magic;
bufferlist bl;
- key.decrypt(cct, bl_enc, bl, error);
- if (!error.empty())
+ if (key.decrypt(cct, bl_enc, bl, &error) < 0)
return;
bufferlist::iterator iter2 = bl.begin();
::encode(magic, bl);
::encode(t, bl);
- key.encrypt(cct, bl, out, error);
+ key.encrypt(cct, bl, out, &error);
}
template <typename T>
#define dout_subsys ceph_subsys_auth
+int CephxSessionHandler::_calc_signature(Message *m, uint64_t *psig)
+{
+ const ceph_msg_header& header = m->get_header();
+ const ceph_msg_footer& footer = m->get_footer();
+
+ // optimized signature calculation
+ // - avoid temporary allocated buffers from encode_encrypt[_enc_bl]
+ // - skip the leading 4 byte wrapper from encode_encrypt
+ struct {
+ __u8 v;
+ __le64 magic;
+ __le32 len;
+ __le32 header_crc;
+ __le32 front_crc;
+ __le32 middle_crc;
+ __le32 data_crc;
+ } __attribute__ ((packed)) sigblock = {
+ 1, AUTH_ENC_MAGIC, 4*4,
+ header.crc, footer.front_crc, footer.middle_crc, footer.data_crc
+ };
+ bufferlist bl_plaintext;
+ bl_plaintext.append(buffer::create_static(sizeof(sigblock), (char*)&sigblock));
+
+ bufferlist bl_ciphertext;
+ if (key.encrypt(cct, bl_plaintext, bl_ciphertext, NULL) < 0) {
+ lderr(cct) << __func__ << " failed to encrypt signature block" << dendl;
+ return -1;
+ }
+
+ bufferlist::iterator ci = bl_ciphertext.begin();
+ ::decode(*psig, ci);
+
+ ldout(cct, 10) << __func__ << " seq " << m->get_seq()
+		 << " front_crc = " << footer.front_crc
+ << " middle_crc = " << footer.middle_crc
+ << " data_crc = " << footer.data_crc
+ << " sig = " << *psig
+ << dendl;
+ return 0;
+}
+
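The packed block above is 1 (version) + 8 (magic) + 4 (length) + 4*4 (the four crc32c values) = 29 bytes. A compile-time mirror of that layout, as a sketch (not part of the patch; assumes a little-endian host, which is what the __le* fields encode):

    #include <cstdint>

    // Plain-stdint mirror of the packed sigblock above.
    struct sigblock_mirror {
      uint8_t  v;
      uint64_t magic;
      uint32_t len;
      uint32_t header_crc, front_crc, middle_crc, data_crc;
    } __attribute__((packed));

    static_assert(sizeof(sigblock_mirror) == 29,
                  "1 + 8 + 4 + 4*4 bytes handed to buffer::create_static()");
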
int CephxSessionHandler::sign_message(Message *m)
{
// If runtime signing option is off, just return success without signing.
if (!cct->_conf->cephx_sign_messages) {
return 0;
}
- bufferlist bl_plaintext, bl_encrypted;
- ceph_msg_header header = m->get_header();
- std::string error;
-
- ceph_msg_footer& en_footer = m->get_footer();
-
- ::encode(header.crc, bl_plaintext);
- ::encode(en_footer.front_crc, bl_plaintext);
- ::encode(en_footer.middle_crc, bl_plaintext);
- ::encode(en_footer.data_crc, bl_plaintext);
-
- ldout(cct, 10) << "sign_message: seq # " << header.seq << " CRCs are: header " << header.crc
- << " front " << en_footer.front_crc << " middle " << en_footer.middle_crc
- << " data " << en_footer.data_crc << dendl;
-
- if (encode_encrypt(cct, bl_plaintext, key, bl_encrypted, error)) {
- ldout(cct, 0) << "error encrypting message signature: " << error << dendl;
- ldout(cct, 0) << "no signature put on message" << dendl;
- return SESSION_SIGNATURE_FAILURE;
- }
-
- bufferlist::iterator ci = bl_encrypted.begin();
- // Skip the magic number up front. PLR
- ci.advance(4);
- ::decode(en_footer.sig, ci);
-
- // There's potentially an issue with whether the encoding and decoding done here will work
- // properly when a big endian and little endian machine are talking. We think it's OK,
- // but it should be tested to be sure. PLR
-
- // Receiver won't trust this flag to decide if msg should have been signed. It's primarily
- // to debug problems where sender and receiver disagree on need to sign msg. PLR
- en_footer.flags = (unsigned)en_footer.flags | CEPH_MSG_FOOTER_SIGNED;
+
+ uint64_t sig;
+ int r = _calc_signature(m, &sig);
+ if (r < 0)
+ return r;
+
+ ceph_msg_footer& f = m->get_footer();
+ f.sig = sig;
+ f.flags = (unsigned)f.flags | CEPH_MSG_FOOTER_SIGNED;
messages_signed++;
- ldout(cct, 20) << "Putting signature in client message(seq # " << header.seq << "): sig = " << en_footer.sig << dendl;
+ ldout(cct, 20) << "Putting signature in client message(seq # " << m->get_seq()
+ << "): sig = " << sig << dendl;
return 0;
}
if (!cct->_conf->cephx_sign_messages) {
return 0;
}
-
- bufferlist bl_plaintext, bl_ciphertext;
- std::string sig_error;
- ceph_msg_header& header = m->get_header();
- ceph_msg_footer& footer = m->get_footer();
-
if ((features & CEPH_FEATURE_MSG_AUTH) == 0) {
// it's fine, we didn't negotiate this feature.
return 0;
}
- signatures_checked++;
+ uint64_t sig;
+ int r = _calc_signature(m, &sig);
+ if (r < 0)
+ return r;
- ldout(cct, 10) << "check_message_signature: seq # = " << m->get_seq() << " front_crc_ = " << footer.front_crc
- << " middle_crc = " << footer.middle_crc << " data_crc = " << footer.data_crc << dendl;
- ::encode(header.crc, bl_plaintext);
- ::encode(footer.front_crc, bl_plaintext);
- ::encode(footer.middle_crc, bl_plaintext);
- ::encode(footer.data_crc, bl_plaintext);
-
- // Encrypt the buffer containing the checksums to calculate the signature. PLR
- if (encode_encrypt(cct, bl_plaintext, key, bl_ciphertext, sig_error)) {
- ldout(cct, 0) << "error in encryption for checking message signature: " << sig_error << dendl;
- return (SESSION_SIGNATURE_FAILURE);
- }
-
- bufferlist::iterator ci = bl_ciphertext.begin();
- // Skip the magic number at the front. PLR
- ci.advance(4);
- uint64_t sig_check;
- ::decode(sig_check, ci);
-
- // There's potentially an issue with whether the encoding and decoding done here will work
- // properly when a big endian and little endian machine are talking. We think it's OK,
- // but it should be tested to be sure. PLR
+ signatures_checked++;
- if (sig_check != footer.sig) {
+ if (sig != m->get_footer().sig) {
// Should have been signed, but signature check failed. PLR
- if (!(footer.flags & CEPH_MSG_FOOTER_SIGNED)) {
- ldout(cct, 0) << "SIGN: MSG " << header.seq << " Sender did not set CEPH_MSG_FOOTER_SIGNED." << dendl;
+ if (!(m->get_footer().flags & CEPH_MSG_FOOTER_SIGNED)) {
+ ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " Sender did not set CEPH_MSG_FOOTER_SIGNED." << dendl;
}
- ldout(cct, 0) << "SIGN: MSG " << header.seq << " Message signature does not match contents." << dendl;
- ldout(cct, 0) << "SIGN: MSG " << header.seq << "Signature on message:" << dendl;
- ldout(cct, 0) << "SIGN: MSG " << header.seq << " sig: " << footer.sig << dendl;
- ldout(cct, 0) << "SIGN: MSG " << header.seq << "Locally calculated signature:" << dendl;
- ldout(cct, 0) << "SIGN: MSG " << header.seq << " sig_check:" << sig_check << dendl;
-
- // For the moment, printing an error message to the log and returning failure is sufficient.
- // In the long term, we should probably have code parsing the log looking for this kind
- // of security failure, particularly when there are large numbers of them, since the latter
- // is a potential sign of an attack. PLR
+ ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " Message signature does not match contents." << dendl;
+    ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " Signature on message:" << dendl;
+    ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " sig: " << m->get_footer().sig << dendl;
+    ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " Locally calculated signature:" << dendl;
+    ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " sig_check: " << sig << dendl;
+
+ // For the moment, printing an error message to the log and
+ // returning failure is sufficient. In the long term, we should
+ // probably have code parsing the log looking for this kind of
+ // security failure, particularly when there are large numbers of
+ // them, since the latter is a potential sign of an attack. PLR
signatures_failed++;
ldout(cct, 0) << "Signature failed." << dendl;
return false;
}
- int sign_message(Message *m);
+ int _calc_signature(Message *m, uint64_t *psig);
+ int sign_message(Message *m);
int check_message_signature(Message *m) ;
// Cephx does not currently encrypt messages, so just return 0 if called. PLR
this is what the journal symlink inside the osd data volume normally
points to.
-activate-all relies on /dev/disk/by-parttype-uuid/$typeuuid.$uuid to
+activate-all relies on /dev/disk/by-parttypeuuid/$typeuuid.$uuid to
find all partitions. We install special udev rules to create these
links.
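
For illustration, a rule along the following lines would create those links; this is a sketch of the idea, not necessarily the exact rule file that ceph installs:

    ACTION=="add|change", SUBSYSTEM=="block", ENV{DEVTYPE}=="partition", \
      ENV{ID_PART_ENTRY_TYPE}=="?*", ENV{ID_PART_ENTRY_UUID}=="?*", \
      SYMLINK+="disk/by-parttypeuuid/$env{ID_PART_ENTRY_TYPE}.$env{ID_PART_ENTRY_UUID}"
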
def get_partition_uuid(dev):
+ #
+    # blkid is preferred
+ #
+ what = 'ID_PART_ENTRY_UUID'
+ out, _ = command(
+ [
+ 'blkid',
+ '-o',
+ 'udev',
+ '-p',
+ dev,
+ ]
+ )
+ p = {}
+ for line in out.splitlines():
+        (key, value) = line.split('=', 1)
+ p[key] = value
+ if what in p:
+ return p[what]
+ #
+    # if blkid does not deliver, fall back to sgdisk
+ #
(base, partnum) = split_dev_base_partnum(dev)
out, _ = command(['sgdisk', '-i', partnum, base])
for line in out.splitlines():
print >> sys.stderr, \
'error handling command target: {0}'.format(e)
return 1, '', ''
+ if len(cmdargs) and cmdargs[0] == 'tell':
+ print >> sys.stderr, \
+                  'Cannot use \'tell\' in interactive mode.'
+ continue
valid_dict = validate_command(sigdict, cmdargs, verbose)
if valid_dict:
if parsed_args.output_format:
#include "common/Timer.h"
#include "common/ceph_argparse.h"
#include "global/global_init.h"
+#include "global/signal_handler.h"
#include "common/safe_io.h"
#ifndef DARWIN
goto out_client_unmount;
}
+ init_async_signal_handler();
+ register_async_signal_handler(SIGHUP, sighup_handler);
+
cerr << "ceph-fuse[" << getpid() << "]: starting fuse" << std::endl;
tester.init(cfuse, client);
tester.create();
free(newargv);
delete mc;
+
+ unregister_async_signal_handler(SIGHUP, sighup_handler);
+ shutdown_async_signal_handler();
//cout << "child done" << std::endl;
return r;
#include "include/assert.h"
#include "include/stat.h"
+#if HAVE_GETGROUPLIST
+#include <grp.h>
+#include <pwd.h>
+#endif
+
#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "
: inode(in), offset(0), this_offset(2), next_offset(2),
release_count(0), ordered_count(0), start_shared_gen(0),
buffer(0) {
- inode->get();
}
// cons/des
last_tid = 0;
last_flush_seq = 0;
- cwd = NULL;
-
//
root = 0;
delete root;
root = 0;
root_ancestor = 0;
- while (!root_parents.empty()) {
- Inode *in = root_parents.begin()->second;
+ while (!root_parents.empty())
root_parents.erase(root_parents.begin());
- delete in;
- }
inode_map.clear();
}
f->close_section();
}
if (it->second->inode)
- dump_inode(f, it->second->inode, did, false);
+ dump_inode(f, it->second->inode.get(), did, false);
}
}
}
delete root;
root = 0;
root_ancestor = 0;
- while (!root_parents.empty()) {
- Inode *in = root_parents.begin()->second;
+ while (!root_parents.empty())
root_parents.erase(root_parents.begin());
- delete in;
- }
inode_map.clear();
}
}
in = inode_map[st->vino];
ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
} else {
- in = new Inode(cct, st->vino, &st->layout);
+ in = new Inode(this, st->vino, &st->layout);
inode_map[st->vino] = in;
if (!root) {
root = in;
root_ancestor = in;
cwd = root;
- cwd->get();
} else if (!mounted) {
root_parents[root_ancestor] = in;
root_ancestor = in;
- in->get();
}
// immutable bits
}
}
- if (!dn || dn->inode == 0) {
- in->get();
+ if (!dn || !dn->inode) {
+ InodeRef tmp_ref(in);
if (old_dentry) {
if (old_dentry->dir != dir) {
old_dentry->dir->ordered_count++;
dir->parent_inode->flags &= ~I_DIR_ORDERED;
}
dn = link(dir, dname, in, dn);
- put_inode(in);
}
update_dentry_lease(dn, dlease, from, session);
dn->offset = dir_result_t::make_fpos(fg, i + readdir_offset);
// add to cached result list
- in->get();
- request->readdir_result.push_back(pair<string,Inode*>(dname, in));
+ request->readdir_result.push_back(pair<string,InodeRef>(dname, in));
ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
}
}
} else if (de) {
if (de->inode) {
- in = de->inode;
+ in = de->inode.get();
ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
} else {
in = de->dir->parent_inode;
ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
while (in->snapid != CEPH_NOSNAP) {
if (in->snapid == CEPH_SNAPDIR)
- in = in->snapdir_parent;
+ in = in->snapdir_parent.get();
else if (!in->dn_set.empty())
/* In most cases there will only be one dentry, so getting it
* will be the correct action. If there are multiple hard links,
* I think the MDS should be able to redirect as needed*/
        in = in->get_first_parent()->dir->parent_inode;
else {
ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
break;
int Client::verify_reply_trace(int r,
MetaRequest *request, MClientReply *reply,
- Inode **ptarget, bool *pcreated,
+ InodeRef *ptarget, bool *pcreated,
int uid, int gid)
{
// check whether this request actually did the create, and set created flag
*pcreated = got_created_ino;
if (request->target) {
- *ptarget = request->target;
- ldout(cct, 20) << "make_request target is " << *request->target << dendl;
+ ptarget->swap(request->target);
+ ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
} else {
if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
(*ptarget) = p->second;
- ldout(cct, 20) << "make_request created, target is " << **ptarget << dendl;
+ ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
} else {
// we got a traceless reply, and need to look up what we just
// created. for now, do this by name. someday, do this by the
// ino... which we know! FIXME.
- Inode *target = 0; // ptarget may be NULL
+ InodeRef target;
Dentry *d = request->dentry();
if (d) {
if (d->dir) {
target = in;
}
if (r >= 0) {
- if (ptarget)
- *ptarget = target;
-
// verify ino returned in reply and trace_dist are the same
if (got_created_ino &&
created_ino.val != target->ino.val) {
ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
r = -EINTR;
}
+ if (ptarget)
+ ptarget->swap(target);
}
}
}
*/
int Client::make_request(MetaRequest *request,
int uid, int gid,
- Inode **ptarget, bool *pcreated,
+ InodeRef *ptarget, bool *pcreated,
int use_mds,
bufferlist *pdirbl)
{
void Client::put_request(MetaRequest *request)
{
- if (request->_put()) {
- if (request->inode())
- put_inode(request->take_inode());
- if (request->old_inode())
- put_inode(request->take_old_inode());
- if (request->other_inode())
- put_inode(request->take_other_inode());
+ if (request->_put())
delete request;
- }
}
int Client::encode_inode_release(Inode *in, MetaRequest *req,
bool unclean = objectcacher->release_set(&in->oset);
assert(!unclean);
put_qtree(in);
- if (in->snapdir_parent)
- put_inode(in->snapdir_parent);
inode_map.erase(in->vino());
in->cap_item.remove_myself();
in->snaprealm_item.remove_myself();
+ in->snapdir_parent.reset();
if (in == root) {
root = 0;
root_ancestor = 0;
- while (!root_parents.empty()) {
- Inode *in = root_parents.begin()->second;
+ while (!root_parents.empty())
root_parents.erase(root_parents.begin());
- put_inode(in);
- }
}
if (!in->oset.objects.empty()) {
if (in) { // link to inode
dn->inode = in;
- in->get();
if (in->is_dir()) {
if (in->dir)
dn->get(); // dir -> dn pin
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
- Inode *in = dn->inode;
+ InodeRef in;
+ in.swap(dn->inode);
ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
<< " inode " << dn->inode << dendl;
// unlink from inode
if (in) {
- invalidate_quota_tree(in);
+ invalidate_quota_tree(in.get());
if (in->is_dir()) {
if (in->dir)
dn->put(); // dir -> dn pin
assert(in->dn_set.count(dn));
in->dn_set.erase(dn);
ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
- put_inode(in);
}
if (keepdentry) {
}
}
-
/****
* caps
*/
} else if (in->caps_dirty() ||
(used & CEPH_CAP_FILE_WR) ||
(dirty & CEPH_CAP_ANY_WR)) {
- in->get();
CapSnap *capsnap = new CapSnap(in);
in->cap_snaps[seq] = capsnap;
capsnap->context = in->snaprealm->get_snap_context();
class C_Client_CacheInvalidate : public Context {
private:
Client *client;
- Inode *inode;
+ InodeRef inode;
int64_t offset, length;
bool keep_caps;
public:
C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len, bool keep) :
client(c), inode(in), offset(off), length(len), keep_caps(keep) {
- inode->get();
}
void finish(int r) {
// _async_invalidate takes the lock when it needs to, call this back from outside of lock.
}
};
-void Client::_async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_caps)
+void Client::_async_invalidate(InodeRef& in, int64_t off, int64_t len, bool keep_caps)
{
ldout(cct, 10) << "_async_invalidate " << off << "~" << len << (keep_caps ? " keep_caps" : "") << dendl;
ino_invalidate_cb(callback_handle, in->vino(), off, len);
client_lock.Lock();
if (!keep_caps)
- check_caps(in, false);
- put_inode(in);
+ check_caps(in.get(), false);
+ in.reset(); // put inode inside client_lock
client_lock.Unlock();
ldout(cct, 10) << "_async_invalidate " << off << "~" << len << (keep_caps ? " keep_caps" : "") << " done" << dendl;
}
void Client::trim_caps(MetaSession *s, int max)
{
mds_rank_t mds = s->mds_num;
- ldout(cct, 10) << "trim_caps mds." << mds << " max " << max << dendl;
+ int caps_size = s->caps.size();
+ ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
+ << " caps " << caps_size << dendl;
int trimmed = 0;
xlist<Cap*>::iterator p = s->caps.begin();
- while ((s->caps.size() - trimmed) > max && !p.end()) {
+ while ((caps_size - trimmed) > max && !p.end()) {
Cap *cap = *p;
s->s_cap_iterator = cap;
Inode *in = cap->inode;
ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
bool all = true;
set<Dentry*>::iterator q = in->dn_set.begin();
- in->get();
+ InodeRef tmp_ref(in);
while (q != in->dn_set.end()) {
Dentry *dn = *q++;
if (dn->lru_is_expireable()) {
ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
trimmed++;
}
-
- put_inode(in);
}
++p;
for (xlist<CapSnap*>::iterator p = session->flushing_capsnaps.begin(); !p.end(); ++p) {
CapSnap *capsnap = *p;
- Inode *in = capsnap->in;
+ InodeRef& in = capsnap->in;
ldout(cct, 20) << " reflushing capsnap " << capsnap
<< " on " << *in << " to mds." << mds << dendl;
- flush_snaps(in, false, capsnap);
+ flush_snaps(in.get(), false, capsnap);
}
for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
Inode *in = *p;
} else {
ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
<< " on " << *in << dendl;
+ in->cap_snaps.erase(follows);
capsnap->flushing_item.remove_myself();
delete capsnap;
- in->cap_snaps.erase(follows);
- put_inode(in);
}
} else {
ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
class C_Client_FlushComplete : public Context {
private:
Client *client;
- Inode *inode;
+ InodeRef inode;
public:
- C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in)
- {
- inode->get();
- }
+ C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
void finish(int r) {
assert(client->client_lock.is_locked_by_me());
<< ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
inode->async_err = r;
}
- client->put_inode(inode);
}
};
gid_t *sgids = NULL;
int sgid_count = 0;
if (getgroups_cb) {
- sgid_count = getgroups_cb(callback_handle, uid, &sgids);
- if (sgid_count < 0) {
+ sgid_count = getgroups_cb(callback_handle, &sgids);
+    if (sgid_count < 0) {
ldout(cct, 3) << "getgroups failed!" << dendl;
- return sgid_count;
}
}
+#if HAVE_GETGROUPLIST
+ if (sgid_count <= 0) {
+    // use getgrouplist(3) to get the group list
+    // initial number of group entries defaults to the historical POSIX
+    // limit of 16; implementations may return more, so grow on demand
+ sgid_count = 16;
+ sgids = (gid_t*)malloc(sgid_count * sizeof(gid_t));
+ if (sgids == NULL) {
+ ldout(cct, 3) << "allocating group memory failed" << dendl;
+ return -EACCES;
+ }
+ struct passwd *pw;
+ pw = getpwuid(uid);
+ if (pw == NULL) {
+      ldout(cct, 3) << "getting user entry failed" << dendl;
+      free(sgids);
+      return -EACCES;
+ }
+ while (1) {
+ if (getgrouplist(pw->pw_name, gid, sgids, &sgid_count) == -1) {
+ // we need to resize the group list and try again
+ void *_realloc = NULL;
+ if ((_realloc = realloc(sgids, sgid_count * sizeof(gid_t))) == NULL) {
+ ldout(cct, 3) << "allocating group memory failed" << dendl;
+ free(sgids);
+ return -EACCES;
+ }
+ sgids = (gid_t*)_realloc;
+ continue;
+ }
+ // list was successfully retrieved
+ break;
+ }
+ }
+#endif
+
// check permissions before doing anything else
+ int ret = 0;
if (uid != 0 && !in->check_mode(uid, gid, sgids, sgid_count, flags)) {
- return -EACCES;
+ ret = -EACCES;
}
- return 0;
+ if (sgids)
+ free(sgids);
+ return ret;
}
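
As a standalone illustration of the grow-and-retry idiom used above: getgrouplist(3) returns -1 when the supplied buffer is too small and rewrites its count argument with the required size, so the caller can resize and retry. A hypothetical helper, not part of the patch:

    #include <grp.h>
    #include <cstdlib>

    // Returns the number of groups for `user`, or -1 on allocation failure.
    // On success, *out points to a malloc'd array the caller must free().
    static int fetch_groups(const char *user, gid_t gid, gid_t **out)
    {
      int n = 16;  // historical POSIX NGROUPS_MAX; grown on demand
      gid_t *g = (gid_t*)malloc(n * sizeof(gid_t));
      if (!g)
        return -1;
      while (getgrouplist(user, gid, g, &n) == -1) {
        // n now holds the required count; grow the buffer and retry
        gid_t *t = (gid_t*)realloc(g, n * sizeof(gid_t));
        if (!t) {
          free(g);
          return -1;
        }
        g = t;
      }
      *out = g;
      return n;  // entries actually filled in
    }
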
vinodeno_t Client::_get_vino(Inode *in)
timer.cancel_event(tick_event);
tick_event = 0;
- if (cwd)
- put_inode(cwd);
- cwd = NULL;
+ cwd.reset();
// clean up any unclosed files
while (!fd_map.empty()) {
assert(in);
}
if (!in->caps.empty()) {
- in->get();
+ InodeRef tmp_ref(in);
_release(in);
_flush(in, new C_Client_FlushComplete(this, in));
- put_inode(in);
}
}
}
// ===============================================================
// high level (POSIXy) interface
-int Client::_do_lookup(Inode *dir, const string& name, Inode **target)
+int Client::_do_lookup(Inode *dir, const string& name, InodeRef *target)
{
int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
MetaRequest *req = new MetaRequest(op);
return r;
}
-int Client::_lookup(Inode *dir, const string& dname, Inode **target)
+int Client::_lookup(Inode *dir, const string& dname, InodeRef *target)
{
int r = 0;
Dentry *dn = NULL;
return 0;
}
-int Client::path_walk(const filepath& origpath, Inode **final, bool followsym)
+int Client::path_walk(const filepath& origpath, InodeRef *end, bool followsym)
{
filepath path = origpath;
- Inode *cur;
+ InodeRef cur;
if (origpath.absolute())
cur = root;
else
const string &dname = path[i];
ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
ldout(cct, 20) << " (path is " << path << ")" << dendl;
- Inode *next;
- int r = _lookup(cur, dname, &next);
+ InodeRef next;
+ int r = _lookup(cur.get(), dname, &next);
if (r < 0)
return r;
// only follow trailing symlink if followsym. always follow
continue;
}
}
- cur = next;
+ cur.swap(next);
i++;
}
if (!cur)
return -ENOENT;
- if (final)
- *final = cur;
+ if (end)
+ end->swap(cur);
return 0;
}
string name = path.last_dentry();
path.pop_dentry();
- Inode *in, *dir;
+ InodeRef in, dir;
int r;
r = path_walk(existing, &in);
if (r < 0)
goto out;
- in->get();
r = path_walk(path, &dir);
if (r < 0)
- goto out_unlock;
- r = _link(in, dir, name.c_str());
- out_unlock:
- put_inode(in);
+ goto out;
+ r = _link(in.get(), dir.get(), name.c_str());
out:
return r;
}
filepath path(relpath);
string name = path.last_dentry();
path.pop_dentry();
- Inode *dir;
+ InodeRef dir;
int r = path_walk(path, &dir);
if (r < 0)
return r;
- return _unlink(dir, name.c_str());
+ return _unlink(dir.get(), name.c_str());
}
int Client::rename(const char *relfrom, const char *relto)
string toname = to.last_dentry();
to.pop_dentry();
- Inode *fromdir, *todir;
+ InodeRef fromdir, todir;
int r;
r = path_walk(from, &fromdir);
if (r < 0)
goto out;
- fromdir->get();
r = path_walk(to, &todir);
if (r < 0)
- goto out_unlock;
- todir->get();
- r = _rename(fromdir, fromname.c_str(), todir, toname.c_str());
- put_inode(todir);
- out_unlock:
- put_inode(fromdir);
+ goto out;
+ r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str());
out:
return r;
}
filepath path(relpath);
string name = path.last_dentry();
path.pop_dentry();
- Inode *dir;
+ InodeRef dir;
int r = path_walk(path, &dir);
if (r < 0) {
return r;
}
- return _mkdir(dir, name.c_str(), mode);
+ return _mkdir(dir.get(), name.c_str(), mode);
}
int Client::mkdirs(const char *relpath, mode_t mode)
filepath path(relpath);
unsigned int i;
int r=0;
- Inode *cur = cwd;
- Inode *next;
+ InodeRef cur, next;
+ cur = cwd;
for (i=0; i<path.depth(); ++i) {
- r=_lookup(cur, path[i].c_str(), &next);
+ r=_lookup(cur.get(), path[i].c_str(), &next);
if (r < 0) break;
- cur = next;
+ cur.swap(next);
}
//check that we have work left to do
if (i==path.depth()) return -EEXIST;
//make new directory at each level
for (; i<path.depth(); ++i) {
//make new dir
- r = _mkdir(cur, path[i].c_str(), mode);
+ r = _mkdir(cur.get(), path[i].c_str(), mode);
//check proper creation/existence
if (r < 0) return r;
- r = _lookup(cur, path[i], &next);
+ r = _lookup(cur.get(), path[i], &next);
if(r < 0) {
ldout(cct, 0) << "mkdirs: successfully created new directory " << path[i]
<< " but can't _lookup it!" << dendl;
return r;
}
//move to new dir and continue
- cur = next;
+ cur.swap(next);
ldout(cct, 20) << "mkdirs: successfully created directory "
<< filepath(cur->ino).get_path() << dendl;
}
filepath path(relpath);
string name = path.last_dentry();
path.pop_dentry();
- Inode *dir;
+ InodeRef dir;
int r = path_walk(path, &dir);
if (r < 0)
return r;
- return _rmdir(dir, name.c_str());
+ return _rmdir(dir.get(), name.c_str());
}
int Client::mknod(const char *relpath, mode_t mode, dev_t rdev)
filepath path(relpath);
string name = path.last_dentry();
path.pop_dentry();
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in);
if (r < 0)
return r;
- return _mknod(in, name.c_str(), mode, rdev);
+ return _mknod(in.get(), name.c_str(), mode, rdev);
}
// symlinks
filepath path(relpath);
string name = path.last_dentry();
path.pop_dentry();
- Inode *dir;
+ InodeRef dir;
int r = path_walk(path, &dir);
if (r < 0)
return r;
- return _symlink(dir, name.c_str(), target);
+ return _symlink(dir.get(), name.c_str(), target);
}
int Client::readlink(const char *relpath, char *buf, loff_t size)
tout(cct) << relpath << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in, false);
if (r < 0)
return r;
- return _readlink(in, buf, size);
+ return _readlink(in.get(), buf, size);
}
int Client::_readlink(Inode *in, char *buf, size_t size)
}
int Client::_setattr(Inode *in, struct stat *attr, int mask, int uid, int gid,
- Inode **inp)
+ InodeRef *inp)
{
int issued = in->caps_issued();
tout(cct) << mask << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in);
if (r < 0)
return r;
  return _setattr(in, attr, mask);
}
int Client::fsetattr(int fd, struct stat *attr, int mask)
if (f->flags & O_PATH)
return -EBADF;
#endif
  return _setattr(f->inode, attr, mask);
}
int Client::stat(const char *relpath, struct stat *stbuf,
tout(cct) << "stat" << std::endl;
tout(cct) << relpath << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in);
if (r < 0)
return r;
tout(cct) << "lstat" << std::endl;
tout(cct) << relpath << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
// don't follow symlinks
int r = path_walk(path, &in, false);
if (r < 0)
tout(cct) << relpath << std::endl;
tout(cct) << mode << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in);
if (r < 0)
return r;
tout(cct) << relpath << std::endl;
tout(cct) << mode << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
// don't follow symlinks
int r = path_walk(path, &in, false);
if (r < 0)
tout(cct) << uid << std::endl;
tout(cct) << gid << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in);
if (r < 0)
return r;
tout(cct) << uid << std::endl;
tout(cct) << gid << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
// don't follow symlinks
int r = path_walk(path, &in, false);
if (r < 0)
tout(cct) << buf->modtime << std::endl;
tout(cct) << buf->actime << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in);
if (r < 0)
return r;
tout(cct) << buf->modtime << std::endl;
tout(cct) << buf->actime << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
// don't follow symlinks
int r = path_walk(path, &in, false);
if (r < 0)
tout(cct) << "opendir" << std::endl;
tout(cct) << relpath << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in);
if (r < 0)
return r;
- r = _opendir(in, dirpp);
+ r = _opendir(in.get(), dirpp);
tout(cct) << (unsigned long)*dirpp << std::endl;
return r;
}
ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
if (dirp->inode) {
ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
- put_inode(dirp->inode);
- dirp->inode = 0;
+ dirp->inode.reset();
}
_readdir_drop_dirp_buffer(dirp);
delete dirp;
{
ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
if (dirp->buffer) {
- for (unsigned i = 0; i < dirp->buffer->size(); i++)
- put_inode((*dirp->buffer)[i].second);
delete dirp->buffer;
dirp->buffer = NULL;
}
if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
op = CEPH_MDS_OP_LSSNAP;
- Inode *diri = dirp->inode;
+ InodeRef& diri = dirp->inode;
MetaRequest *req = new MetaRequest(op);
filepath path;
diri->make_nosnap_relative_path(path);
req->set_filepath(path);
- req->set_inode(diri);
+ req->set_inode(diri.get());
req->head.args.readdir.frag = fg;
if (dirp->last_name.length()) {
req->path2.set_path(dirp->last_name.c_str());
_readdir_drop_dirp_buffer(dirp);
- dirp->buffer = new vector<pair<string,Inode*> >;
+ dirp->buffer = new vector<pair<string,InodeRef> >;
dirp->buffer->swap(req->readdir_result);
if (fg != req->readdir_reply_frag) {
struct stat st;
struct dirent de;
    int stmask = fill_stat(dn->inode, &st);
fill_dirent(&de, dn->name.c_str(), st.st_mode, st.st_ino, dirp->offset + 1);
uint64_t next_off = dn->offset + 1;
frag_t fg = dirp->frag();
uint32_t off = dirp->fragpos();
- Inode *diri = dirp->inode;
+ InodeRef& diri = dirp->inode;
if (dirp->at_end())
return 0;
if (dirp->offset == 1) {
ldout(cct, 15) << " including .." << dendl;
if (!diri->dn_set.empty()) {
- Inode* in = diri->get_first_parent()->inode;
+ InodeRef& in = diri->get_first_parent()->inode;
fill_dirent(&de, "..", S_IFDIR, in->ino, 2);
fill_stat(in, &st);
} else {
dirp->offset = dir_result_t::make_fpos(fg, off);
while (off >= dirp->this_offset &&
off - dirp->this_offset < dirp->buffer->size()) {
- pair<string,Inode*>& ent = (*dirp->buffer)[off - dirp->this_offset];
+ pair<string,InodeRef>& ent = (*dirp->buffer)[off - dirp->this_offset];
      int stmask = fill_stat(ent.second, &st);
fill_dirent(&de, ent.first.c_str(), st.st_mode, st.st_ino, dirp->offset + 1);
client_lock.Unlock();
#endif
filepath path(relpath);
- Inode *in;
+ InodeRef in;
bool created = false;
/* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
filepath dirpath = path;
string dname = dirpath.last_dentry();
dirpath.pop_dentry();
- Inode *dir;
+ InodeRef dir;
r = path_walk(dirpath, &dir);
if (r < 0)
return r;
- r = _create(dir, dname.c_str(), flags, mode, &in, &fh, stripe_unit,
+ r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
stripe_count, object_size, data_pool, &created);
}
if (r < 0)
// posix says we can only check permissions of existing files
uid_t uid = geteuid();
gid_t gid = getegid();
- r = check_permissions(in, flags, uid, gid);
+ r = check_permissions(in.get(), flags, uid, gid);
if (r < 0)
goto out;
}
if (!fh)
- r = _open(in, flags, mode, &fh);
+ r = _open(in.get(), flags, mode, &fh);
if (r >= 0) {
// allocate a integer file descriptor
assert(fh);
- assert(in);
r = get_fd();
assert(fd_map.count(r) == 0);
fd_map[r] = fh;
req->set_filepath(path);
req->set_inode(ino);
- int r = make_request(req, -1, -1, NULL, NULL, rand() % mdsmap->get_num_in_mds());
+ InodeRef target;
+ int r = make_request(req, -1, -1, &target, NULL, rand() % mdsmap->get_num_in_mds());
// Give caller a reference to the parent ino if they provided a pointer.
if (parent != NULL) {
if (r == 0) {
- *parent = req->target;
+ *parent = target.get();
_ll_get(*parent);
ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
} else {
// inode
assert(in);
f->inode = in;
- f->inode->get();
ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;
{
//ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
//ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;
if (in->snapid == CEPH_NOSNAP) {
void Client::_put_fh(Fh *f)
{
int left = f->put();
- if (!left) {
- put_inode(f->inode);
+ if (!left)
delete f;
- }
}
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid, int gid)
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
int r;
switch (whence) {
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
const md_config_t *conf = cct->_conf;
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
//bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
void Client::C_Readahead::finish(int r) {
lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
- client->put_cap_ref(f->inode, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
+ client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
f->readahead.dec_pending();
}
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
const md_config_t *conf = cct->_conf;
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
bool *checkeof)
{
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
uint64_t pos = off;
int left = len;
int read = 0;
*/
class C_Client_SyncCommit : public Context {
Client *cl;
- Inode *in;
+ InodeRef in;
public:
- C_Client_SyncCommit(Client *c, Inode *i) : cl(c), in(i) {
- in->get();
- }
+ C_Client_SyncCommit(Client *c, Inode *i) : cl(c), in(i) {}
void finish(int) {
// Called back by Filter, then Client is responsible for taking its own lock
assert(!cl->client_lock.is_locked_by_me());
}
};
-void Client::sync_write_commit(Inode *in)
+void Client::sync_write_commit(InodeRef& in)
{
Mutex::Locker l(client_lock);
assert(unsafe_sync_write > 0);
unsafe_sync_write--;
- put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
+ put_cap_ref(in.get(), CEPH_CAP_FILE_BUFFER);
ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
if (unsafe_sync_write == 0 && unmounting) {
mount_cond.Signal();
}
- put_inode(in);
+ in.reset(); // put inode inside client_lock
}
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
}
//ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
assert(in->snapid == CEPH_NOSNAP);
int Client::_flush(Fh *f)
{
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
int err = in->async_err;
if (err != 0) {
ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
int Client::_fsync(Fh *f, bool syncdataonly)
{
int r = 0;
-
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
ceph_tid_t wait_on_flush = 0;
bool flushed_metadata = false;
Mutex lock("Client::_fsync::lock");
Cond cond;
bool done = false;
C_SafeCond *object_cacher_completion = NULL;
+ InodeRef tmp_ref;
ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
if (cct->_conf->client_oc) {
object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
- in->get(); // take a reference; C_SafeCond doesn't and _flush won't either
+ tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
_flush(in, object_cacher_completion);
ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
}
cond.Wait(lock);
lock.Unlock();
client_lock.Lock();
- put_inode(in);
ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
} else {
// FIXME: this can starve
tout(cct) << "chdir" << std::endl;
tout(cct) << relpath << std::endl;
filepath path(relpath);
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in);
if (r < 0)
return r;
- if (cwd != in) {
- in->get();
- put_inode(cwd);
- cwd = in;
- }
+ if (cwd != in)
+ cwd.swap(in);
ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
return 0;
}
filepath path;
ldout(cct, 10) << "getcwd " << *cwd << dendl;
- Inode *in = cwd;
+ Inode *in = cwd.get();
while (in != root) {
assert(in->dn_set.size() < 2); // dirs can't be hard-linked
Dentry *dn = in->get_first_parent();
// start over
path = filepath();
- in = cwd;
+ in = cwd.get();
continue;
}
path.push_front_dentry(dn->name);
if (!fh->fcntl_locks && !fh->flock_locks)
return;
- Inode *in = fh->inode;
+ Inode *in = fh->inode.get();
ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
list<pair<int, ceph_filelock> > to_release;
int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
{
- Inode *in = fh->inode;
+ Inode *in = fh->inode.get();
ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
return ret;
int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req)
{
- Inode *in = fh->inode;
+ Inode *in = fh->inode.get();
ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner, fuse_req);
ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
int Client::_flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req)
{
- Inode *in = fh->inode;
+ Inode *in = fh->inode.get();
ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
int sleep = !(cmd & LOCK_NB);
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
_fsync(f, true);
_release(in);
{
Mutex::Locker l(client_lock);
filepath path(relpath);
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in);
if (r < 0)
return r;
- Inode *snapdir = open_snapdir(in);
+ Inode *snapdir = open_snapdir(in.get());
return _mkdir(snapdir, name, 0);
}
int Client::rmsnap(const char *relpath, const char *name)
{
Mutex::Locker l(client_lock);
filepath path(relpath);
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in);
if (r < 0)
return r;
- Inode *snapdir = open_snapdir(in);
+ Inode *snapdir = open_snapdir(in.get());
return _rmdir(snapdir, name);
}
Mutex::Locker lock(client_lock);
filepath p(path);
- Inode *in;
+ InodeRef in;
int r = path_walk(p, &in, true);
if (r < 0)
return r;
Inode *in;
vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
if (!inode_map.count(vino)) {
- in = new Inode(cct, vino, &diri->layout);
+ in = new Inode(this, vino, &diri->layout);
in->ino = diri->ino;
in->snapid = CEPH_SNAPDIR;
in->dirfragtree.clear();
inode_map[vino] = in;
in->snapdir_parent = diri;
- diri->get();
ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
} else {
in = inode_map[vino];
tout(cct) << name << std::endl;
string dname(name);
- Inode *in;
+ InodeRef in;
int r = 0;
r = _lookup(parent, dname, &in);
assert(in);
fill_stat(in, attr);
- _ll_get(in);
+ _ll_get(in.get());
out:
ldout(cct, 3) << "ll_lookup " << parent << " " << name
<< " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
tout(cct) << attr->st_ino << std::endl;
- *out = in;
+ *out = in.get();
return r;
}
-int Client::ll_walk(const char* name, Inode **i, struct stat *attr)
+int Client::ll_walk(const char* name, Inode **out, struct stat *attr)
{
Mutex::Locker lock(client_lock);
filepath fp(name, 0);
- Inode *destination = NULL;
+ InodeRef in;
int rc;
ldout(cct, 3) << "ll_walk" << name << dendl;
tout(cct) << "ll_walk" << std::endl;
tout(cct) << name << std::endl;
- rc = path_walk(fp, &destination, false);
- if (rc < 0)
- {
- attr->st_ino = 0;
- *i = NULL;
- return rc;
- }
- else
- {
- fill_stat(destination, attr);
- *i = destination;
- return 0;
- }
+ rc = path_walk(fp, &in, false);
+ if (rc < 0) {
+ attr->st_ino = 0;
+ *out = NULL;
+ return rc;
+ } else {
+ assert(in);
+ fill_stat(in, attr);
+ *out = in.get();
+ return 0;
+ }
}
tout(cct) << attr->st_atime << std::endl;
tout(cct) << mask << std::endl;
- Inode *target = in;
+ InodeRef target(in);
int res = _setattr(in, attr, mask, uid, gid, &target);
if (res == 0) {
- assert(in == target);
+ assert(in == target.get());
fill_stat(in, attr);
}
+
ldout(cct, 3) << "ll_setattr " << vino << " = " << res << dendl;
return res;
}
int Client::getxattr(const char *path, const char *name, void *value, size_t size)
{
Mutex::Locker lock(client_lock);
- Inode *ceph_inode;
- int r = Client::path_walk(path, &ceph_inode, true);
+ InodeRef in;
+ int r = Client::path_walk(path, &in, true);
if (r < 0)
return r;
- return Client::_getxattr(ceph_inode, name, value, size, getuid(), getgid());
+ return Client::_getxattr(in.get(), name, value, size, getuid(), getgid());
}
int Client::lgetxattr(const char *path, const char *name, void *value, size_t size)
{
Mutex::Locker lock(client_lock);
- Inode *ceph_inode;
- int r = Client::path_walk(path, &ceph_inode, false);
+ InodeRef in;
+ int r = Client::path_walk(path, &in, false);
if (r < 0)
return r;
- return Client::_getxattr(ceph_inode, name, value, size, getuid(), getgid());
+ return Client::_getxattr(in.get(), name, value, size, getuid(), getgid());
}
int Client::listxattr(const char *path, char *list, size_t size)
{
Mutex::Locker lock(client_lock);
- Inode *ceph_inode;
- int r = Client::path_walk(path, &ceph_inode, true);
+ InodeRef in;
+ int r = Client::path_walk(path, &in, true);
if (r < 0)
return r;
- return Client::_listxattr(ceph_inode, list, size, getuid(), getgid());
+ return Client::_listxattr(in.get(), list, size, getuid(), getgid());
}
int Client::llistxattr(const char *path, char *list, size_t size)
{
Mutex::Locker lock(client_lock);
- Inode *ceph_inode;
- int r = Client::path_walk(path, &ceph_inode, false);
+ InodeRef in;
+ int r = Client::path_walk(path, &in, false);
if (r < 0)
return r;
- return Client::_listxattr(ceph_inode, list, size, getuid(), getgid());
+ return Client::_listxattr(in.get(), list, size, getuid(), getgid());
}
int Client::removexattr(const char *path, const char *name)
{
Mutex::Locker lock(client_lock);
- Inode *ceph_inode;
- int r = Client::path_walk(path, &ceph_inode, true);
+ InodeRef in;
+ int r = Client::path_walk(path, &in, true);
if (r < 0)
return r;
- return Client::_removexattr(ceph_inode, name, getuid(), getgid());
+ return Client::_removexattr(in.get(), name, getuid(), getgid());
}
int Client::lremovexattr(const char *path, const char *name)
{
Mutex::Locker lock(client_lock);
- Inode *ceph_inode;
- int r = Client::path_walk(path, &ceph_inode, false);
+ InodeRef in;
+ int r = Client::path_walk(path, &in, false);
if (r < 0)
return r;
- return Client::_removexattr(ceph_inode, name, getuid(), getgid());
+ return Client::_removexattr(in.get(), name, getuid(), getgid());
}
int Client::setxattr(const char *path, const char *name, const void *value, size_t size, int flags)
{
Mutex::Locker lock(client_lock);
- Inode *ceph_inode;
- int r = Client::path_walk(path, &ceph_inode, true);
+ InodeRef in;
+ int r = Client::path_walk(path, &in, true);
if (r < 0)
return r;
- return Client::_setxattr(ceph_inode, name, value, size, flags, getuid(), getgid());
+ return Client::_setxattr(in.get(), name, value, size, flags, getuid(), getgid());
}
int Client::lsetxattr(const char *path, const char *name, const void *value, size_t size, int flags)
{
Mutex::Locker lock(client_lock);
- Inode *ceph_inode;
- int r = Client::path_walk(path, &ceph_inode, false);
+ InodeRef in;
+ int r = Client::path_walk(path, &in, false);
if (r < 0)
return r;
- return Client::_setxattr(ceph_inode, name, value, size, flags, getuid(), getgid());
+ return Client::_setxattr(in.get(), name, value, size, flags, getuid(), getgid());
}
int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
}
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
- int uid, int gid, Inode **inp)
+ int uid, int gid, InodeRef *inp)
{
ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
<< mode << dec << ", " << rdev << ", uid " << uid << ", gid "
tout(cct) << mode << std::endl;
tout(cct) << rdev << std::endl;
- Inode *in = NULL;
+ InodeRef in;
int r = _mknod(parent, name, mode, rdev, uid, gid, &in);
if (r == 0) {
fill_stat(in, attr);
- _ll_get(in);
+ _ll_get(in.get());
}
tout(cct) << attr->st_ino << std::endl;
ldout(cct, 3) << "ll_mknod " << vparent << " " << name
<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
- *out = in;
+ *out = in.get();
return r;
}
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
- Inode **inp, Fh **fhp, int stripe_unit, int stripe_count,
+ InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
int object_size, const char *data_pool, bool *created,
int uid, int gid)
{
/* If the caller passed a value in fhp, do the open */
if(fhp) {
(*inp)->get_open_ref(cmode);
- *fhp = _create_fh(*inp, flags, cmode);
+ *fhp = _create_fh(inp->get(), flags, cmode);
}
reply_error:
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, int uid, int gid,
- Inode **inp)
+ InodeRef *inp)
{
ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
<< mode << dec << ", uid " << uid << ", gid " << gid << ")"
tout(cct) << name << std::endl;
tout(cct) << mode << std::endl;
- Inode *in = NULL;
+ InodeRef in;
int r = _mkdir(parent, name, mode, uid, gid, &in);
if (r == 0) {
fill_stat(in, attr);
- _ll_get(in);
+ _ll_get(in.get());
}
tout(cct) << attr->st_ino << std::endl;
ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
- *out = in;
+ *out = in.get();
return r;
}
int Client::_symlink(Inode *dir, const char *name, const char *target, int uid,
- int gid, Inode **inp)
+ int gid, InodeRef *inp)
{
ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
<< ", uid " << uid << ", gid " << gid << ")" << dendl;
tout(cct) << name << std::endl;
tout(cct) << value << std::endl;
- Inode *in = NULL;
+ InodeRef in;
int r = _symlink(parent, name, value, uid, gid, &in);
if (r == 0) {
fill_stat(in, attr);
- _ll_get(in);
+ _ll_get(in.get());
}
tout(cct) << attr->st_ino << std::endl;
ldout(cct, 3) << "ll_symlink " << vparent << " " << name
<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
- *out = in;
+ *out = in.get();
return r;
}
path.push_dentry(name);
req->set_filepath(path);
+ InodeRef otherin;
+
Dentry *de;
int res = get_or_create(dir, name, &de);
if (res < 0)
req->dentry_drop = CEPH_CAP_FILE_SHARED;
req->dentry_unless = CEPH_CAP_FILE_EXCL;
- Inode *otherin;
res = _lookup(dir, name, &otherin);
if (res < 0)
goto fail;
- req->set_other_inode(otherin);
+ req->set_other_inode(otherin.get());
req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
req->set_inode(dir);
req->dentry_unless = CEPH_CAP_FILE_EXCL;
req->inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+ InodeRef in;
+
Dentry *de;
int res = get_or_create(dir, name, &de);
if (res < 0)
goto fail;
- Inode *in;
res = _lookup(dir, name, &in);
if (res < 0)
goto fail;
if (req->get_op() == CEPH_MDS_OP_RMDIR) {
req->set_dentry(de);
- req->set_inode(in);
+ req->set_other_inode(in.get());
} else {
unlink(de, true, true);
}
return -EXDEV;
}
+ InodeRef target;
MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RENAME);
filepath from;
req->dentry_drop = CEPH_CAP_FILE_SHARED;
req->dentry_unless = CEPH_CAP_FILE_EXCL;
- Inode *oldin;
- res = _lookup(fromdir, fromname, &oldin);
- if (res < 0)
- goto fail;
- req->set_old_inode(oldin);
- req->old_inode_drop = CEPH_CAP_LINK_SHARED;
+ {
+ InodeRef oldin, otherin;
+ res = _lookup(fromdir, fromname, &oldin);
+ if (res < 0)
+ goto fail;
+ req->set_old_inode(oldin.get());
+ req->old_inode_drop = CEPH_CAP_LINK_SHARED;
- Inode *otherin;
- res = _lookup(todir, toname, &otherin);
- if (res != 0 && res != -ENOENT) {
- goto fail;
- } else if (res == 0) {
- req->set_other_inode(otherin);
- req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
- }
+ res = _lookup(todir, toname, &otherin);
+ if (res != 0 && res != -ENOENT) {
+ goto fail;
+ } else if (res == 0) {
+ req->set_other_inode(otherin.get());
+ req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+ }
- req->set_inode(todir);
+ req->set_inode(todir);
+ }
- Inode *target;
res = make_request(req, uid, gid, &target);
-
ldout(cct, 10) << "rename result is " << res << dendl;
// renamed item from our cache
return _rename(parent, name, newparent, newname, uid, gid);
}
-int Client::_link(Inode *in, Inode *dir, const char *newname, int uid, int gid, Inode **inp)
+int Client::_link(Inode *in, Inode *dir, const char *newname, int uid, int gid, InodeRef *inp)
{
ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
<< " uid " << uid << " gid " << gid << ")" << dendl;
tout(cct) << vnewparent << std::endl;
tout(cct) << newname << std::endl;
- int r = _link(parent, newparent, newname, uid, gid, &parent);
+ InodeRef target;
+ int r = _link(parent, newparent, newname, uid, gid, &target);
if (r == 0) {
- fill_stat(parent, attr);
- _ll_get(parent);
+ assert(target);
+ fill_stat(target, attr);
+ _ll_get(target.get());
}
return r;
}
return 0;
}
+int Client::ll_file_layout(Fh *fh, ceph_file_layout *layout)
+{
+ return ll_file_layout(fh->inode.get(), layout);
+}
+
/* Currently we cannot take advantage of redundancy in reads, since we
would have to go through all possible placement groups (a
potentially quite large number determined by a hash), and use CRUSH
tout(cct) << flags << std::endl;
bool created = false;
- Inode *in = NULL;
+ InodeRef in;
int r = _lookup(parent, name, &in);
if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
0, 0, 0, NULL, &created, uid, gid);
if (r < 0)
goto out;
-
- if ((!in) && fhp)
- in = (*fhp)->inode;
}
if (r < 0)
ldout(cct, 20) << "ll_create created = " << created << dendl;
if (!created) {
- r = check_permissions(in, flags, uid, gid);
+ r = check_permissions(in.get(), flags, uid, gid);
if (r < 0) {
if (fhp && *fhp) {
int release_r = _release_fh(*fhp);
goto out;
}
if (fhp && (*fhp == NULL)) {
- r = _open(in, flags, mode, fhp);
+ r = _open(in.get(), flags, mode, fhp);
if (r < 0)
goto out;
}
// passing an Inode in outp requires an additional ref
if (outp) {
if (in)
- _ll_get(in);
- *outp = in;
+ _ll_get(in.get());
+ *outp = in.get();
}
return r;
if (objecter->osdmap_full_flag() && !(mode & FALLOC_FL_PUNCH_HOLE))
return -ENOSPC;
- Inode *in = fh->inode;
+ Inode *in = fh->inode.get();
if (in->snapid != CEPH_NOSNAP)
return -EROFS;
Mutex::Locker lock(client_lock);
filepath path(relpath);
- Inode *in;
+ InodeRef in;
int r = path_walk(path, &in);
if (r < 0)
return r;
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
*lp = in->layout;
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
vector<ObjectExtent> extents;
Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
// which object?
vector<ObjectExtent> extents;
Fh *f = get_filehandle(fd);
if (!f)
return -EBADF;
- Inode *in = f->inode;
+ Inode *in = f->inode.get();
// map to a list of extents
Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
if (!in->dn_set.empty())
in = in->get_first_parent()->dir->parent_inode;
else if (root_parents.count(in))
- in = root_parents[in];
+ in = root_parents[in].get();
else
in = NULL;
}
cap_epoch_barrier = e;
}
+void intrusive_ptr_add_ref(Inode *in)
+{
+ in->get();
+}
+
+void intrusive_ptr_release(Inode *in)
+{
+ in->client->put_inode(in);
+}
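For context: boost::intrusive_ptr finds these two free functions by argument-dependent lookup and calls them whenever an InodeRef is copied or destroyed; note that release above defers to Client::put_inode() rather than deleting directly, keeping cache cleanup centralized in Client. A minimal, self-contained sketch of the contract (the Obj type is illustrative, not part of this change):

    // Sketch of the intrusive_ptr protocol assumed by InodeRef (illustrative).
    #include <boost/intrusive_ptr.hpp>

    struct Obj {
      int nref;
      Obj() : nref(0) {}
    };
    void intrusive_ptr_add_ref(Obj *o) { ++o->nref; }
    void intrusive_ptr_release(Obj *o) { if (--o->nref == 0) delete o; }

    int main() {
      boost::intrusive_ptr<Obj> p(new Obj);  // add_ref -> nref == 1
      {
        boost::intrusive_ptr<Obj> q = p;     // add_ref -> nref == 2
      }                                      // release -> nref == 1
      return 0;                              // p destroyed: release -> deleted
    }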
#include "common/Mutex.h"
#include "common/Timer.h"
#include "common/Finisher.h"
-
#include "common/compiler_extensions.h"
#include "common/cmdparse.h"
#include "osdc/ObjectCacher.h"
+#include "InodeRef.h"
+
class MDSMap;
class MonClient;
DirEntry(const string &n, struct stat& s, int stm) : d_name(n), st(s), stmask(stm) {}
};
-struct Inode;
struct Cap;
class Dir;
class Dentry;
vinodeno_t ino, string& name);
typedef int (*client_remount_callback_t)(void *handle);
-typedef int (*client_getgroups_callback_t)(void *handle, uid_t uid, gid_t **sgids);
+typedef int (*client_getgroups_callback_t)(void *handle, gid_t **sgids);
typedef void(*client_switch_interrupt_callback_t)(void *req, void *data);
struct client_callback_args {
}
- Inode *inode;
+ InodeRef inode;
int64_t offset; // high bits: frag_t, low bits: an offset
int start_shared_gen; // dir shared_gen at start of readdir
frag_t buffer_frag;
- vector<pair<string,Inode*> > *buffer;
+ vector<pair<string,InodeRef> > *buffer;
string at_cache_name; // last entry we successfully returned
int make_request(MetaRequest *req, int uid, int gid,
//MClientRequest *req, int uid, int gid,
- Inode **ptarget = 0, bool *pcreated = 0,
+ InodeRef *ptarget = 0, bool *pcreated = 0,
int use_mds=-1, bufferlist *pdirbl=0);
void put_request(MetaRequest *request);
int verify_reply_trace(int r, MetaRequest *request, MClientReply *reply,
- Inode **ptarget, bool *pcreated, int uid, int gid);
+ InodeRef *ptarget, bool *pcreated, int uid, int gid);
void encode_cap_releases(MetaRequest *request, mds_rank_t mds);
int encode_inode_release(Inode *in, MetaRequest *req,
mds_rank_t mds, int drop,
public:
entity_name_t get_myname() { return messenger->get_myname(); }
- void sync_write_commit(Inode *in);
+ void sync_write_commit(InodeRef& in);
protected:
Filer *filer;
// cache
ceph::unordered_map<vinodeno_t, Inode*> inode_map;
Inode* root;
- map<Inode*, Inode*> root_parents;
+ map<Inode*, InodeRef> root_parents;
Inode* root_ancestor;
LRU lru; // lru list of Dentry's in our local metadata cache.
friend class C_Client_SyncCommit; // Asserts on client_lock
friend class C_Client_RequestInterrupt;
friend class C_Client_Remount;
+ friend void intrusive_ptr_release(Inode *in);
//int get_cache_size() { return lru.lru_get_size(); }
//void set_cache_size(int m) { lru.lru_set_max(m); }
void unlink(Dentry *dn, bool keepdir, bool keepdentry);
// path traversal for high-level interface
- Inode *cwd;
- int path_walk(const filepath& fp, Inode **end, bool followsym=true);
+ InodeRef cwd;
+ int path_walk(const filepath& fp, InodeRef *end, bool followsym=true);
int fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0);
+ int fill_stat(InodeRef& in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0) {
+ return fill_stat(in.get(), st, dirstat, rstat);
+ }
void touch_dn(Dentry *dn);
// trim cache.
void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps);
void _invalidate_inode_cache(Inode *in);
void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len);
- void _async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_caps);
+ void _async_invalidate(InodeRef& in, int64_t off, int64_t len, bool keep_caps);
void _release(Inode *in);
/**
// internal interface
// call these with client_lock held!
- int _do_lookup(Inode *dir, const string& name, Inode **target);
- int _lookup(Inode *dir, const string& dname, Inode **target);
+ int _do_lookup(Inode *dir, const string& name, InodeRef *target);
+ int _lookup(Inode *dir, const string& dname, InodeRef *target);
- int _link(Inode *in, Inode *dir, const char *name, int uid=-1, int gid=-1, Inode **inp = 0);
+ int _link(Inode *in, Inode *dir, const char *name, int uid=-1, int gid=-1, InodeRef *inp = 0);
int _unlink(Inode *dir, const char *name, int uid=-1, int gid=-1);
int _rename(Inode *olddir, const char *oname, Inode *ndir, const char *nname, int uid=-1, int gid=-1);
- int _mkdir(Inode *dir, const char *name, mode_t mode, int uid=-1, int gid=-1, Inode **inp = 0);
+ int _mkdir(Inode *dir, const char *name, mode_t mode, int uid=-1, int gid=-1, InodeRef *inp = 0);
int _rmdir(Inode *dir, const char *name, int uid=-1, int gid=-1);
- int _symlink(Inode *dir, const char *name, const char *target, int uid=-1, int gid=-1, Inode **inp = 0);
- int _mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, int uid=-1, int gid=-1, Inode **inp = 0);
- int _setattr(Inode *in, struct stat *attr, int mask, int uid=-1, int gid=-1, Inode **inp = 0);
+ int _symlink(Inode *dir, const char *name, const char *target, int uid=-1, int gid=-1, InodeRef *inp = 0);
+ int _mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, int uid=-1, int gid=-1, InodeRef *inp = 0);
+ int _setattr(Inode *in, struct stat *attr, int mask, int uid=-1, int gid=-1, InodeRef *inp = 0);
+ int _setattr(InodeRef &in, struct stat *attr, int mask, int uid=-1, int gid=-1, InodeRef *inp = 0) {
+ return _setattr(in.get(), attr, mask, uid, gid, inp);
+ }
int _getattr(Inode *in, int mask, int uid=-1, int gid=-1, bool force=false);
+ int _getattr(InodeRef &in, int mask, int uid=-1, int gid=-1, bool force=false) {
+ return _getattr(in.get(), mask, uid, gid, force);
+ }
int _readlink(Inode *in, char *buf, size_t size);
int _getxattr(Inode *in, const char *name, void *value, size_t len, int uid=-1, int gid=-1);
int _listxattr(Inode *in, char *names, size_t len, int uid=-1, int gid=-1);
int _setxattr(Inode *in, const char *name, const void *value, size_t len, int flags, int uid=-1, int gid=-1);
int _removexattr(Inode *in, const char *nm, int uid=-1, int gid=-1);
int _open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid=-1, int gid=-1);
- int _create(Inode *in, const char *name, int flags, mode_t mode, Inode **inp, Fh **fhp,
+ int _create(Inode *in, const char *name, int flags, mode_t mode, InodeRef *inp, Fh **fhp,
int stripe_unit, int stripe_count, int object_size, const char *data_pool,
bool *created = NULL, int uid=-1, int gid=-1);
+
loff_t _lseek(Fh *fh, loff_t offset, int whence);
int _read(Fh *fh, int64_t offset, uint64_t size, bufferlist *bl);
int _write(Fh *fh, int64_t offset, uint64_t size, const char *buf);
int ll_getlk(Fh *fh, struct flock *fl, uint64_t owner);
int ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req);
int ll_flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req);
+ int ll_file_layout(Fh *fh, ceph_file_layout *layout);
void ll_interrupt(void *d);
int ll_get_stripe_osd(struct Inode *in, uint64_t blockno,
ceph_file_layout* layout);
#include "include/xlist.h"
#include "mds/mdstypes.h"
+#include "InodeRef.h"
class Dir;
struct Inode;
class Dentry : public LRUObject {
public:
- string name; // sort of lame
+ string name; // sort of lame
//const char *name;
- Dir *dir;
- Inode *inode;
- int ref; // 1 if there's a dir beneath me.
+ Dir *dir;
+ InodeRef inode;
+ int ref; // 1 if there's a dir beneath me.
uint64_t offset;
mds_rank_t lease_mds;
utime_t lease_ttl;
void dump(Formatter *f) const;
Dentry() :
- dir(0), inode(0), ref(1), offset(0),
+ dir(0), ref(1), offset(0),
lease_mds(-1), lease_gen(0), lease_seq(0), cap_shared_gen(0),
item_dentry_list(this) { }
private:
#include "common/Readahead.h"
#include "include/types.h"
+#include "InodeRef.h"
-struct Inode;
class Cond;
class ceph_lock_state_t;
// file handle for any open file state
struct Fh {
+ InodeRef inode;
int _ref;
- Inode *inode;
loff_t pos;
int mds; // have to talk to mds we opened with (for now)
int mode; // the mode i opened the file with
ceph_lock_state_t *fcntl_locks;
ceph_lock_state_t *flock_locks;
- Fh() : _ref(1), inode(0), pos(0), mds(0), mode(0), flags(0), pos_locked(false),
+ Fh() : _ref(1), pos(0), mds(0), mode(0), flags(0), pos_locked(false),
readahead(), fcntl_locks(NULL), flock_locks(NULL) {}
void get() { ++_ref; }
int put() { return --_ref; }
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#include "MetaSession.h"
+#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Dir.h"
+#include "MetaSession.h"
#include "ClientSnapRealm.h"
ostream& operator<<(ostream &out, Inode &in)
if (cap & 1) {
int c = 1 << n;
if (cap_refs[c] <= 0) {
- lderr(cct) << "put_cap_ref " << ccap_string(c) << " went negative on " << *this << dendl;
+ lderr(client->cct) << "put_cap_ref " << ccap_string(c) << " went negative on " << *this << dendl;
assert(cap_refs[c] > 0);
}
if (--cap_refs[c] == 0)
<< "cap expire " << cap->session->cap_ttl << std::endl
<< "cur time " << ceph_clock_now(cct) << std::endl;*/
if ((cap->session->cap_gen <= cap->gen)
- && (ceph_clock_now(cct) < cap->session->cap_ttl)) {
+ && (ceph_clock_now(client->cct) < cap->session->cap_ttl)) {
return true;
}
return true;
{
if (!dir) {
dir = new Dir(this);
- lsubdout(cct, mds, 15) << "open_dir " << dir << " on " << this << dendl;
+ lsubdout(client->cct, client, 15) << "open_dir " << dir << " on " << this << dendl;
assert(dn_set.size() < 2); // dirs can't be hard-linked
if (!dn_set.empty())
(*dn_set.begin())->get(); // pin dentry
return (mode & fmode) == fmode;
}
+void Inode::get() {
+ _ref++;
+ lsubdout(client->cct, client, 15) << "inode.get on " << this << " " << ino << '.' << snapid
+ << " now " << _ref << dendl;
+}
+
+// private method to put a reference; see Client::put_inode()
+int Inode::_put(int n) {
+ _ref -= n;
+ lsubdout(client->cct, client, 15) << "inode.put on " << this << " " << ino << '.' << snapid
+ << " now " << _ref << dendl;
+ assert(_ref >= 0);
+ return _ref;
+}
+
void Inode::dump(Formatter *f) const
{
#include "osdc/ObjectCacher.h"
#include "include/assert.h"
+#include "InodeRef.h"
+
+class Client;
struct MetaSession;
class Dentry;
class Dir;
struct CapSnap {
//snapid_t follows; // map key
- Inode *in;
+ InodeRef in;
SnapContext context;
int issued, dirty;
#define I_DIR_ORDERED 2
struct Inode {
- CephContext *cct;
+ Client *client;
// -- the actual inode --
inodeno_t ino;
SnapRealm *snaprealm;
xlist<Inode*>::item snaprealm_item;
- Inode *snapdir_parent; // only if we are a snapdir inode
+ InodeRef snapdir_parent; // only if we are a snapdir inode
map<snapid_t,CapSnap*> cap_snaps; // pending flush to mds
//int open_by_mode[CEPH_FILE_MODE_NUM];
void make_long_path(filepath& p);
void make_nosnap_relative_path(filepath& p);
- void get() {
- _ref++;
- lsubdout(cct, mds, 15) << "inode.get on " << this << " " << ino << '.' << snapid
- << " now " << _ref << dendl;
- }
- /// private method to put a reference; see Client::put_inode()
- int _put(int n=1) {
- _ref -= n;
- lsubdout(cct, mds, 15) << "inode.put on " << this << " " << ino << '.' << snapid
- << " now " << _ref << dendl;
- assert(_ref >= 0);
- return _ref;
- }
+ void get();
+ int _put(int n=1);
int get_num_ref() {
return _ref;
ceph_lock_state_t *fcntl_locks;
ceph_lock_state_t *flock_locks;
- Inode(CephContext *cct_, vinodeno_t vino, ceph_file_layout *newlayout)
- : cct(cct_), ino(vino.ino), snapid(vino.snapid),
+ Inode(Client *c, vinodeno_t vino, ceph_file_layout *newlayout)
+ : client(c), ino(vino.ino), snapid(vino.snapid),
rdev(0), mode(0), uid(0), gid(0), nlink(0),
size(0), truncate_seq(1), truncate_size(-1),
time_warp_seq(0), max_size(0), version(0), xattr_version(0),
dirty_caps(0), flushing_caps(0), flushing_cap_seq(0), shared_gen(0), cache_gen(0),
snap_caps(0), snap_cap_refs(0),
cap_item(this), flushing_cap_item(this), last_flush_tid(0),
- snaprealm(0), snaprealm_item(this), snapdir_parent(0),
+ snaprealm(0), snaprealm_item(this),
oset((void *)this, newlayout->fl_pg_pool, ino),
reported_size(0), wanted_max_size(0), requested_max_size(0),
_ref(0), ll_ref(0), dir(0), dn_set(),
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CLIENT_INODEREF_H
+#define CEPH_CLIENT_INODEREF_H
+
+#include <boost/intrusive_ptr.hpp>
+class Inode;
+void intrusive_ptr_add_ref(Inode *in);
+void intrusive_ptr_release(Inode *in);
+typedef boost::intrusive_ptr<Inode> InodeRef;
+#endif
client/SyntheticClient.h \
client/Trace.h \
client/ioctl.h \
- client/ObjecterWriteback.h
+ client/ObjecterWriteback.h \
+ client/InodeRef.h
if WITH_FUSE
libclient_fuse_la_SOURCES = client/fuse_ll.cc
MetaRequest::~MetaRequest()
{
- assert(!_inode);
- assert(!_old_inode);
- assert(!_other_inode);
if (_dentry)
_dentry->put();
if (_old_dentry)
reply->put();
}
-void MetaRequest::set_inode(Inode *in) {
- assert(_inode == NULL);
- _inode = in;
- _inode->get();
-}
-Inode *MetaRequest::inode() {
- return _inode;
-}
-
-void MetaRequest::set_old_inode(Inode *in) {
- assert(_old_inode == NULL);
- _old_inode = in;
- _old_inode->get();
-}
-Inode *MetaRequest::old_inode() {
- return _old_inode;
-}
-
-void MetaRequest::set_other_inode(Inode *in) {
- assert(_other_inode == NULL);
- _other_inode = in;
- _other_inode->get();
-}
-Inode *MetaRequest::other_inode() {
- return _other_inode;
-}
-
void MetaRequest::set_dentry(Dentry *d) {
assert(_dentry == NULL);
_dentry = d;
#include "include/filepath.h"
#include "include/atomic.h"
#include "mds/mdstypes.h"
+#include "InodeRef.h"
#include "common/Mutex.h"
#include "messages/MClientRequest.h"
class MClientReply;
-struct Inode;
class Dentry;
struct MetaRequest {
private:
- Inode *_inode;
- Inode *_old_inode, *_other_inode;
+ InodeRef _inode, _old_inode, _other_inode;
Dentry *_dentry; //associated with path
Dentry *_old_dentry; //associated with path2
public:
uint64_t readdir_offset;
frag_t readdir_reply_frag;
- vector<pair<string,Inode*> > readdir_result;
+ vector<pair<string,InodeRef> > readdir_result;
bool readdir_end;
int readdir_num;
string readdir_last_name;
Cond *caller_cond; // who to take up
Cond *dispatch_cond; // who to kick back
- Inode *target;
+ InodeRef target;
MetaRequest(int op) :
- _inode(NULL), _old_inode(NULL), _other_inode(NULL),
_dentry(NULL), _old_dentry(NULL),
tid(0),
inode_drop(0), inode_unless(0),
readdir_offset(0), readdir_end(false), readdir_num(0),
got_unsafe(false), item(this), unsafe_item(this),
lock("MetaRequest lock"),
- caller_cond(0), dispatch_cond(0),
- target(0) {
+ caller_cond(0), dispatch_cond(0) {
memset(&head, 0, sizeof(ceph_mds_request_head));
head.op = op;
}
~MetaRequest();
- void set_inode(Inode *in);
- Inode *inode();
- Inode *take_inode() {
- Inode *i = _inode;
- _inode = 0;
- return i;
- }
- void set_old_inode(Inode *in);
- Inode *old_inode();
- Inode *take_old_inode() {
- Inode *i = _old_inode;
- _old_inode = NULL;
- return i;
- }
- void set_other_inode(Inode *in);
- Inode *other_inode();
- Inode *take_other_inode() {
- Inode *i = _other_inode;
- _other_inode = 0;
- return i;
+ void set_inode(Inode *in) {
+ _inode = in;
+ }
+ Inode *inode() {
+ return _inode.get();
+ }
+ void take_inode(InodeRef *out) {
+ out->swap(_inode);
+ }
+ void set_old_inode(Inode *in) {
+ _old_inode = in;
+ }
+ Inode *old_inode() {
+ return _old_inode.get();
+ }
+ void take_old_inode(InodeRef *out) {
+ out->swap(_old_inode);
+ }
+ void set_other_inode(Inode *in) {
+ _other_inode = in;
+ }
+ Inode *other_inode() {
+ return _other_inode.get();
+ }
+ void take_other_inode(InodeRef *out) {
+ out->swap(_other_inode);
}
void set_dentry(Dentry *d);
Dentry *dentry();
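With the inline rewrite above, the take_* helpers hand ownership to the caller by swapping into an InodeRef instead of returning a raw pointer, so no manual put is needed. A hedged usage sketch (consume_target is a hypothetical helper, not part of this change):

    // Transfer the request's target inode to the caller.
    static Inode *consume_target(MetaRequest *req, InodeRef *out) {
      req->take_inode(out);  // swap: req's _inode becomes empty
      return out->get();     // *out now owns the reference; it is dropped
    }                        // automatically when the caller's ref dies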
ceph::unordered_map<uint64_t,int> snap_stag_map;
ceph::unordered_map<int,uint64_t> stag_snap_map;
+ pthread_key_t fuse_req_key;
+ void set_fuse_req(fuse_req_t);
+ fuse_req_t get_fuse_req();
+
struct fuse_args args;
};
-static void fuse_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
+static CephFuse::Handle *fuse_ll_req_prepare(fuse_req_t req)
{
CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ cfuse->set_fuse_req(req);
+ return cfuse;
+}
+
+static void fuse_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
struct fuse_entry_param fe;
Inode *i2, *i1 = cfuse->iget(parent); // see below
static void fuse_ll_forget(fuse_req_t req, fuse_ino_t ino,
long unsigned nlookup)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
cfuse->client->ll_forget(cfuse->iget(ino), nlookup+1);
fuse_reply_none(req);
}
static void fuse_ll_getattr(fuse_req_t req, fuse_ino_t ino,
struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
struct stat stbuf;
static void fuse_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
int to_set, struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
static void fuse_ll_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
const char *value, size_t size, int flags)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
static void fuse_ll_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
char buf[size];
static void fuse_ll_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
size_t size)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
char buf[size];
static void fuse_ll_removexattr(fuse_req_t req, fuse_ino_t ino,
const char *name)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
static void fuse_ll_opendir(fuse_req_t req, fuse_ino_t ino,
struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
void *dirp;
- int r = cfuse->client->ll_opendir(in, (dir_result_t **) &dirp, ctx->uid,
- ctx->gid);
+ int r = cfuse->client->ll_opendir(in, (dir_result_t **)&dirp,
+ ctx->uid, ctx->gid);
if (r >= 0) {
fi->fh = (long)dirp;
fuse_reply_open(req, fi);
static void fuse_ll_readlink(fuse_req_t req, fuse_ino_t ino)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
char buf[PATH_MAX + 1]; // leave room for a null terminator
static void fuse_ll_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
mode_t mode, dev_t rdev)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *i2, *i1 = cfuse->iget(parent);
struct fuse_entry_param fe;
static void fuse_ll_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
mode_t mode)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *i2, *i1 = cfuse->iget(parent);
struct fuse_entry_param fe;
static void fuse_ll_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(parent);
static void fuse_ll_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(parent);
static void fuse_ll_symlink(fuse_req_t req, const char *existing,
fuse_ino_t parent, const char *name)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *i2, *i1 = cfuse->iget(parent);
struct fuse_entry_param fe;
static void fuse_ll_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
fuse_ino_t newparent, const char *newname)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(parent);
Inode *nin = cfuse->iget(newparent);
static void fuse_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent,
const char *newname)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
Inode *nin = cfuse->iget(newparent);
static void fuse_ll_open(fuse_req_t req, fuse_ino_t ino,
struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *in = cfuse->iget(ino);
Fh *fh = NULL;
static void fuse_ll_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
Fh *fh = reinterpret_cast<Fh*>(fi->fh);
bufferlist bl;
int r = cfuse->client->ll_read(fh, off, size, &bl);
static void fuse_ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf,
size_t size, off_t off, struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
Fh *fh = reinterpret_cast<Fh*>(fi->fh);
int r = cfuse->client->ll_write(fh, off, size, buf);
if (r >= 0)
static void fuse_ll_flush(fuse_req_t req, fuse_ino_t ino,
struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
Fh *fh = reinterpret_cast<Fh*>(fi->fh);
int r = cfuse->client->ll_flush(fh);
fuse_reply_err(req, -r);
static void fuse_ll_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd, void *arg, struct fuse_file_info *fi,
unsigned flags, const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
if (flags & FUSE_IOCTL_COMPAT) {
fuse_reply_err(req, ENOSYS);
struct ceph_file_layout layout;
struct ceph_ioctl_layout l;
Fh *fh = (Fh*)fi->fh;
- cfuse->client->ll_file_layout(fh->inode, &layout);
+ cfuse->client->ll_file_layout(fh, &layout);
l.stripe_unit = layout.fl_stripe_unit;
l.stripe_count = layout.fl_stripe_count;
l.object_size = layout.fl_object_size;
off_t offset, off_t length,
struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
Fh *fh = (Fh*)fi->fh;
int r = cfuse->client->ll_fallocate(fh, mode, offset, length);
fuse_reply_err(req, -r);
static void fuse_ll_release(fuse_req_t req, fuse_ino_t ino,
struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
Fh *fh = reinterpret_cast<Fh*>(fi->fh);
int r = cfuse->client->ll_release(fh);
fuse_reply_err(req, -r);
static void fuse_ll_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
Fh *fh = reinterpret_cast<Fh*>(fi->fh);
int r = cfuse->client->ll_fsync(fh, datasync);
fuse_reply_err(req, -r);
static void fuse_ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
off_t off, struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
dir_result_t *dirp = reinterpret_cast<dir_result_t*>(fi->fh);
cfuse->client->seekdir(dirp, off);
static void fuse_ll_releasedir(fuse_req_t req, fuse_ino_t ino,
struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
dir_result_t *dirp = reinterpret_cast<dir_result_t*>(fi->fh);
cfuse->client->ll_releasedir(dirp);
fuse_reply_err(req, 0);
static void fuse_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name,
mode_t mode, struct fuse_file_info *fi)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
const struct fuse_ctx *ctx = fuse_req_ctx(req);
Inode *i1 = cfuse->iget(parent), *i2;
struct fuse_entry_param fe;
static void fuse_ll_statfs(fuse_req_t req, fuse_ino_t ino)
{
struct statvfs stbuf;
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
Inode *in = cfuse->iget(ino);
int r = cfuse->client->ll_statfs(in, &stbuf);
static void fuse_ll_getlk(fuse_req_t req, fuse_ino_t ino,
struct fuse_file_info *fi, struct flock *lock)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
Fh *fh = reinterpret_cast<Fh*>(fi->fh);
int r = cfuse->client->ll_getlk(fh, lock, fi->lock_owner);
static void fuse_ll_setlk(fuse_req_t req, fuse_ino_t ino,
struct fuse_file_info *fi, struct flock *lock, int sleep)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
Fh *fh = reinterpret_cast<Fh*>(fi->fh);
// must use multithread if operation may block
static void fuse_ll_interrupt(fuse_req_t req, void* data)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
cfuse->client->ll_interrupt(data);
}
static void fuse_ll_flock(fuse_req_t req, fuse_ino_t ino,
struct fuse_file_info *fi, int cmd)
{
- CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+ CephFuse::Handle *cfuse = fuse_ll_req_prepare(req);
Fh *fh = (Fh*)fi->fh;
// must use multithread if operation may block
}
#endif
-#if 0
-static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids)
+static int getgroups_cb(void *handle, gid_t **sgids)
{
-#ifdef HAVE_FUSE_GETGROUPS
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
+ CephFuse::Handle *cfuse = (CephFuse::Handle *)handle;
+ fuse_req_t req = cfuse->get_fuse_req();
+
assert(sgids);
- int c = fuse_getgroups(0, NULL);
+ int c = fuse_req_getgroups(req, 0, NULL);
if (c < 0) {
return c;
}
if (!*sgids) {
return -ENOMEM;
}
- c = fuse_getgroups(c, *sgids);
+ c = fuse_req_getgroups(req, c, *sgids);
if (c < 0) {
free(*sgids);
return c;
}
return c;
+#else
+ return -ENOSYS;
#endif
- return 0;
}
-#endif
static void ino_invalidate_cb(void *handle, vinodeno_t vino, int64_t off,
int64_t len)
if (ch)
fuse_unmount(mountpoint, ch);
+ pthread_key_delete(fuse_req_key);
}
int CephFuse::Handle::init(int argc, const char *argv[])
{
+
+ int r = pthread_key_create(&fuse_req_key, NULL);
+ if (r) {
+ derr << "pthread_key_create failed." << dendl;
+ return r;
+ }
+
// set up fuse argc/argv
int newargc = 0;
const char **newargv = (const char **) malloc((argc + 10) * sizeof(char *));
dentry_cb: dentry_invalidate_cb,
switch_intr_cb: switch_interrupt_cb,
remount_cb: remount_cb,
- /*
- * this is broken:
- *
- * - the cb needs the request handle to be useful; we should get the
- * gids in the method here in fuse_ll.c and pass the gid list in,
- * not use a callback.
- * - the callback mallocs the list but it is not free()'d
- *
- * so disable it for now...
- getgroups_cb: getgroups_cb,
- */
+ getgroups_cb: getgroups_cb,
};
client->ll_register_callbacks(&args);
return fino;
}
+void CephFuse::Handle::set_fuse_req(fuse_req_t req)
+{
+ pthread_setspecific(fuse_req_key, (void*)req);
+}
+
+fuse_req_t CephFuse::Handle::get_fuse_req()
+{
+ return (fuse_req_t) pthread_getspecific(fuse_req_key);
+}
+
+
CephFuse::CephFuse(Client *c, int fd) : _handle(new CephFuse::Handle(c, fd))
{
}
#define RBD_SNAP_KEY_PREFIX "snapshot_"
#define RBD_DIR_ID_KEY_PREFIX "id_"
#define RBD_DIR_NAME_KEY_PREFIX "name_"
+#define RBD_MAX_OBJECT_MAP_OBJECT_COUNT 256000000
static int snap_read_header(cls_method_context_t hctx, bufferlist& bl)
{
return -EINVAL;
}
+ // protect against excessive memory requirements
+ if (object_count > RBD_MAX_OBJECT_MAP_OBJECT_COUNT) {
+ CLS_ERR("object map too large: %" PRIu64, object_count);
+ return -EINVAL;
+ }
+
BitVector<2> object_map;
int r = object_map_read(hctx, object_map);
if ((r < 0) && (r != -ENOENT)) {
initialized = true;
}
+ void set_epoch(uint64_t epoch) {
+ instance_entry.versioned_epoch = epoch;
+ }
int unlink_list_entry() {
string list_idx;
return ret;
}
- ret = olh.init(NULL);
+ bool olh_found;
+ ret = olh.init(&olh_found);
if (ret < 0) {
CLS_LOG(0, "ERROR: olh.init() returned ret=%d", ret);
return ret;
}
+ if (!olh_found) {
+ bool instance_only = false;
+ cls_rgw_obj_key key(dest_key.name);
+ ret = convert_plain_entry_to_versioned(hctx, key, true, instance_only);
+ if (ret < 0) {
+ CLS_LOG(0, "ERROR: convert_plain_entry_to_versioned ret=%d", ret);
+ return ret;
+ }
+ olh.update(dest_key, false);
+ olh.set_tag(op.olh_tag);
+
+ obj.set_epoch(1);
+ }
+
if (!olh.start_modify(op.olh_epoch)) {
ret = obj.unlink_list_entry();
if (ret < 0) {
int cls_rgw_bucket_unlink_instance(librados::IoCtx& io_ctx, const string& oid,
const cls_rgw_obj_key& key, const string& op_tag,
- uint64_t olh_epoch, bool log_op)
+ const string& olh_tag, uint64_t olh_epoch, bool log_op)
{
bufferlist in, out;
struct rgw_cls_unlink_instance_op call;
call.key = key;
call.op_tag = op_tag;
call.olh_epoch = olh_epoch;
+ call.olh_tag = olh_tag;
call.log_op = log_op;
::encode(call, in);
int r = io_ctx.exec(oid, "rgw", "bucket_unlink_instance", in, out);
bool delete_marker, const string& op_tag, struct rgw_bucket_dir_entry_meta *meta,
uint64_t olh_epoch, bool log_op);
int cls_rgw_bucket_unlink_instance(librados::IoCtx& io_ctx, const string& oid, const cls_rgw_obj_key& key, const string& op_tag,
- uint64_t olh_epoch, bool log_op);
+ const string& olh_tag, uint64_t olh_epoch, bool log_op);
int cls_rgw_get_olh_log(librados::IoCtx& io_ctx, string& oid, librados::ObjectReadOperation& op, const cls_rgw_obj_key& olh, uint64_t ver_marker,
const string& olh_tag,
map<uint64_t, vector<struct rgw_bucket_olh_log_entry> > *log, bool *is_truncated);
uint64_t olh_epoch;
bool log_op;
uint16_t bilog_flags;
+ string olh_tag;
rgw_cls_unlink_instance_op() : olh_epoch(0), log_op(false), bilog_flags(0) {}
void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(key, bl);
::encode(op_tag, bl);
::encode(olh_epoch, bl);
::encode(log_op, bl);
::encode(bilog_flags, bl);
+ ::encode(olh_tag, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
::decode(key, bl);
::decode(op_tag, bl);
::decode(olh_epoch, bl);
::decode(log_op, bl);
::decode(bilog_flags, bl);
+ if (struct_v >= 2) {
+ ::decode(olh_tag, bl);
+ }
DECODE_FINISH(bl);
}
#include "Cycles.h"
double Cycles::cycles_per_sec = 0;
-static Initialize _(Cycles::init);
/**
* Perform once-only overall initialization for the Cycles class, such
- * as calibrating the clock frequency. This method is invoked automatically
- * during initialization, but it may be invoked explicitly by other modules
- * to ensure that initialization occurs before those modules initialize
- * themselves.
+ * as calibrating the clock frequency. This method must be called
+ * before using the Cycles module.
+ *
+ * It is not initialized by default because the timing loops cause
+ * general process startup times to balloon
+ * (http://tracker.ceph.com/issues/15225).
*/
void Cycles::init()
{
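Since the automatic Initialize hook is gone, any caller must now run Cycles::init() before its first measurement. A hedged usage sketch (Cycles::rdtsc() and Cycles::to_seconds() are assumed from the existing module, not introduced by this diff):

    #include "common/Cycles.h"

    void timed_section() {
      Cycles::init();                    // calibrate once, before first use
      uint64_t start = Cycles::rdtsc();
      // ... work being measured ...
      double secs = Cycles::to_seconds(Cycles::rdtsc() - start);
      (void)secs;                        // e.g. feed into a perf counter
    }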
class PointerWQ : public WorkQueue_ {
public:
PointerWQ(string n, time_t ti, time_t sti, ThreadPool* p)
- : WorkQueue_(n, ti, sti), m_pool(p) {
+ : WorkQueue_(n, ti, sti), m_pool(p), m_processing(0) {
m_pool->add_work_queue(this);
}
~PointerWQ() {
m_pool->remove_work_queue(this);
+ assert(m_processing == 0);
}
void drain() {
+ {
+ // if this queue is empty and not processing, don't wait for other
+ // queues to finish processing
+ Mutex::Locker l(m_pool->_lock);
+ if (m_processing == 0 && m_items.empty()) {
+ return;
+ }
+ }
m_pool->drain(this);
}
void queue(T *item) {
m_items.push_back(item);
m_pool->_cond.SignalOne();
}
+ bool empty() {
+ Mutex::Locker l(m_pool->_lock);
+ return _empty();
+ }
protected:
virtual void _clear() {
assert(m_pool->_lock.is_locked());
return NULL;
}
+ ++m_processing;
T *item = m_items.front();
m_items.pop_front();
return item;
process(reinterpret_cast<T *>(item));
}
virtual void _void_process_finish(void *item) {
+ assert(m_pool->_lock.is_locked());
+ assert(m_processing > 0);
+ --m_processing;
}
virtual void process(T *item) = 0;
private:
ThreadPool *m_pool;
std::list<T *> m_items;
+ uint32_t m_processing;
};
private:
vector<WorkQueue_*> work_queues;
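The new m_processing counter closes a window where drain() could return while a dequeued item was still being processed outside the pool lock; _void_process_finish() now decrements it under the lock, so drain() and the destructor assert observe in-flight work. A hedged consumer sketch (Item and ItemWQ are illustrative names; the constructor arguments follow the signature above):

    struct Item { int id; };

    class ItemWQ : public ThreadPool::PointerWQ<Item> {
    public:
      ItemWQ(ThreadPool *tp)
        : ThreadPool::PointerWQ<Item>("ItemWQ", 60, 0, tp) {}
    protected:
      virtual void process(Item *item) {
        // runs without the pool lock; m_processing stays non-zero until
        // _void_process_finish() fires, so drain() waits for this call
        delete item;
      }
    };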
BOOST_STATIC_ASSERT((_bit_count != 0) && !(_bit_count & (_bit_count - 1)));
BOOST_STATIC_ASSERT(_bit_count <= BITS_PER_BYTE);
public:
+ static const uint32_t BLOCK_SIZE;
class ConstReference {
public:
};
+template <uint8_t _b>
+const uint32_t BitVector<_b>::BLOCK_SIZE = 4096;
+
template <uint8_t _b>
BitVector<_b>::BitVector() : m_size(0), m_crc_enabled(true)
{
}
m_size = size;
- uint64_t block_count = (buffer_size + CEPH_PAGE_SIZE - 1) / CEPH_PAGE_SIZE;
+ uint64_t block_count = (buffer_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
m_data_crcs.resize(block_count);
}
template <uint8_t _b>
void BitVector<_b>::encode_data(bufferlist& bl, uint64_t byte_offset,
uint64_t byte_length) const {
- assert(byte_offset % CEPH_PAGE_SIZE == 0);
+ assert(byte_offset % BLOCK_SIZE == 0);
assert(byte_offset + byte_length == m_data.length() ||
- byte_length % CEPH_PAGE_SIZE == 0);
+ byte_length % BLOCK_SIZE == 0);
uint64_t end_offset = byte_offset + byte_length;
while (byte_offset < end_offset) {
- uint64_t len = MIN(CEPH_PAGE_SIZE, end_offset - byte_offset);
+ uint64_t len = MIN(BLOCK_SIZE, end_offset - byte_offset);
bufferlist bit;
bit.substr_of(m_data, byte_offset, len);
- m_data_crcs[byte_offset / CEPH_PAGE_SIZE] = bit.crc32c(0);
+ m_data_crcs[byte_offset / BLOCK_SIZE] = bit.crc32c(0);
bl.claim_append(bit);
- byte_offset += CEPH_PAGE_SIZE;
+ byte_offset += BLOCK_SIZE;
}
}
template <uint8_t _b>
void BitVector<_b>::decode_data(bufferlist::iterator& it, uint64_t byte_offset) {
- assert(byte_offset % CEPH_PAGE_SIZE == 0);
+ assert(byte_offset % BLOCK_SIZE == 0);
if (it.end()) {
return;
}
}
while (byte_offset < end_offset) {
- uint64_t len = MIN(CEPH_PAGE_SIZE, end_offset - byte_offset);
+ uint64_t len = MIN(BLOCK_SIZE, end_offset - byte_offset);
bufferlist bit;
it.copy(len, bit);
if (m_crc_enabled &&
- m_data_crcs[byte_offset / CEPH_PAGE_SIZE] != bit.crc32c(0)) {
+ m_data_crcs[byte_offset / BLOCK_SIZE] != bit.crc32c(0)) {
throw buffer::malformed_input("invalid data block CRC");
}
data.append(bit);
void BitVector<_b>::get_data_extents(uint64_t offset, uint64_t length,
uint64_t *byte_offset,
uint64_t *byte_length) const {
- // read CEPH_PAGE_SIZE-aligned chunks
+ // read BLOCK_SIZE-aligned chunks
assert(length > 0 && offset + length <= m_size);
uint64_t shift;
compute_index(offset, byte_offset, &shift);
- *byte_offset -= (*byte_offset % CEPH_PAGE_SIZE);
+ *byte_offset -= (*byte_offset % BLOCK_SIZE);
uint64_t end_offset;
compute_index(offset + length - 1, &end_offset, &shift);
- end_offset += (CEPH_PAGE_SIZE - (end_offset % CEPH_PAGE_SIZE));
+ end_offset += (BLOCK_SIZE - (end_offset % BLOCK_SIZE));
assert(*byte_offset <= end_offset);
*byte_length = end_offset - *byte_offset;
throw buffer::malformed_input("incorrect header CRC");
}
- uint64_t block_count = (m_data.length() + CEPH_PAGE_SIZE - 1) / CEPH_PAGE_SIZE;
+ uint64_t block_count = (m_data.length() + BLOCK_SIZE - 1) / BLOCK_SIZE;
::decode(m_data_crcs, footer_it);
if (m_data_crcs.size() != block_count) {
throw buffer::malformed_input("invalid data block CRCs");
_admin_socket->register_command("log dump", "log dump", _admin_hook, "dump recent log entries to log file");
_admin_socket->register_command("log reopen", "log reopen", _admin_hook, "reopen log file");
- _crypto_none = new CryptoNone;
- _crypto_aes = new CryptoAES;
+ _crypto_none = CryptoHandler::create(CEPH_CRYPTO_NONE);
+ _crypto_aes = CryptoHandler::create(CEPH_CRYPTO_AES);
}
CephContext::~CephContext()
struct md_config_t;
class CephContextHook;
class CephContextObs;
-class CryptoNone;
-class CryptoAES;
class CryptoHandler;
namespace ceph {
std::map<std::string, SingletonWrapper*> _associated_objs;
// crypto
- CryptoNone *_crypto_none;
- CryptoAES *_crypto_aes;
+ CryptoHandler *_crypto_none;
+ CryptoHandler *_crypto_aes;
// experimental
CephContextObs *_cct_obs;
OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for clock drift warnings
OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds)
OPTION(mon_accept_timeout, OPT_FLOAT, 10.0) // on leader, if paxos update isn't accepted
+OPTION(mon_timecheck_skew_interval, OPT_FLOAT, 30.0) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s
OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30) // min # pgs per (in) osd before we warn the admin
OPTION(mon_max_log_entries_per_event, OPT_INT, 4096)
OPTION(mon_reweight_min_pgs_per_osd, OPT_U64, 10) // min pgs per osd for reweight-by-pg command
OPTION(mon_reweight_min_bytes_per_osd, OPT_U64, 100*1024*1024) // min bytes per osd for reweight-by-utilization command
+OPTION(mon_reweight_max_osds, OPT_INT, 4) // max osds to change per reweight-by-* command
+OPTION(mon_reweight_max_change, OPT_DOUBLE, 0.05)
OPTION(mon_health_data_update_interval, OPT_FLOAT, 60.0)
OPTION(mon_health_to_clog, OPT_BOOL, true)
OPTION(mon_health_to_clog_interval, OPT_INT, 3600)
OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd
OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd
OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
+OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
OPTION(rgw_bucket_quota_ttl, OPT_INT, 600) // time for cached bucket stats to be cached within rgw instance
OPTION(rgw_bucket_quota_soft_threshold, OPT_DOUBLE, 0.95) // threshold from which we don't rely on cached info for quota decisions
OPTION(rgw_bucket_quota_cache_size, OPT_INT, 10000) // number of entries in bucket quota cache
+OPTION(rgw_bucket_default_quota_max_objects, OPT_INT, -1) // number of objects allowed
+OPTION(rgw_bucket_default_quota_max_size, OPT_LONGLONG, -1) // max total bucket size, in kB
OPTION(rgw_expose_bucket, OPT_BOOL, false) // Return the bucket name in the 'Bucket' response header
OPTION(rgw_user_quota_sync_interval, OPT_INT, 3600 * 24) // time period for accumulating modified buckets before syncing entire user stats
OPTION(rgw_user_quota_sync_idle_users, OPT_BOOL, false) // whether stats for idle users be fully synced
OPTION(rgw_user_quota_sync_wait_time, OPT_INT, 3600 * 24) // min time between two full stats sync for non-idle users
+OPTION(rgw_user_default_quota_max_objects, OPT_INT, -1) // number of objects allowed
+OPTION(rgw_user_default_quota_max_size, OPT_LONGLONG, -1) // max total size across a user's buckets, in kB
OPTION(rgw_multipart_min_part_size, OPT_INT, 5 * 1024 * 1024) // min size for each part (except for last one) in multipart upload
{
if (o.is_max())
return out << "MAX";
+ out << o.pool << '/';
out << std::hex << o.get_hash() << std::dec;
+ if (o.nspace.length())
+ out << ":" << o.nspace;
if (o.get_key().length())
out << "." << o.get_key();
out << "/" << o.oid << "/" << o.snap;
- out << "/" << o.nspace << "/" << o.pool;
return out;
}
if (show_time)
return t.localtime(os) << " ";
else
- return os << " ";
+ return os;
}
ostream& ObjBencher::out(ostream& os)
if (i % 20 == 0) {
if (i > 0)
- cur_time.localtime(cout) << "min lat: " << data.min_latency
+ cur_time.localtime(cout) << " min lat: " << data.min_latency
<< " max lat: " << data.max_latency
<< " avg lat: " << data.avg_latency << std::endl;
//I'm naughty and don't reset the fill
bencher->out(cout, cur_time) << setfill(' ')
- << setw(5) << "sec"
- << setw(8) << "Cur ops"
- << setw(10) << "started"
- << setw(10) << "finished"
- << setw(10) << "avg MB/s"
- << setw(10) << "cur MB/s"
- << setw(10) << "last lat"
- << setw(10) << "avg lat" << std::endl;
+ << setw(5) << "sec"
+ << setw(8) << "Cur ops"
+ << setw(10) << "started"
+ << setw(10) << "finished"
+ << setw(10) << "avg MB/s"
+ << setw(10) << "cur MB/s"
+ << setw(12) << "last lat(s)"
+ << setw(12) << "avg lat(s)" << std::endl;
}
if (cycleSinceChange)
bandwidth = (double)(data.finished - previous_writes)
if (previous_writes != data.finished) {
previous_writes = data.finished;
cycleSinceChange = 0;
- bencher->out(cout, cur_time) << setfill(' ')
- << setw(5) << i
- << setw(8) << data.in_flight
- << setw(10) << data.started
- << setw(10) << data.finished
- << setw(10) << avg_bandwidth
- << setw(10) << bandwidth
- << setw(10) << (double)data.cur_latency
- << setw(10) << data.avg_latency << std::endl;
+ bencher->out(cout, cur_time)
+ << setfill(' ')
+ << setw(5) << i
+ << ' ' << setw(7) << data.in_flight
+ << ' ' << setw(9) << data.started
+ << ' ' << setw(9) << data.finished
+ << ' ' << setw(9) << avg_bandwidth
+ << ' ' << setw(9) << bandwidth
+ << ' ' << setw(11) << (double)data.cur_latency
+ << ' ' << setw(11) << data.avg_latency << std::endl;
}
else {
- bencher->out(cout, cur_time) << setfill(' ')
- << setw(5) << i
- << setw(8) << data.in_flight
- << setw(10) << data.started
- << setw(10) << data.finished
- << setw(10) << avg_bandwidth
- << setw(10) << '0'
- << setw(10) << '-'
- << setw(10) << data.avg_latency << std::endl;
+ bencher->out(cout, cur_time)
+ << setfill(' ')
+ << setw(5) << i
+ << ' ' << setw(7) << data.in_flight
+ << ' ' << setw(9) << data.started
+ << ' ' << setw(9) << data.finished
+ << ' ' << setw(9) << avg_bandwidth
+ << ' ' << setw(9) << '0'
+ << ' ' << setw(11) << '-'
+ << ' ' << setw(11) << data.avg_latency << std::endl;
}
++i;
++cycleSinceChange;
double bandwidth;
bandwidth = ((double)data.finished)*((double)data.object_size)/(double)timePassed;
bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
- char bw[20];
- snprintf(bw, sizeof(bw), "%.3lf \n", bandwidth);
out(cout) << "Total time run: " << timePassed << std::endl
<< "Total writes made: " << data.finished << std::endl
<< "Write size: " << data.object_size << std::endl
- << "Bandwidth (MB/sec): " << bw << std::endl
+ << "Bandwidth (MB/sec): " << setprecision(6) << bandwidth << std::endl
<< "Stddev Bandwidth: " << vec_stddev(data.history.bandwidth) << std::endl
<< "Max bandwidth (MB/sec): " << data.idata.max_bandwidth << std::endl
<< "Min bandwidth (MB/sec): " << data.idata.min_bandwidth << std::endl
- << "Average Latency: " << data.avg_latency << std::endl
- << "Stddev Latency: " << vec_stddev(data.history.latency) << std::endl
- << "Max latency: " << data.max_latency << std::endl
- << "Min latency: " << data.min_latency << std::endl;
+ << "Average IOPS: " << (int)(data.finished/timePassed) << std::endl
+ << "Average Latency(s): " << data.avg_latency << std::endl
+ << "Stddev Latency(s): " << vec_stddev(data.history.latency) << std::endl
+ << "Max latency(s): " << data.max_latency << std::endl
+ << "Min latency(s): " << data.min_latency << std::endl;
//write object size/number data for read benchmarks
::encode(data.object_size, b_write);
index[slot] = data.started;
lock.Unlock();
completion_wait(slot);
+ lock.Lock();
r = completion_ret(slot);
if (r < 0) {
cerr << "read got " << r << std::endl;
lock.Unlock();
goto ERR;
}
- lock.Lock();
total_latency += data.cur_latency;
if (data.cur_latency > data.max_latency) data.max_latency = data.cur_latency;
if (data.cur_latency < data.min_latency) data.min_latency = data.cur_latency;
lock.Lock();
++data.started;
++data.in_flight;
- lock.Unlock();
- if (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0) {
- cerr << name[slot] << " is not correct!" << std::endl;
- ++errors;
- } else {
- lock.Unlock();
- }
-
+ lock.Unlock();
name[slot] = newName;
}
double bandwidth;
bandwidth = ((double)data.finished)*((double)data.object_size)/(double)runtime;
bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
- char bw[20];
- snprintf(bw, sizeof(bw), "%.3lf \n", bandwidth);
- out(cout) << "Total time run: " << runtime << std::endl
+ out(cout) << "Total time run: " << runtime << std::endl
<< "Total reads made: " << data.finished << std::endl
<< "Read size: " << data.object_size << std::endl
- << "Bandwidth (MB/sec): " << bw << std::endl
- << "Average Latency: " << data.avg_latency << std::endl
- << "Max latency: " << data.max_latency << std::endl
- << "Min latency: " << data.min_latency << std::endl;
+ << "Bandwidth (MB/sec): " << setprecision(6) << bandwidth << std::endl
+ << "Average IOPS: " << (int)(data.finished/runtime) << std::endl
+ << "Average Latency(s): " << data.avg_latency << std::endl
+ << "Max latency(s): " << data.max_latency << std::endl
+ << "Min latency(s): " << data.min_latency << std::endl;
completions_done();
- return 0;
+ return (errors > 0 ? -EIO : 0);
ERR:
lock.Lock();
double bandwidth;
bandwidth = ((double)data.finished)*((double)data.object_size)/(double)runtime;
bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
- char bw[20];
- snprintf(bw, sizeof(bw), "%.3lf \n", bandwidth);
- out(cout) << "Total time run: " << runtime << std::endl
+ out(cout) << "Total time run: " << runtime << std::endl
<< "Total reads made: " << data.finished << std::endl
<< "Read size: " << data.object_size << std::endl
- << "Bandwidth (MB/sec): " << bw << std::endl
- << "Average Latency: " << data.avg_latency << std::endl
- << "Max latency: " << data.max_latency << std::endl
- << "Min latency: " << data.min_latency << std::endl;
+ << "Bandwidth (MB/sec): " << setprecision(6) << bandwidth << std::endl
+ << "Average IOPS: " << (int)(data.finished/runtime) << std::endl
+ << "Average Latency(s): " << data.avg_latency << std::endl
+ << "Max latency(s): " << data.max_latency << std::endl
+ << "Min latency(s): " << data.min_latency << std::endl;
completions_done();
- return 0;
+ return (errors > 0 ? -EIO : 0);
ERR:
lock.Lock();
#include "strtol.h"
-#include <errno.h>
-#include <limits.h>
+#include <cerrno>
+#include <climits>
+#include <cstdlib>
#include <sstream>
-#include <stdlib.h>
using std::ostringstream;
return ret;
}
-uint64_t strict_sistrtoll(const char *str, std::string *err)
+template<typename T>
+T strict_si_cast(const char *str, std::string *err)
{
std::string s(str);
if (s.empty()) {
*err = "strict_sistrtoll: value not specified";
return 0;
}
- const char &u = s.at(s.size()-1); //str[std::strlen(str)-1];
+ const char &u = *s.rbegin();
int m = 0;
if (u == 'B')
m = 0;
else
m = -1;
- const char *v = NULL;
if (m >= 0)
- s = std::string(str, s.size()-1);
- v = s.c_str();
-
- long long r_ll = strict_strtoll(v, 10, err);
+ s.erase(s.size()-1);
+ else
+ m = 0;
- if (r_ll < 0) {
+ long long ll = strict_strtoll(s.c_str(), 10, err);
+ if (ll < 0 && !std::numeric_limits<T>::is_signed) {
*err = "strict_sistrtoll: value should not be negative";
return 0;
}
+ if (ll < (long long)std::numeric_limits<T>::min() >> m) {
+ *err = "strict_sistrtoll: value seems to be too small";
+ return 0;
+ }
+ if (ll > std::numeric_limits<T>::max() >> m) {
+ *err = "strict_sistrtoll: value seems to be too large";
+ return 0;
- uint64_t r = r_ll;
- if (err->empty() && m > 0) {
- if (r > (std::numeric_limits<uint64_t>::max() >> m)) {
- *err = "strict_sistrtoll: value seems to be too large";
- return 0;
- }
- r <<= m;
}
- return r;
+ return (ll << m);
}
-template <>
-uint64_t strict_si_cast(const char *str, std::string *err) {
- return strict_sistrtoll(str, err);
+template int strict_si_cast<int>(const char *str, std::string *err);
+
+template long long strict_si_cast<long long>(const char *str, std::string *err);
+
+template uint64_t strict_si_cast<uint64_t>(const char *str, std::string *err);
+
+uint64_t strict_sistrtoll(const char *str, std::string *err)
+{
+ return strict_si_cast<uint64_t>(str, err);
}
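For reference, a usage sketch of the templated cast; the suffix-to-shift table ('K' -> 10, 'M' -> 20, ...) lives in the branches elided from this hunk, so treat it as an assumption here:

    #include "common/strtol.h"
    #include <iostream>

    int main() {
      std::string err;
      int v = strict_si_cast<int>("100K", &err);  // 100 << 10 == 102400
      if (!err.empty())
        std::cerr << err << std::endl;  // e.g. "8G" into int trips the range check
      else
        std::cout << v << std::endl;
      return 0;
    }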
uint64_t strict_sistrtoll(const char *str, std::string *err);
-template <typename Target>
-Target strict_si_cast(const char *str, std::string *err) {
- uint64_t ret = strict_sistrtoll(str, err);
- if (!err->empty())
- return ret;
- if (ret > (uint64_t)std::numeric_limits<Target>::max()) {
- err->append("The option value '");
- err->append(str);
- err->append("' seems to be too large");
- return 0;
- }
- return ret;
-}
-
-template <>
-uint64_t strict_si_cast(const char *str, std::string *err);
+template<typename T>
+T strict_si_cast(const char *str, std::string *err);
#endif
output_ceph_version();
}
-static void pidfile_remove_void(void)
-{
- pidfile_remove();
-}
-
-int global_init_prefork(CephContext *cct, int flags)
+int global_init_prefork(CephContext *cct, int)
{
if (g_code_env != CODE_ENVIRONMENT_DAEMON)
return -1;
+
const md_config_t *conf = cct->_conf;
if (!conf->daemonize) {
- if (atexit(pidfile_remove_void)) {
- derr << "global_init_daemonize: failed to set pidfile_remove function "
- << "to run at exit." << dendl;
- }
- pidfile_write(g_conf);
+ if (pidfile_write(g_conf) < 0)
+ exit(1);
return -1;
}
<< cpp_strerror(ret) << dendl;
exit(1);
}
-
+
global_init_postfork_start(cct);
global_init_postfork_finish(cct, flags);
}
// restart log thread
g_ceph_context->_log->start();
- if (atexit(pidfile_remove_void)) {
- derr << "global_init_daemonize: failed to set pidfile_remove function "
- << "to run at exit." << dendl;
- }
-
/* This is the old trick where we make file descriptors 0, 1, and possibly 2
* point to /dev/null.
*
exit(1);
}
- pidfile_write(g_conf);
+ if (pidfile_write(g_conf) < 0)
+ exit(1);
}
void global_init_postfork_finish(CephContext *cct, int flags)
#include "include/compat.h"
+//
+// derr can be used by functions called exclusively from pidfile_write.
+//
+// cerr must be used by functions called from pidfile_remove, because
+// logging is no longer functional by the time it runs. cerr output is
+// lost when the caller is daemonized, but shows up when running in the
+// foreground (-f).
+//
#define dout_prefix *_dout
-static char pid_file[PATH_MAX] = "";
+struct pidfh {
+ int pf_fd;
+ char pf_path[PATH_MAX + 1];
+ dev_t pf_dev;
+ ino_t pf_ino;
-int pidfile_write(const md_config_t *conf)
-{
- int ret, fd;
-
- if (conf->pid_file.empty()) {
- return pidfile_remove();
+ pidfh() {
+ reset();
}
- snprintf(pid_file, PATH_MAX, "%s", conf->pid_file.c_str());
-
- fd = TEMP_FAILURE_RETRY(::open(pid_file,
- O_CREAT|O_TRUNC|O_WRONLY, 0644));
- if (fd < 0) {
- int err = errno;
- derr << "write_pid_file: failed to open pid file '"
- << pid_file << "': " << cpp_strerror(err) << dendl;
- return err;
+ ~pidfh() {
+ remove();
}
- char buf[20];
- int len = snprintf(buf, sizeof(buf), "%d\n", getpid());
- ret = safe_write(fd, buf, len);
- if (ret < 0) {
- derr << "write_pid_file: failed to write to pid file '"
- << pid_file << "': " << cpp_strerror(ret) << dendl;
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return ret;
+ bool is_open() {
+ return pf_path[0] != '\0' && pf_fd != -1;
}
- if (TEMP_FAILURE_RETRY(::close(fd))) {
- ret = errno;
- derr << "SimpleMessenger::write_pid_file: failed to close to pid file '"
- << pid_file << "': " << cpp_strerror(ret) << dendl;
- return -ret;
+ void reset() {
+ pf_fd = -1;
+ memset(pf_path, 0, sizeof(pf_path));
+ pf_dev = 0;
+ pf_ino = 0;
}
+ int verify();
+ int remove();
+ int open(const md_config_t *conf);
+ int write();
+};
+
+static pidfh *pfh = NULL;
+int pidfh::verify() {
+ // check that the file we opened still is the same
+ if (pf_fd == -1)
+ return -EINVAL;
+ struct stat st;
+ if (stat(pf_path, &st) == -1)
+ return -errno;
+ if (st.st_dev != pf_dev || st.st_ino != pf_ino)
+ return -ESTALE;
return 0;
}
-int pidfile_remove(void)
+int pidfh::remove()
{
- if (!pid_file[0])
+ if (!pf_path[0])
return 0;
- // only remove it if it has OUR pid in it!
- int fd = TEMP_FAILURE_RETRY(::open(pid_file, O_RDONLY));
- if (fd < 0)
+ int ret;
+ if ((ret = verify()) < 0) {
+ if (pf_fd != -1) {
+ ::close(pf_fd);
+ reset();
+ }
+ return ret;
+ }
+
+ // seek to the beginning of the file before reading
+ ret = ::lseek(pf_fd, 0, SEEK_SET);
+ if (ret < 0) {
+ std::cerr << __func__ << " lseek failed "
+ << cpp_strerror(errno) << std::endl;
return -errno;
+ }
+
+ // check that the pid file still has our pid in it
char buf[32];
memset(buf, 0, sizeof(buf));
- ssize_t res = safe_read(fd, buf, sizeof(buf));
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- if (res < 0)
+ ssize_t res = safe_read(pf_fd, buf, sizeof(buf));
+ ::close(pf_fd);
+ if (res < 0) {
+ std::cerr << __func__ << " safe_read failed "
+ << cpp_strerror(-res) << std::endl;
return res;
+ }
+
int a = atoi(buf);
- if (a != getpid())
+ if (a != getpid()) {
+ std::cerr << __func__ << " the pid found in the file is "
+ << a << " which is different from getpid() "
+ << getpid() << std::endl;
return -EDOM;
+ }
+ ret = ::unlink(pf_path);
+ if (ret < 0) {
+ std::cerr << __func__ << " unlink " << pf_path << " failed "
+ << cpp_strerror(errno) << std::endl;
+ return -errno;
+ }
+ reset();
+ return 0;
+}
+
+int pidfh::open(const md_config_t *conf)
+{
+ int len = snprintf(pf_path, sizeof(pf_path),
+ "%s", conf->pid_file.c_str());
+
+ if (len >= (int)sizeof(pf_path))
+ return -ENAMETOOLONG;
+
+ int fd;
+ fd = ::open(pf_path, O_CREAT|O_RDWR, 0644);
+ if (fd < 0) {
+ int err = errno;
+ derr << __func__ << ": failed to open pid file '"
+ << pf_path << "': " << cpp_strerror(err) << dendl;
+ reset();
+ return -err;
+ }
+ struct stat st;
+ if (fstat(fd, &st) == -1) {
+ int err = errno;
+ derr << __func__ << ": failed to fstat pid file '"
+ << pf_path << "': " << cpp_strerror(err) << dendl;
+ ::close(fd);
+ reset();
+ return -err;
+ }
+
+ pf_fd = fd;
+ pf_dev = st.st_dev;
+ pf_ino = st.st_ino;
- res = ::unlink(pid_file);
- if (res)
+ struct flock l = { F_WRLCK, SEEK_SET, 0, 0, 0 };
+ int r = ::fcntl(pf_fd, F_SETLK, &l);
+ if (r < 0) {
+ derr << __func__ << ": failed to lock pidfile "
+ << pf_path << " because another process locked it." << dendl;
+ ::close(pf_fd);
+ reset();
+ return -errno;
+ }
+ return 0;
+}
+
+int pidfh::write()
+{
+ if (!is_open())
+ return 0;
+
+ char buf[32];
+ int len = snprintf(buf, sizeof(buf), "%d\n", getpid());
+ if (::ftruncate(pf_fd, 0) < 0) {
+ int err = errno;
+ derr << __func__ << ": failed to ftruncate the pid file '"
+ << pf_path << "': " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ ssize_t res = safe_write(pf_fd, buf, len);
+ if (res < 0) {
+ derr << __func__ << ": failed to write to pid file '"
+ << pf_path << "': " << cpp_strerror(-res) << dendl;
return res;
+ }
+ return 0;
+}
+
+void pidfile_remove()
+{
+ delete pfh;
+ pfh = NULL;
+}
+
+int pidfile_write(const md_config_t *conf)
+{
+ if (conf->pid_file.empty())
+ return 0;
+
+ assert(!pfh);
+
+ pfh = new pidfh();
+ if (atexit(pidfile_remove)) {
+ derr << __func__ << ": failed to set pidfile_remove function "
+ << "to run at exit." << dendl;
+ return -EINVAL;
+ }
+
+ int r = pfh->open(conf);
+ if (r != 0) {
+ pidfile_remove();
+ return r;
+ }
+
+ r = pfh->write();
+ if (r != 0) {
+ pidfile_remove();
+ return r;
+ }
- pid_file[0] = '\0';
return 0;
}
// Remove the pid file that was previously written by pidfile_write.
// This is safe to call in a signal handler context.
-int pidfile_remove(void);
+void pidfile_remove();
#endif
for (map<uint64_t,string>::const_iterator p = names.begin();
p != names.end();
++p) {
- char s[10];
+ char s[18];
snprintf(s, sizeof(s), "feature_%lld", (unsigned long long)p->first);
f->dump_string(s, p->second);
}
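On the buffer sizing above: snprintf() never overflows the destination, and its return value is the length that would have been written, so truncation is detectable. Illustrative sketch (dump_feature_key is hypothetical):

#include <cinttypes>
#include <cstdio>

// "feature_" (8 chars) + up to 20 digits for a 64-bit value + NUL = 29,
// so 32 bytes always suffices.
void dump_feature_key(uint64_t feature)
{
  char s[32];
  int n = snprintf(s, sizeof(s), "feature_%" PRIu64, feature);
  if (n >= (int)sizeof(s)) {
    // would have been truncated (cannot happen with s[32])
  }
}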
// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
+#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
/* ... */
#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
CEPH_FEATURE_MDS_QUOTA | \
CEPH_FEATURE_CRUSH_V4 | \
CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY | \
+ CEPH_FEATURE_OSD_HITSET_GMT | \
CEPH_FEATURE_HAMMER_0_94_4 | \
0ULL)
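These CEPH_FEATURE_* values are single bits in a 64-bit feature mask, so support checks are plain bit tests. A sketch using the two values defined above:

#include <cstdint>

const uint64_t OSD_HITSET_GMT = 1ULL << 54;  // CEPH_FEATURE_OSD_HITSET_GMT
const uint64_t HAMMER_0_94_4  = 1ULL << 55;  // CEPH_FEATURE_HAMMER_0_94_4

bool supports_gmt_hitset(uint64_t peer_features)
{
  // a peer supports the feature iff the bit is set in its mask
  return (peer_features & OSD_HITSET_GMT) != 0;
}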
# Description: Enable Ceph distributed file system services.
### END INIT INFO
-. /lib/lsb/init-functions
+# TODO: on FreeBSD/OSX, use equivalent script file
+if [ -e /lib/lsb/init-functions ]; then
+ . /lib/lsb/init-functions
+fi
# detect systemd, also check whether the systemd-run binary exists
SYSTEMD_RUN=$(which systemd-run 2>/dev/null)
void C_CacheRead::complete(int r) {
if (!m_enqueued) {
// cache_lock creates a lock ordering issue -- so re-execute this context
- // outside the cache_lock
+ // outside the cache_lock. use the writeback handler's dedicated thread
+ // to avoid blocking a dependent operation
m_enqueued = true;
- m_image_ctx.op_work_queue->queue(this, r);
+ m_image_ctx.writeback_handler->queue(this, r);
return;
}
Context::complete(r);
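C_CacheRead::complete() re-queues itself once so the real completion runs on a thread that holds no conflicting locks. The shape of that "bounce once" pattern, reduced to standard C++ (the inline WorkQueue stands in for librbd's threaded queue):

#include <functional>
#include <iostream>

// A toy work queue that runs callbacks inline; in librbd the queue
// hands the callback to a dedicated thread instead.
struct WorkQueue {
  void queue(std::function<void(int)> fn, int r) { fn(r); }
};

struct CacheReadCtx {
  WorkQueue *wq;
  bool enqueued = false;

  void complete(int r) {
    if (!enqueued) {
      // first invocation may hold cache_lock; defer to the queue so the
      // real completion runs without it
      enqueued = true;
      wq->queue([this](int rr) { complete(rr); }, r);
      return;
    }
    std::cout << "completed with r=" << r << "\n";  // real work goes here
  }
};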
#include "librbd/AsyncOperation.h"
#include "librbd/ImageCtx.h"
#include "common/dout.h"
+#include "common/WorkQueue.h"
#include "include/assert.h"
#define dout_subsys ceph_subsys_rbd
}
};
+struct C_FlushCache : public Context {
+ ImageCtx *image_ctx;
+ Context *on_safe;
+
+ C_FlushCache(ImageCtx *_image_ctx, Context *_on_safe)
+ : image_ctx(_image_ctx), on_safe(_on_safe) {
+ }
+ virtual void finish(int r) {
+ // successful cache flush indicates all IO is now safe
+ RWLock::RLocker owner_locker(image_ctx->owner_lock);
+ image_ctx->flush_cache(on_safe);
+ }
+};
+
+struct C_InvalidateCache : public Context {
+ ImageCtx *image_ctx;
+ bool purge_on_error;
+ bool reentrant_safe;
+ Context *on_finish;
+
+ C_InvalidateCache(ImageCtx *_image_ctx, bool _purge_on_error,
+ bool _reentrant_safe, Context *_on_finish)
+ : image_ctx(_image_ctx), purge_on_error(_purge_on_error),
+ reentrant_safe(_reentrant_safe), on_finish(_on_finish) {
+ }
+ virtual void finish(int r) {
+ assert(image_ctx->cache_lock.is_locked());
+ CephContext *cct = image_ctx->cct;
+
+ if (r == -EBLACKLISTED) {
+ lderr(cct) << "Blacklisted during flush! Purging cache..." << dendl;
+ image_ctx->object_cacher->purge_set(image_ctx->object_set);
+ } else if (r != 0 && purge_on_error) {
+ lderr(cct) << "invalidate cache encountered error "
+ << cpp_strerror(r) << " !Purging cache..." << dendl;
+ image_ctx->object_cacher->purge_set(image_ctx->object_set);
+ } else if (r != 0) {
+ lderr(cct) << "flush_cache returned " << r << dendl;
+ }
+
+ loff_t unclean = image_ctx->object_cacher->release_set(
+ image_ctx->object_set);
+ if (unclean == 0) {
+ r = 0;
+ } else {
+ lderr(cct) << "could not release all objects from cache: "
+ << unclean << " bytes remain" << dendl;
+ r = -EBUSY;
+ }
+
+ if (reentrant_safe) {
+ on_finish->complete(r);
+ } else {
+ image_ctx->op_work_queue->queue(on_finish, r);
+ }
+ }
+
+};
+
+struct C_AsyncCallback : public Context {
+ ImageCtx *image_ctx;
+ Context *on_finish;
+ C_AsyncCallback(ImageCtx *image_ctx, Context *on_finish)
+ : image_ctx(image_ctx), on_finish(on_finish) {
+ }
+ virtual void finish(int r) {
+ image_ctx->op_work_queue->queue(on_finish, r);
+ }
+};
+
+void _flush_async_operations(ImageCtx *ictx, Context *on_finish) {
+ {
+ Mutex::Locker async_ops_locker(ictx->async_ops_lock);
+ if (!ictx->async_ops.empty()) {
+ ldout(ictx->cct, 20) << "flush async operations: " << on_finish << " "
+ << "count=" << ictx->async_ops.size() << dendl;
+ ictx->async_ops.front()->add_flush_context(on_finish);
+ return;
+ }
+ }
+ on_finish->complete(0);
+}
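_flush_async_operations() above either completes the callback immediately or parks it on the oldest in-flight operation, which fires all parked contexts when it finishes. A self-contained sketch of that idea (types simplified to std::function; not the librbd classes):

#include <functional>
#include <list>

struct AsyncOp {
  std::list<std::function<void(int)>> flush_contexts;
  void add_flush_context(std::function<void(int)> c) {
    flush_contexts.push_back(std::move(c));
  }
  void finish() {                 // called when the op completes
    for (auto &c : flush_contexts)
      c(0);                       // everything older than us is now done
  }
};

void flush_async_ops(std::list<AsyncOp*> &ops,
                     std::function<void(int)> on_finish)
{
  if (!ops.empty()) {
    // park the callback on the oldest op; it fires once all are done
    ops.front()->add_flush_context(std::move(on_finish));
    return;
  }
  on_finish(0);                   // nothing in flight: complete immediately
}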
+
} // anonymous namespace
ImageCtx::ImageCtx(const string &image_name, const string &image_id,
}
delete[] format_string;
+ md_ctx.aio_flush();
+ data_ctx.aio_flush();
+ op_work_queue->drain();
+ aio_work_queue->drain();
+
delete op_work_queue;
delete aio_work_queue;
}
}
}
- void ImageCtx::flush_cache_aio(Context *onfinish) {
+ int ImageCtx::flush_cache() {
+ C_SaferCond cond_ctx;
+ flush_cache(&cond_ctx);
+
+ ldout(cct, 20) << "waiting for cache to be flushed" << dendl;
+ int r = cond_ctx.wait();
+ ldout(cct, 20) << "finished flushing cache" << dendl;
+
+ return r;
+ }
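flush_cache() above turns the async flush into a blocking call by waiting on a C_SaferCond. The same bridge written against the standard library, as a sketch (not Ceph's actual C_SaferCond):

#include <condition_variable>
#include <mutex>

// Block until complete(r) is called from another thread; return r.
class SaferCond {
  std::mutex m;
  std::condition_variable cv;
  bool done = false;
  int result = 0;
public:
  void complete(int r) {
    std::lock_guard<std::mutex> l(m);
    result = r;
    done = true;
    cv.notify_all();
  }
  int wait() {
    std::unique_lock<std::mutex> l(m);
    cv.wait(l, [this] { return done; });
    return result;
  }
};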
+
+ void ImageCtx::flush_cache(Context *onfinish) {
assert(owner_lock.is_locked());
cache_lock.Lock();
object_cacher->flush_set(object_set, onfinish);
cache_lock.Unlock();
}
- int ImageCtx::flush_cache() {
- int r = 0;
- Mutex mylock("librbd::ImageCtx::flush_cache");
- Cond cond;
- bool done;
- Context *onfinish = new C_SafeCond(&mylock, &cond, &done, &r);
- flush_cache_aio(onfinish);
- mylock.Lock();
- while (!done) {
- ldout(cct, 20) << "waiting for cache to be flushed" << dendl;
- cond.Wait(mylock);
- }
- mylock.Unlock();
- ldout(cct, 20) << "finished flushing cache" << dendl;
- return r;
- }
-
- void ImageCtx::shutdown_cache() {
+ int ImageCtx::shutdown_cache() {
flush_async_operations();
RWLock::RLocker owner_locker(owner_lock);
- invalidate_cache(true);
+ int r = invalidate_cache(true);
object_cacher->stop();
+ return r;
}
int ImageCtx::invalidate_cache(bool purge_on_error) {
- int result;
- C_SaferCond ctx;
- invalidate_cache(&ctx);
- result = ctx.wait();
-
- if (result && purge_on_error) {
- cache_lock.Lock();
- if (object_cacher != NULL) {
- lderr(cct) << "invalidate cache met error " << cpp_strerror(result) << " !Purging cache..." << dendl;
- object_cacher->purge_set(object_set);
- }
- cache_lock.Unlock();
+ flush_async_operations();
+ if (object_cacher == NULL) {
+ return 0;
}
+ cache_lock.Lock();
+ object_cacher->release_set(object_set);
+ cache_lock.Unlock();
+
+ C_SaferCond ctx;
+ flush_cache(new C_InvalidateCache(this, purge_on_error, true, &ctx));
+
+ int result = ctx.wait();
return result;
}
object_cacher->release_set(object_set);
cache_lock.Unlock();
- flush_cache_aio(new FunctionContext(boost::bind(
- &ImageCtx::invalidate_cache_completion, this, _1, on_finish)));
- }
-
- void ImageCtx::invalidate_cache_completion(int r, Context *on_finish) {
- assert(cache_lock.is_locked());
- if (r == -EBLACKLISTED) {
- lderr(cct) << "Blacklisted during flush! Purging cache..." << dendl;
- object_cacher->purge_set(object_set);
- } else if (r != 0) {
- lderr(cct) << "flush_cache returned " << r << dendl;
- }
-
- loff_t unclean = object_cacher->release_set(object_set);
- if (unclean == 0) {
- r = 0;
- } else {
- lderr(cct) << "could not release all objects from cache: "
- << unclean << " bytes remain" << dendl;
- r = -EBUSY;
- }
-
- op_work_queue->queue(on_finish, r);
+ flush_cache(new C_InvalidateCache(this, false, false, on_finish));
}
void ImageCtx::clear_nonexistence_cache() {
void ImageCtx::flush_async_operations() {
C_SaferCond ctx;
- flush_async_operations(&ctx);
+ _flush_async_operations(this, &ctx);
ctx.wait();
}
void ImageCtx::flush_async_operations(Context *on_finish) {
- Mutex::Locker l(async_ops_lock);
- if (async_ops.empty()) {
- on_finish->complete(0);
- return;
+ // complete the context in a clean thread context (no locks held)
+ _flush_async_operations(this, new C_AsyncCallback(this, on_finish));
+ }
+
+ int ImageCtx::flush() {
+ assert(owner_lock.is_locked());
+
+ flush_async_operations();
+ if (object_cacher != NULL) {
+ int r = flush_cache();
+ if (r < 0) {
+ return r;
+ }
}
+ return 0;
+ }
- ldout(cct, 20) << "flush async operations: " << on_finish << " "
- << "count=" << async_ops.size() << dendl;
- async_ops.front()->add_flush_context(on_finish);
+ void ImageCtx::flush(Context *on_safe) {
+ assert(owner_lock.is_locked());
+ if (object_cacher != NULL) {
+ // flush cache after completing all in-flight AIO ops
+ on_safe = new C_FlushCache(this, on_safe);
+ }
+ flush_async_operations(on_safe);
}
void ImageCtx::cancel_async_requests() {
void write_to_cache(object_t o, const bufferlist& bl, size_t len,
uint64_t off, Context *onfinish, int fadvise_flags);
void user_flushed();
- void flush_cache_aio(Context *onfinish);
int flush_cache();
- void shutdown_cache();
+ void flush_cache(Context *onfinish);
+ int shutdown_cache();
int invalidate_cache(bool purge_on_error=false);
void invalidate_cache(Context *on_finish);
- void invalidate_cache_completion(int r, Context *on_finish);
void clear_nonexistence_cache();
int register_watch();
void unregister_watch();
void flush_async_operations();
void flush_async_operations(Context *on_finish);
+ int flush();
+ void flush(Context *on_safe);
+
void cancel_async_requests();
};
}
{
RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
RWLock::WLocker md_locker(m_image_ctx.md_lock);
- librbd::_flush(&m_image_ctx);
+ m_image_ctx.flush();
}
m_image_ctx.owner_lock.get_write();
delete m_finisher;
}
+ void LibrbdWriteback::queue(Context *ctx, int r) {
+ m_finisher->queue(ctx, r);
+ }
+
void LibrbdWriteback::read(const object_t& oid, uint64_t object_no,
const object_locator_t& oloc,
uint64_t off, uint64_t len, snapid_t snapid,
{
if (!m_ictx->object_map.object_may_exist(object_no)) {
- m_finisher->queue(req, -ENOENT);
+ queue(req, -ENOENT);
return;
}
}
LibrbdWriteback(ImageCtx *ictx, Mutex& lock);
virtual ~LibrbdWriteback();
+ void queue(Context *ctx, int r);
+
// Note that oloc, trunc_size, and trunc_seq are ignored
virtual void read(const object_t& oid, uint64_t object_no,
const object_locator_t& oloc, uint64_t off, uint64_t len,
rados_completion->release();
}
+ int rollback_parent(ImageCtx *ictx, uint64_t snap_id)
+ {
+ assert(ictx);
+ assert(ictx->parent_lock.is_locked());
+ assert(ictx->snap_lock.is_locked());
+
+ CephContext *cct = ictx->cct;
+ int r = 0;
+ std::map<librados::snap_t, SnapInfo>::const_iterator it = ictx->snap_info.find(snap_id);
+ if (it == ictx->snap_info.end()) {
+ ldout(cct, 10) << __func__ << ": no such snapshot: " << snap_id << dendl;
+ return -ENOENT;
+ }
+ const SnapInfo& snap_info(it->second);
+ if (ictx->parent_md == snap_info.parent) {
+ ldout(cct, 20) << __func__ << ": nop: head and snapshot have the same parent" << dendl;
+ return 0;
+ }
+ if (ictx->parent_md.spec.pool_id != -1) {
+ // remove the old parent link first, otherwise cls_client::set_parent
+ // will fail with -EEXIST
+ ldout(cct, 20) << __func__ << ": removing the old parent link" << dendl;
+ r = cls_client::remove_parent(&ictx->md_ctx, ictx->header_oid);
+ if (r < 0) {
+ ldout(cct, 10) << __func__ << ": failed to remove parent link: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ if (snap_info.parent.spec.pool_id != -1) {
+ ldout(cct, 20) << __func__ << ": updating the parent link" << dendl;
+ r = cls_client::set_parent(&ictx->md_ctx, ictx->header_oid,
+ snap_info.parent.spec, snap_info.parent.overlap);
+ if (r < 0) {
+ ldout(cct, 10) << __func__ << ": failed to set parent link: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ return 0;
+ }
+
int rollback_image(ImageCtx *ictx, uint64_t snap_id,
ProgressContext& prog_ctx)
{
RWLock::WLocker l(ictx->snap_lock);
ictx->object_map.rollback(snap_id);
}
+
+ {
+ RWLock::WLocker snap_locker(ictx->snap_lock);
+ RWLock::WLocker parent_locker(ictx->parent_lock);
+ r = rollback_parent(ictx, snap_id);
+ if (r < 0) {
+ ldout(cct, 10) << __func__ << ": failed to rollback the parent link: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
return 0;
}
}
RWLock::WLocker md_locker(ictx->md_lock);
- r = _flush(ictx);
+ r = ictx->flush();
if (r < 0) {
return r;
}
} // release snap_lock and cache_lock
if (new_snap) {
- _flush(ictx);
+ ictx->flush();
}
ictx->refresh_lock.Lock();
// writes might create new snapshots. Rolling back will replace
// the current version, so we have to invalidate that too.
RWLock::WLocker md_locker(ictx->md_lock);
- ictx->flush_async_operations();
r = ictx->invalidate_cache();
if (r < 0) {
return r;
// get -EROFS for writes
RWLock::RLocker owner_locker(ictx->owner_lock);
RWLock::WLocker md_locker(ictx->md_lock);
- ictx->flush_cache();
+ ictx->flush();
}
int r = _snap_set(ictx, snap_name);
if (r < 0) {
// ensure previous writes are visible to listsnaps
{
RWLock::RLocker owner_locker(ictx->owner_lock);
- _flush(ictx);
+ ictx->flush();
}
int r = ictx_check(ictx);
C_AioWrite *flush_ctx = new C_AioWrite(cct, c);
c->add_request();
- ictx->flush_async_operations(flush_ctx);
+ ictx->flush(flush_ctx);
c->init_time(ictx, AIO_TYPE_FLUSH);
- C_AioWrite *req_comp = new C_AioWrite(cct, c);
- c->add_request();
- if (ictx->object_cacher) {
- ictx->flush_cache_aio(req_comp);
- } else {
- librados::AioCompletion *rados_completion =
- librados::Rados::aio_create_completion(req_comp, NULL, rados_ctx_cb);
- ictx->data_ctx.aio_flush_async(rados_completion);
- rados_completion->release();
- }
c->finish_adding_requests(cct);
c->put();
ictx->perfcounter->inc(l_librbd_aio_flush);
ictx->user_flushed();
{
RWLock::RLocker owner_locker(ictx->owner_lock);
- r = _flush(ictx);
+ r = ictx->flush();
}
ictx->perfcounter->inc(l_librbd_flush);
return r;
}
- int _flush(ImageCtx *ictx)
- {
- assert(ictx->owner_lock.is_locked());
- CephContext *cct = ictx->cct;
- int r;
- // flush any outstanding writes
- if (ictx->object_cacher) {
- r = ictx->flush_cache();
- } else {
- r = ictx->data_ctx.aio_flush();
- ictx->flush_async_operations();
- }
-
- if (r)
- lderr(cct) << "_flush " << ictx << " r = " << r << dendl;
-
- return r;
- }
-
int invalidate_cache(ImageCtx *ictx)
{
CephContext *cct = ictx->cct;
return r;
}
- ictx->flush_async_operations();
-
RWLock::RLocker owner_locker(ictx->owner_lock);
RWLock::WLocker md_locker(ictx->md_lock);
r = ictx->invalidate_cache();
char *buf, bufferlist *pbl, AioCompletion *c, int op_flags);
void aio_flush(ImageCtx *ictx, AioCompletion *c);
int flush(ImageCtx *ictx);
- int _flush(ImageCtx *ictx);
int invalidate_cache(ImageCtx *ictx);
ssize_t handle_sparse_read(CephContext *cct,
parent_spec() : pool_id(-1), snap_id(CEPH_NOSNAP) {}
parent_spec(uint64_t pool_id, string image_id, snapid_t snap_id) :
pool_id(pool_id), image_id(image_id), snap_id(snap_id) {}
- bool operator==(const parent_spec &other) {
+ bool operator==(const parent_spec &other) const {
return ((this->pool_id == other.pool_id) &&
(this->image_id == other.image_id) &&
(this->snap_id == other.snap_id));
}
- bool operator!=(const parent_spec &other) {
+ bool operator!=(const parent_spec &other) const {
return !(*this == other);
}
};
parent_spec spec;
uint64_t overlap;
parent_info() : overlap(0) {}
+ bool operator==(const parent_info &other) const {
+ return (spec == other.spec) && (overlap == other.overlap);
+ }
+ bool operator!=(const parent_info &other) const {
+ return (spec != other.spec) || (overlap != other.overlap);
+ }
};
}
m_flush_mutex_holder(0),
m_new(), m_recent(),
m_fd(-1),
+ m_fd_last_error(0),
m_syslog_log(-2), m_syslog_crash(-2),
m_stderr_log(1), m_stderr_crash(-1),
m_stop(false),
r = safe_write(m_fd, s.data(), s.size());
if (r >= 0)
r = write(m_fd, "\n", 1);
- if (r < 0)
- cerr << "problem writing to " << m_log_file << ": " << cpp_strerror(r) << std::endl;
+ if (r != m_fd_last_error) {
+ if (r < 0)
+ cerr << "problem writing to " << m_log_file
+ << ": " << cpp_strerror(r)
+ << std::endl;
+ m_fd_last_error = r;
+ }
}
if (do_syslog) {
std::string m_log_file;
int m_fd;
+ int m_fd_last_error; ///< last error we saw writing to fd (if any)
+
int m_syslog_log, m_syslog_crash;
int m_stderr_log, m_stderr_crash;
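m_fd_last_error implements log-flood suppression: a write error is reported only when it differs from the previous result. The state machine in isolation (sketch):

#include <iostream>

// Report an error only when it differs from the previous one, so a
// persistently failing fd does not flood stderr.
struct ErrorDedup {
  int last = 0;
  void report(int r, const char *what) {
    if (r == last)
      return;              // same outcome as last time; stay quiet
    if (r < 0)
      std::cerr << "problem writing to " << what
                << ": error " << -r << "\n";
    last = r;              // remember it, including a return to success
  }
};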
done
done
fi
+ killall -q -1 ceph-fuse || true
endscript
missingok
notifempty
dout(10) << " realm " << *realm << dendl;
const SnapContext *snapc = &realm->get_snap_context();
- uint64_t period = (uint64_t)in->inode.layout.fl_object_size *
- (uint64_t)in->inode.layout.fl_stripe_count;
uint64_t to = in->inode.get_max_size();
to = MAX(in->inode.size, to);
// when truncating a file, the filer does not delete stripe objects that are
// truncated to zero. so we need to purge stripe objects up to the max size
// the file has ever been.
to = MAX(in->inode.max_size_ever, to);
- if (period && to > period) {
- uint64_t num = (to - 1) / period;
+ if (to > 0) {
+ uint64_t num = Striper::get_num_objects(in->inode.layout, to);
dout(10) << "purge_stray 0~" << to << " objects 0~" << num
- << " snapc " << snapc << " on " << *in << dendl;
- mds->filer->purge_range(in->ino(), &in->inode.layout, *snapc,
- 1, num, ceph_clock_now(g_ceph_context),
- 0, gather.new_sub());
- }
+ << " snapc " << snapc << " on " << *in << dendl;
- // keep backtrace object
- if (period && to > 0) {
+ // keep backtrace object
+ if (num > 1) {
+ mds->filer->purge_range(in->ino(), &in->inode.layout, *snapc,
+ 1, num - 1, ceph_clock_now(g_ceph_context),
+ 0, gather.new_sub());
+ }
mds->filer->zero(in->ino(), &in->inode.layout, *snapc,
- 0, period, ceph_clock_now(g_ceph_context),
+ 0, (uint64_t)in->inode.layout.fl_object_size,
+ ceph_clock_now(g_ceph_context),
0, true, NULL, gather.new_sub());
}
}
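Striper::get_num_objects() maps a byte length to an object count for the file's layout; for a non-striped layout (stripe_count == 1) it reduces to a rounded-up division. A worked sketch under that assumption (real striped layouts also factor in stripe_unit and stripe_count):

#include <cstdint>

// Objects needed to hold `len` bytes in `object_size`-byte objects:
// e.g. object_size = 4 MiB, len = 10 MiB -> 3 objects (two full, one partial).
uint64_t num_objects_simple(uint64_t len, uint64_t object_size)
{
  return (len + object_size - 1) / object_size;  // ceil(len / object_size)
}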
if (in->is_file()) {
- uint64_t period = (uint64_t)in->inode.layout.fl_object_size *
- (uint64_t)in->inode.layout.fl_stripe_count;
uint64_t to = in->inode.get_max_size();
to = MAX(in->inode.size, to);
// when truncating a file, the filer does not delete stripe objects that are
// truncated to zero. so we need to purge stripe objects up to the max size
// the file has ever been.
to = MAX(in->inode.max_size_ever, to);
- if (to && period) {
- uint64_t num = (to + period - 1) / period;
+ if (to > 0) {
+ uint64_t num = Striper::get_num_objects(in->inode.layout, to);
dout(10) << "purge_stray 0~" << to << " objects 0~" << num
<< " snapc " << snapc << " on " << *in << dendl;
mds->filer->purge_range(in->inode.ino, &in->inode.layout, *snapc,
f->close_section();
f->open_object_section("up");
for (map<mds_rank_t,mds_gid_t>::const_iterator p = up.begin(); p != up.end(); ++p) {
- char s[10];
+ char s[24]; // "mds_" + up to 11 chars for a signed 32-bit rank + NUL
sprintf(s, "mds_%d", int(p->first));
f->dump_int(s, p->second);
}
return;
}
- // can only open a dir with mode FILE_MODE_PIN, at least for now.
- if (cur->inode.is_dir())
+ if (!cur->inode.is_file()) {
+ // can only open a non-regular inode with mode FILE_MODE_PIN, at least for now
+ cmode = CEPH_FILE_MODE_PIN;
+ // if the inode is a symlink and the client wants to follow it, ignore the O_TRUNC flag
+ if (cur->inode.is_symlink() && !(flags & O_NOFOLLOW))
+ flags &= ~O_TRUNC;
+ }
dout(10) << "open flags = " << flags
<< ", filemode = " << cmode
respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
return;
}*/
- if ((req->head.args.open.flags & O_DIRECTORY) && !cur->inode.is_dir()) {
+ if ((req->head.args.open.flags & O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
respond_to_request(mdr, -EINVAL);
return;
}
+ if ((flags & O_TRUNC) && !cur->inode.is_file()) {
+ dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
+ // return -EISDIR for a directory, -EINVAL for other non-regular inodes
+ respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
+ return;
+ }
+
if (cur->inode.inline_version != CEPH_INLINE_NONE &&
!mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
dout(7) << "old client cannot open inline data file " << *cur << dendl;
return mon->store->get(STORE_PREFIX, key, bl);
}
+void ConfigKeyService::get_store_prefixes(set<string>& s)
+{
+ s.insert(STORE_PREFIX);
+}
+
void ConfigKeyService::store_put(string key, bufferlist &bl, Context *cb)
{
bufferlist proposal_bl;
virtual string get_name() const {
return "config_key";
}
-
+ virtual void get_store_prefixes(set<string>& s);
/**
* @} // ConfigKeyService_Inherited_h
*/
if (channels.do_log_to_syslog(channel)) {
string level = channels.get_level(channel);
- string facility = channels.get_facility(facility);
+ string facility = channels.get_facility(channel);
if (level.empty() || facility.empty()) {
derr << __func__ << " unable to log to syslog -- level or facility"
<< " not defined (level: " << level << ", facility: "
if (err == -ENOENT) {
r = -ENOENT;
} else {
- assert(r == 0);
+ assert(err == 0);
assert(b.length());
MDSMap mm;
mm.decode(b);
waiting_for_session.pop_front();
}
+ if (cur_con)
+ cur_con->mark_down();
+ cur_con.reset(NULL);
+ cur_mon.clear();
+
monc_lock.Unlock();
if (initialized) {
monc_lock.Lock();
timer.shutdown();
- if (cur_con)
- cur_con->mark_down();
- cur_con.reset(NULL);
- cur_mon.clear();
-
monc_lock.Unlock();
}
if (ret == 0) {
if (state != MC_STATE_HAVE_SESSION) {
state = MC_STATE_HAVE_SESSION;
+ last_rotating_renew_sent = utime_t();
while (!waiting_for_session.empty()) {
_send_mon_message(waiting_for_session.front());
waiting_for_session.pop_front();
return 0;
}
- utime_t cutoff = ceph_clock_now(cct);
+ utime_t now = ceph_clock_now(cct);
+ utime_t cutoff = now;
cutoff -= MIN(30.0, cct->_conf->auth_service_ticket_ttl / 4.0);
+ utime_t issued_at_lower_bound = now;
+ issued_at_lower_bound -= cct->_conf->auth_service_ticket_ttl;
if (!rotating_secrets->need_new_secrets(cutoff)) {
ldout(cct, 10) << "_check_auth_rotating have uptodate secrets (they expire after " << cutoff << ")" << dendl;
rotating_secrets->dump_rotating();
}
ldout(cct, 10) << "_check_auth_rotating renewing rotating keys (they expired before " << cutoff << ")" << dendl;
+ if (!rotating_secrets->need_new_secrets() &&
+ rotating_secrets->need_new_secrets(issued_at_lower_bound)) {
+ // the key has expired before it has been issued?
+ lderr(cct) << __func__ << " possible clock skew, rotating keys expired way too early"
+ << " (before " << issued_at_lower_bound << ")" << dendl;
+ }
+ if ((now > last_rotating_renew_sent) &&
+ double(now - last_rotating_renew_sent) < 1) {
+ ldout(cct, 10) << __func__ << " called too often (last: "
+ << last_rotating_renew_sent << "), skipping refresh" << dendl;
+ return 0;
+ }
MAuth *m = new MAuth;
m->protocol = auth->get_protocol();
if (auth->build_rotating_request(m->auth_payload)) {
+ last_rotating_renew_sent = now;
_send_mon_message(m);
} else {
m->put();
int MonClient::wait_auth_rotating(double timeout)
{
Mutex::Locker l(monc_lock);
- utime_t until = ceph_clock_now(cct);
+ utime_t now = ceph_clock_now(cct);
+ utime_t until = now;
until += timeout;
if (auth->get_protocol() == CEPH_AUTH_NONE)
return 0;
while (auth_principal_needs_rotating_keys(entity_name) &&
- rotating_secrets->need_new_secrets()) {
- utime_t now = ceph_clock_now(cct);
+ rotating_secrets->need_new_secrets(now)) {
if (now >= until) {
ldout(cct, 0) << "wait_auth_rotating timed out after " << timeout << dendl;
return -ETIMEDOUT;
}
ldout(cct, 10) << "wait_auth_rotating waiting (until " << until << ")" << dendl;
auth_cond.WaitUntil(monc_lock, until);
+ now = ceph_clock_now(cct);
}
ldout(cct, 10) << "wait_auth_rotating done" << dendl;
return 0;
int authenticate_err;
list<Message*> waiting_for_session;
+ utime_t last_rotating_renew_sent;
Context *session_established_context;
bool had_a_connection;
double reopen_interval_multiplier;
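last_rotating_renew_sent, added above, rate-limits rotating-key renewals to one per second. The same guard in isolation (sketch using std::chrono rather than utime_t):

#include <chrono>

using Clock = std::chrono::steady_clock;

// Allow at most one renewal request per second.
struct RenewLimiter {
  Clock::time_point last{};  // default-constructed == "never sent"
  bool allow() {
    auto now = Clock::now();
    if (last != Clock::time_point{} &&
        now - last < std::chrono::seconds(1))
      return false;          // called too often; skip this refresh
    last = now;
    return true;
  }
};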
"get pool parameter <var>", "osd", "r", "cli,rest")
COMMAND("osd pool set " \
"name=pool,type=CephPoolname " \
- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \
+ "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \
"name=val,type=CephString " \
"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
"name=name,type=CephString,req=false",
"obtain stats from all pools, or from specified pool",
"osd", "r", "cli,rest")
+COMMAND("osd utilization",
+ "get basic pg distribution stats",
+ "osd", "r", "cli,rest")
COMMAND("osd reweight-by-utilization " \
- "name=oload,type=CephInt,range=100,req=false", \
+ "name=oload,type=CephInt,req=false " \
+ "name=max_change,type=CephFloat,req=false " \
+ "name=max_osds,type=CephInt,req=false " \
+ "name=no_increasing,type=CephChoices,strings=--no-increasing,req=false",\
"reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \
"osd", "rw", "cli,rest")
+COMMAND("osd test-reweight-by-utilization " \
+ "name=oload,type=CephInt,req=false " \
+ "name=max_change,type=CephFloat,req=false " \
+ "name=max_osds,type=CephInt,req=false " \
+ "name=no_increasing,type=CephChoices,strings=--no-increasing,req=false",\
+ "dry run of reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \
+ "osd", "rw", "cli,rest")
COMMAND("osd reweight-by-pg " \
- "name=oload,type=CephInt,range=100 " \
- "name=pools,type=CephPoolname,n=N,req=false", \
+ "name=oload,type=CephInt,req=false " \
+ "name=max_change,type=CephFloat,req=false " \
+ "name=max_osds,type=CephInt,req=false " \
+ "name=pools,type=CephPoolname,n=N,req=false", \
"reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \
"osd", "rw", "cli,rest")
+COMMAND("osd test-reweight-by-pg " \
+ "name=oload,type=CephInt,req=false " \
+ "name=max_change,type=CephFloat,req=false " \
+ "name=max_osds,type=CephInt,req=false " \
+ "name=pools,type=CephPoolname,n=N,req=false", \
+ "dry run of reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \
+ "osd", "rw", "cli,rest")
COMMAND("osd thrash " \
"name=num_epochs,type=CephInt,range=0", \
"thrash OSDs for <num_epochs>", "osd", "rw", "cli,rest")
timecheck_round(0),
timecheck_acks(0),
+ timecheck_rounds_since_clean(0),
timecheck_event(NULL),
probe_timeout_event(NULL),
targets.insert(paxos->get_name());
for (int i = 0; i < PAXOS_NUM; ++i)
paxos_service[i]->get_store_prefixes(targets);
-
+ ConfigKeyService *config_key_service_ptr = dynamic_cast<ConfigKeyService*>(config_key_service);
+ assert(config_key_service_ptr);
+ config_key_service_ptr->get_store_prefixes(targets);
return targets;
}
} else {
if (paxos->get_version() < m->paxos_first_version &&
m->paxos_first_version > 1) { // no need to sync if we're 0 and they start at 1.
- dout(10) << " peer paxos versions [" << m->paxos_first_version
+ dout(10) << " peer paxos first versions [" << m->paxos_first_version
<< "," << m->paxos_last_version << "]"
<< " vs my version " << paxos->get_version()
<< " (too far ahead)"
return;
}
if (paxos->get_version() + g_conf->paxos_max_join_drift < m->paxos_last_version) {
- dout(10) << " peer paxos version " << m->paxos_last_version
+ dout(10) << " peer paxos last version " << m->paxos_last_version
<< " vs my version " << paxos->get_version()
<< " (too far ahead)"
<< dendl;
return;
}
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ // check the return value: if no prefix parameter was provided,
+ // cmd_getval() returns false and we reply with an error
+ if(!cmd_getval(g_ceph_context, cmdmap, "prefix", prefix)) {
+ reply_command(m, -EINVAL, "command prefix not found", 0);
+ return;
+ }
+
+ // reject an empty prefix
+ if (prefix.empty()) {
+ reply_command(m, -EINVAL, "command prefix must not be empty", 0);
+ return;
+ }
+
if (prefix == "get_command_descriptions") {
bufferlist rdata;
Formatter *f = Formatter::create("json");
boost::scoped_ptr<Formatter> f(Formatter::create(format));
get_str_vec(prefix, fullcmd);
+
+ // make sure fullcmd is not empty:
+ // an invalid prefix such as ";,,;" parses to an empty vector
+ if (fullcmd.empty()) {
+ reply_command(m, -EINVAL, "command requires a prefix to be valid", 0);
+ return;
+ }
+
module = fullcmd[0];
// validate command is in leader map
timecheck();
out:
dout(10) << __func__ << " setting up next event" << dendl;
- timecheck_event = new C_TimeCheck(this);
- timer.add_event_after(g_conf->mon_timecheck_interval, timecheck_event);
+ timecheck_reset_event();
}
void Monitor::timecheck_finish_round(bool success)
assert(timecheck_waiting.empty());
assert(timecheck_acks == quorum.size());
timecheck_report();
+ timecheck_check_skews();
return;
}
timecheck_waiting.clear();
timecheck_skews.clear();
timecheck_latencies.clear();
+
+ timecheck_rounds_since_clean = 0;
+}
+
+void Monitor::timecheck_reset_event()
+{
+ if (timecheck_event) {
+ timer.cancel_event(timecheck_event);
+ timecheck_event = NULL;
+ }
+
+ double delay =
+ cct->_conf->mon_timecheck_skew_interval * timecheck_rounds_since_clean;
+
+ if (delay <= 0 || delay > cct->_conf->mon_timecheck_interval) {
+ delay = cct->_conf->mon_timecheck_interval;
+ }
+
+ dout(10) << __func__ << " delay " << delay
+ << " rounds_since_clean " << timecheck_rounds_since_clean
+ << dendl;
+
+ timecheck_event = new C_TimeCheck(this);
+ timer.add_event_after(delay, timecheck_event);
+}
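timecheck_reset_event() scales the next check's delay linearly with the number of consecutive skewed rounds, capped at the normal interval. For example, with mon_timecheck_skew_interval = 30 and mon_timecheck_interval = 300, rounds 1..10 give 30, 60, ..., 300 seconds, and anything beyond stays at 300. The computation in isolation (sketch):

#include <cstdint>

// delay = skew_interval * rounds_since_clean, clamped to (0, interval]
double timecheck_delay(double skew_interval, double interval,
                       uint64_t rounds_since_clean)
{
  double delay = skew_interval * rounds_since_clean;
  if (delay <= 0 || delay > interval)
    delay = interval;  // rounds_since_clean == 0 means no skew: full interval
  return delay;
}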
+
+void Monitor::timecheck_check_skews()
+{
+ dout(10) << __func__ << dendl;
+ assert(is_leader());
+ assert((timecheck_round % 2) == 0);
+ if (monmap->size() == 1) {
+ assert(0 == "We are alone; we shouldn't have gotten here!");
+ return;
+ }
+ assert(timecheck_latencies.size() == timecheck_skews.size());
+
+ bool found_skew = false;
+ for (map<entity_inst_t, double>::iterator p = timecheck_skews.begin();
+ p != timecheck_skews.end(); ++p) {
+
+ double abs_skew;
+ if (timecheck_has_skew(p->second, &abs_skew)) {
+ dout(10) << __func__
+ << " " << p->first << " skew " << abs_skew << dendl;
+ found_skew = true;
+ }
+ }
+
+ if (found_skew) {
+ ++timecheck_rounds_since_clean;
+ timecheck_reset_event();
+ } else if (timecheck_rounds_since_clean > 0) {
+ dout(1) << __func__
+ << " no clock skews found after " << timecheck_rounds_since_clean
+ << " rounds" << dendl;
+ // run one more round to make sure the skews are really gone and
+ // not just a transient success; it runs only once unless skews reappear.
+ timecheck_rounds_since_clean = 1;
+ timecheck_reset_event();
+ timecheck_rounds_since_clean = 0;
+ }
+
}
void Monitor::timecheck_report()
m->epoch = get_epoch();
m->round = timecheck_round;
- for (map<entity_inst_t, double>::iterator it = timecheck_skews.begin(); it != timecheck_skews.end(); ++it) {
+ for (map<entity_inst_t, double>::iterator it = timecheck_skews.begin();
+ it != timecheck_skews.end(); ++it) {
double skew = it->second;
double latency = timecheck_latencies[it->first];
const double latency)
{
health_status_t status = HEALTH_OK;
- double abs_skew = (skew_bound > 0 ? skew_bound : -skew_bound);
assert(latency >= 0);
- if (abs_skew > g_conf->mon_clock_drift_allowed) {
+ double abs_skew;
+ if (timecheck_has_skew(skew_bound, &abs_skew)) {
status = HEALTH_WARN;
ss << "clock skew " << abs_skew << "s"
<< " > max " << g_conf->mon_clock_drift_allowed << "s";
<< " delta " << delta << " skew_bound " << skew_bound
<< " latency " << latency << dendl;
- if (timecheck_skews.count(other) == 0) {
- timecheck_skews[other] = skew_bound;
- } else {
- timecheck_skews[other] = (timecheck_skews[other]*0.8)+(skew_bound*0.2);
- }
+ timecheck_skews[other] = skew_bound;
timecheck_acks++;
if (timecheck_acks == quorum.size()) {
os << g_conf->mon_data << "/keyring";
int err = 0;
- int fd = ::open(os.str().c_str(), O_WRONLY|O_CREAT, 0644);
+ int fd = ::open(os.str().c_str(), O_WRONLY|O_CREAT, 0600);
if (fd < 0) {
err = -errno;
dout(0) << __func__ << " failed to open " << os.str()
#include "include/memory.h"
#include "include/str_map.h"
#include <errno.h>
+#include <cmath>
#define CEPH_MON_PROTOCOL 13 /* cluster internal */
version_t timecheck_round;
unsigned int timecheck_acks;
utime_t timecheck_round_start;
+ /* When we hit a skew we will start a new round based off of
+ * 'mon_timecheck_skew_interval'. Each new round will be backed off
+ * until we hit 'mon_timecheck_interval' -- which is the typical
+ * interval when not in the presence of a skew.
+ *
+ * This variable tracks the number of rounds with skews since last clean
+ * so that we can report to the user and properly adjust the backoff.
+ */
+ uint64_t timecheck_rounds_since_clean;
/**
* Time Check event.
*/
void timecheck_finish_round(bool success = true);
void timecheck_cancel_round();
void timecheck_cleanup();
+ void timecheck_reset_event();
+ void timecheck_check_skews();
void timecheck_report();
void timecheck();
health_status_t timecheck_status(ostringstream &ss,
void handle_timecheck_leader(MTimeCheck *m);
void handle_timecheck_peon(MTimeCheck *m);
void handle_timecheck(MTimeCheck *m);
+
+ /**
+ * Returns 'true' if this is considered to be a skew; 'false' otherwise.
+ */
+ bool timecheck_has_skew(const double skew_bound, double *abs) const {
+ double abs_skew = std::fabs(skew_bound);
+ if (abs)
+ *abs = abs_skew;
+ return (abs_skew > g_conf->mon_clock_drift_allowed);
+ }
/**
* @}
*/
*
*/
+#include <algorithm>
+#include <cmath>
#include <sstream>
#include "OSDMonitor.h"
mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
}
+struct Sorter {
+
+ double average_util;
+
+ Sorter(const double average_util_)
+ : average_util(average_util_)
+ {}
+
+ bool operator()(std::pair<int,float> l, std::pair<int,float> r) {
+ // std::fabs, not ::abs -- the int overload would truncate the deviations
+ return std::fabs(l.second - average_util) > std::fabs(r.second - average_util);
+ }
+};
+
/* Assign a lower weight to overloaded OSDs.
*
 * The osds that will get a lower weight are those with a utilization
* percentage 'oload' percent greater than the average utilization.
*/
-int OSDMonitor::reweight_by_utilization(int oload, std::string& out_str,
- bool by_pg, const set<int64_t> *pools)
+int OSDMonitor::reweight_by_utilization(int oload,
+ double max_changef,
+ int max_osds,
+ bool by_pg, const set<int64_t> *pools,
+ bool no_increasing,
+ bool dry_run,
+ std::stringstream *ss,
+ std::string *out_str,
+ Formatter *f)
{
if (oload <= 100) {
- ostringstream oss;
- oss << "You must give a percentage higher than 100. "
+ *ss << "You must give a percentage higher than 100. "
"The reweighting threshold will be calculated as <average-utilization> "
"times <input-percentage>. For example, an argument of 200 would "
"reweight OSDs which are twice as utilized as the average OSD.\n";
- out_str = oss.str();
return -EINVAL;
}
}
if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) {
- ostringstream oss;
- oss << "Refusing to reweight: we only have " << num_pg_copies
+ *ss << "Refusing to reweight: we only have " << num_pg_copies
<< " PGs across " << num_osds << " osds!\n";
- out_str = oss.str();
return -EDOM;
}
if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd
< g_conf->mon_reweight_min_bytes_per_osd) {
- ostringstream oss;
- oss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
+ *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
<< " kb across all osds!\n";
- out_str = oss.str();
return -EDOM;
}
if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd
< g_conf->mon_reweight_min_bytes_per_osd) {
- ostringstream oss;
- oss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
+ *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
<< " kb used across all osds!\n";
- out_str = oss.str();
return -EDOM;
}
// but aggressively adjust weights up whenever possible.
double underload_util = average_util;
+ unsigned max_change = (unsigned)(max_changef * (double)0x10000);
+
ostringstream oss;
- char buf[128];
- snprintf(buf, sizeof(buf), "average %04f, overload %04f. ",
- average_util, overload_util);
- oss << buf;
- std::string sep;
- oss << "reweighted: ";
+ if (f) {
+ f->open_object_section("reweight_by_utilization");
+ f->dump_unsigned("overload_min", oload);
+ f->dump_float("max_change", max_changef);
+ f->dump_float("max_change_osds", max_osds);
+ f->dump_float("average_utilization", average_util);
+ f->dump_float("overload_utilization", overload_util);
+ } else {
+ oss << "oload " << oload << "\n";
+ oss << "max_change " << max_changef << "\n";
+ oss << "max_change_osds " << max_osds << "\n";
+ char buf[128];
+ snprintf(buf, sizeof(buf), "average %04f\noverload %04f\n",
+ average_util, overload_util);
+ oss << buf;
+ }
bool changed = false;
+ int num_changed = 0;
+
+ // precompute util for each OSD
+ std::vector<std::pair<int, float> > util_by_osd;
for (ceph::unordered_map<int,osd_stat_t>::const_iterator p =
- pgm.osd_stat.begin();
+ pgm.osd_stat.begin();
p != pgm.osd_stat.end();
++p) {
- float util;
+ std::pair<int, float> osd_util;
+ osd_util.first = p->first;
if (by_pg) {
- util = pgs_by_osd[p->first] / osdmap.crush->get_item_weightf(p->first);
+ osd_util.second = pgs_by_osd[p->first] / osdmap.crush->get_item_weightf(p->first);
} else {
- util = (double)p->second.kb_used / (double)p->second.kb;
+ osd_util.second = (double)p->second.kb_used / (double)p->second.kb;
}
+ util_by_osd.push_back(osd_util);
+ }
+
+ // sort by absolute deviation from the mean utilization,
+ // in descending order.
+ std::sort(util_by_osd.begin(), util_by_osd.end(), Sorter(average_util));
+
+ OSDMap::Incremental newinc;
+
+ if (f)
+ f->open_array_section("reweights");
+
+ for (std::vector<std::pair<int, float> >::const_iterator p =
+ util_by_osd.begin();
+ p != util_by_osd.end();
+ ++p) {
+ float util = p->second;
+
if (util >= overload_util) {
- sep = ", ";
// Assign a lower weight to overloaded OSDs. The current weight
// is a factor to take into account the original weights,
// to represent e.g. differing storage capacities
unsigned weight = osdmap.get_weight(p->first);
unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
- pending_inc.new_weight[p->first] = new_weight;
- char buf[128];
- snprintf(buf, sizeof(buf), "osd.%d [%04f -> %04f]", p->first,
- (float)weight / (float)0x10000,
- (float)new_weight / (float)0x10000);
- oss << buf << sep;
- changed = true;
+ if (weight > max_change)
+ new_weight = MAX(new_weight, weight - max_change);
+ newinc.new_weight[p->first] = new_weight;
+ if (!dry_run) {
+ pending_inc.new_weight[p->first] = new_weight;
+ changed = true;
+ }
+ if (f) {
+ f->open_object_section("osd");
+ f->dump_unsigned("osd", p->first);
+ f->dump_float("weight", (float)weight / (float)0x10000);
+ f->dump_float("new_weight", (float)new_weight / (float)0x10000);
+ f->close_section();
+ } else {
+ char buf[128];
+ snprintf(buf, sizeof(buf), "osd.%d weight %04f -> %04f\n", p->first,
+ (float)weight / (float)0x10000,
+ (float)new_weight / (float)0x10000);
+ oss << buf;
+ }
+ if (++num_changed >= max_osds)
+ break;
}
- if (util <= underload_util) {
+ if (!no_increasing && util <= underload_util) {
// assign a higher weight.. if we can.
unsigned weight = osdmap.get_weight(p->first);
unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
+ new_weight = MIN(new_weight, weight + max_change);
if (new_weight > 0x10000)
new_weight = 0x10000;
if (new_weight > weight) {
- sep = ", ";
- pending_inc.new_weight[p->first] = new_weight;
+ newinc.new_weight[p->first] = new_weight;
+ if (!dry_run) {
+ pending_inc.new_weight[p->first] = new_weight;
+ changed = true;
+ }
char buf[128];
- snprintf(buf, sizeof(buf), "osd.%d [%04f -> %04f]", p->first,
+ snprintf(buf, sizeof(buf), "osd.%d weight %04f -> %04f\n", p->first,
(float)weight / (float)0x10000,
(float)new_weight / (float)0x10000);
- oss << buf << sep;
- changed = true;
+ oss << buf;
+ if (++num_changed >= max_osds)
+ break;
}
}
}
- if (sep.empty()) {
- oss << "(none)";
+ if (f) {
+ f->close_section();
+ }
+
+ OSDMap newmap;
+ newmap.deepish_copy_from(osdmap);
+ newinc.fsid = newmap.fsid;
+ newinc.epoch = newmap.get_epoch() + 1;
+ newmap.apply_incremental(newinc);
+
+ osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
+
+ if (f) {
+ f->close_section();
+ } else {
+ *out_str += "\n";
+ *out_str += oss.str();
}
- out_str = oss.str();
dout(10) << "reweight_by_utilization: finished with " << out_str << dendl;
return changed;
}
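OSD weights are 16.16 fixed point (0x10000 == 1.0), so max_change is converted the same way and the new weight is clamped to within max_change of the old one. For example, weight 1.0 with a proposed drop to 0.6 and max_change 0.05 yields 0.95. A sketch of the clamp (the `decreasing` flag abstracts the overload/underload branches above):

#include <algorithm>

const unsigned ONE = 0x10000;  // 1.0 in 16.16 fixed point

unsigned clamp_reweight(unsigned weight, double average_util, double util,
                        double max_changef, bool decreasing)
{
  unsigned max_change = (unsigned)(max_changef * (double)ONE);
  unsigned new_weight = (unsigned)((average_util / util) * (double)weight);
  if (decreasing && weight > max_change)
    new_weight = std::max(new_weight, weight - max_change);   // cap the drop
  if (!decreasing) {
    new_weight = std::min(new_weight, weight + max_change);   // cap the raise
    new_weight = std::min(new_weight, ONE);                   // never above 1.0
  }
  return new_weight;
}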
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
+ // already pending failure?
+ if (pending_inc.new_state.count(target_osd) &&
+ pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
+ dout(10) << " already pending failure" << dendl;
+ return true;
+ }
+
utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
utime_t max_failed_since = fi.get_failed_since();
utime_t failed_for = now - max_failed_since;
<< grace << " grace (" << orig_grace << " + " << my_grace << " + " << peer_grace << "), max_failed_since " << max_failed_since
<< dendl;
- // already pending failure?
- if (pending_inc.new_state.count(target_osd) &&
- pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
- dout(10) << " already pending failure" << dendl;
- return true;
- }
-
if (failed_for >= grace &&
((int)fi.reporters.size() >= g_conf->mon_osd_min_down_reporters) &&
(fi.num_reports >= g_conf->mon_osd_min_down_reports)) {
failure_info.clear();
}
+static bool uses_gmt_hitset(const std::pair<int64_t, pg_pool_t>& pool) {
+ return pool.second.use_gmt_hitset;
+}
// boot --
}
}
+ if (std::find_if(osdmap.get_pools().begin(),
+ osdmap.get_pools().end(),
+ uses_gmt_hitset) != osdmap.get_pools().end()) {
+ assert(osdmap.get_num_up_osds() == 0 ||
+ osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
+ if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
+ dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
+ << m->get_orig_source_inst()
+ << " doesn't announce support -- ignore" << dendl;
+ goto ignore;
+ }
+ }
+
// already booted?
if (osdmap.is_up(from) &&
osdmap.get_inst(from) == m->get_orig_source_inst()) {
continue;
}
+ int acting_primary = -1;
+ osdmap.pg_to_up_acting_osds(
+ p->first, NULL, NULL, NULL, &acting_primary);
+ if (acting_primary != from) {
+ /* If the source isn't the primary based on the current osdmap, we know
+ * that the interval changed and that we can discard this message.
+ * Indeed, we must do so to avoid bug #16127, since we can't otherwise determine
+ * which of two pg temp mappings on the same pg is more recent.
+ */
+ dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
+ << ": primary has changed" << dendl;
+ ignore_cnt++;
+ continue;
+ }
+
// removal?
if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
osdmap.primary_temp->count(p->first)))
ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
rdata.append(ds);
}
+ } else if (prefix == "osd utilization") {
+ string out;
+ osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
+ if (f)
+ f->flush(rdata);
+ else
+ rdata.append(out);
+ r = 0;
+ goto reply;
} else if (prefix == "osd find") {
int64_t osd;
if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
if (!p->is_tier() &&
(var == "hit_set_type" || var == "hit_set_period" ||
var == "hit_set_count" || var == "hit_set_fpp" ||
+ var == "use_gmt_hitset" ||
var == "target_max_objects" || var == "target_max_bytes" ||
var == "cache_target_full_ratio" ||
var == "cache_target_dirty_ratio" ||
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
f->dump_float("hit_set_fpp", bloomp->get_fpp());
}
+ } else if (var == "use_gmt_hitset") {
+ f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
} else if (var == "target_max_objects") {
f->dump_unsigned("target_max_objects", p->target_max_objects);
} else if (var == "target_max_bytes") {
}
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
ss << "hit_set_fpp: " << bloomp->get_fpp();
+ } else if (var == "use_gmt_hitset") {
+ ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
} else if (var == "target_max_objects") {
ss << "target_max_objects: " << p->target_max_objects;
} else if (var == "target_max_bytes") {
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
- float warn_threshold = g_conf->mon_pool_quota_warn_threshold/100;
- float crit_threshold = g_conf->mon_pool_quota_crit_threshold/100;
+ float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
+ float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
if (pool.quota_max_objects > 0) {
stringstream ss;
pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
if (g_conf->osd_pool_default_flag_nosizechange)
pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
+ if (g_conf->osd_pool_use_gmt_hitset &&
+ (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
+ pi->use_gmt_hitset = true;
+ else
+ pi->use_gmt_hitset = false;
pi->size = size;
pi->min_size = min_size;
}
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
bloomp->set_fpp(f);
+ } else if (var == "use_gmt_hitset") {
+ if (val == "true" || (interr.empty() && n == 1)) {
+ if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
+ ss << "not all OSDs support GMT hit set.";
+ return -EINVAL;
+ }
+ p.use_gmt_hitset = true;
+ } else {
+ ss << "expecting value 'true' or '1'";
+ return -EINVAL;
+ }
} else if (var == "debug_fake_ec_pool") {
if (val == "true" || (interr.empty() && n == 1)) {
p.flags |= pg_pool_t::FLAG_DEBUG_FAKE_EC_POOL;
get_last_committed() + 1));
return true;
- } else if (prefix == "osd reweight-by-utilization") {
- int64_t oload;
- cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
- string out_str;
- err = reweight_by_utilization(oload, out_str, false, NULL);
- if (err < 0) {
- ss << "FAILED reweight-by-utilization: " << out_str;
- } else if (err == 0) {
- ss << "no change: " << out_str;
- } else {
- ss << "SUCCESSFUL reweight-by-utilization: " << out_str;
- getline(ss, rs);
- wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
- get_last_committed() + 1));
- return true;
- }
- } else if (prefix == "osd reweight-by-pg") {
+ } else if (prefix == "osd reweight-by-pg" ||
+ prefix == "osd reweight-by-utilization" ||
+ prefix == "osd test-reweight-by-pg" ||
+ prefix == "osd test-reweight-by-utilization") {
+ bool by_pg =
+ prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
+ bool dry_run =
+ prefix == "osd test-reweight-by-pg" ||
+ prefix == "osd test-reweight-by-utilization";
int64_t oload;
cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
set<int64_t> pools;
}
pools.insert(pool);
}
+ double max_change = g_conf->mon_reweight_max_change;
+ cmd_getval(g_ceph_context, cmdmap, "max_change", max_change);
+ if (max_change <= 0.0) {
+ ss << "max_change " << max_change << " must be positive";
+ err = -EINVAL;
+ goto reply;
+ }
+ int64_t max_osds = g_conf->mon_reweight_max_osds;
+ cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds);
+ string no_increasing;
+ cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
string out_str;
- err = reweight_by_utilization(oload, out_str, true,
- pools.empty() ? NULL : &pools);
+ err = reweight_by_utilization(oload,
+ max_change,
+ max_osds,
+ by_pg,
+ pools.empty() ? NULL : &pools,
+ no_increasing == "--no-increasing",
+ dry_run,
+ &ss, &out_str, f.get());
+ if (f)
+ f->flush(rdata);
+ else
+ rdata.append(out_str);
if (err < 0) {
- ss << "FAILED reweight-by-pg: " << out_str;
+ ss << "FAILED reweight-by-pg";
} else if (err == 0) {
- ss << "no change: " << out_str;
+ ss << "no change";
} else {
- ss << "SUCCESSFUL reweight-by-pg: " << out_str;
- getline(ss, rs);
- wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
- get_last_committed() + 1));
+ ss << "SUCCESSFUL reweight-by-pg";
+ wait_for_finished_proposal(
+ new Monitor::C_Command(mon, m, 0, rs, rdata, get_last_committed() + 1));
return true;
}
} else if (prefix == "osd thrash") {
void send_incremental(PaxosServiceMessage *m, epoch_t first);
void send_incremental(epoch_t first, MonSession *session, bool onetime);
- int reweight_by_utilization(int oload, std::string& out_str, bool by_pg,
- const set<int64_t> *pools);
-
+ int reweight_by_utilization(int oload,
+ double max_change,
+ int max_osds,
+ bool by_pg,
+ const set<int64_t> *pools,
+ bool no_increasing,
+ bool dry_run,
+ std::stringstream *ss,
+ std::string *out_str,
+ Formatter *f);
void print_utilization(ostream &out, Formatter *f, bool tree) const;
bool check_source(PaxosServiceMessage *m, uuid_d fsid);
//void PGMonitor::dump_object_stat_sum(stringstream& ss, Formatter *f,
void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
object_stat_sum_t &sum, uint64_t avail,
- bool verbose)
+ float raw_used_rate, bool verbose)
{
+ float curr_object_copies_rate = 0.0;
+ if (sum.num_object_copies > 0)
+ curr_object_copies_rate = (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
+
if (f) {
f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
f->dump_int("bytes_used", sum.num_bytes);
f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
f->dump_int("wr", sum.num_wr);
f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
+ f->dump_int("raw_bytes_used", sum.num_bytes * raw_used_rate * curr_object_copies_rate);
}
} else {
tbl << stringify(si_t(sum.num_bytes));
int64_t kb_used = SHIFT_ROUND_UP(sum.num_bytes, 10);
float used = 0.0;
if (pg_map.osd_sum.kb > 0)
- used = (float)kb_used / pg_map.osd_sum.kb;
+ used = (float)kb_used * raw_used_rate * curr_object_copies_rate / pg_map.osd_sum.kb;
tbl << percentify(used*100);
tbl << si_t(avail);
tbl << sum.num_objects;
if (verbose) {
tbl << stringify(si_t(sum.num_objects_dirty))
- << stringify(si_t(sum.num_rd))
- << stringify(si_t(sum.num_wr));
+ << stringify(si_t(sum.num_rd))
+ << stringify(si_t(sum.num_wr))
+ << stringify(si_t(sum.num_bytes * raw_used_rate * curr_object_copies_rate));
}
}
}
tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
}
}
pool->get_type(),
pool->get_size());
int64_t avail;
+ float raw_used_rate;
if (avail_by_rule.count(ruleno) == 0) {
avail = get_rule_avail(osdmap, ruleno);
if (avail < 0)
switch (pool->get_type()) {
case pg_pool_t::TYPE_REPLICATED:
avail /= pool->get_size();
+ raw_used_rate = pool->get_size();
break;
case pg_pool_t::TYPE_ERASURE:
- {
- const map<string,string>& ecp =
- osdmap.get_erasure_code_profile(pool->erasure_code_profile);
- map<string,string>::const_iterator pm = ecp.find("m");
- map<string,string>::const_iterator pk = ecp.find("k");
- if (pm != ecp.end() && pk != ecp.end()) {
- int k = atoi(pk->second.c_str());
- int m = atoi(pm->second.c_str());
- avail = avail * k / (m + k);
- }
+ {
+ const map<string,string>& ecp =
+ osdmap.get_erasure_code_profile(pool->erasure_code_profile);
+ map<string,string>::const_iterator pm = ecp.find("m");
+ map<string,string>::const_iterator pk = ecp.find("k");
+ if (pm != ecp.end() && pk != ecp.end()) {
+ int k = atoi(pk->second.c_str());
+ int m = atoi(pm->second.c_str());
+ avail = avail * k / (m + k);
+ raw_used_rate = (float)(m + k) / k;
+ } else {
+ raw_used_rate = 0.0;
}
break;
+ }
default:
assert(0 == "unrecognized pool type");
}
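dump_object_stat_sum() now scales stored bytes by raw_used_rate (the replication or erasure-coding amplification chosen in the switch above) and by the fraction of object copies that are not degraded. Worked numbers, as a sketch:

// replicated, size 3        -> raw_used_rate = 3.0
// erasure k=4, m=2          -> raw_used_rate = (4 + 2) / 4.0 = 1.5
// 100 objects at 3 copies = 300 copies, 30 degraded
//   -> copies_rate = (300 - 30) / 300.0 = 0.9
// raw bytes used ~= stored bytes * raw_used_rate * copies_rate
double raw_bytes_used(double bytes, double raw_used_rate,
                      long copies, long degraded)
{
  double copies_rate =
    copies > 0 ? (double)(copies - degraded) / copies : 0.0;
  return bytes * raw_used_rate * copies_rate;
}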
if (verbose)
tbl << "-";
}
- dump_object_stat_sum(tbl, f, stat.stats.sum, avail, verbose);
+ dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose);
if (f)
f->close_section(); // stats
else
vector<string>& args) const;
void dump_object_stat_sum(TextTable &tbl, Formatter *f,
- object_stat_sum_t &sum,
+ object_stat_sum_t &sum,
uint64_t avail,
+ float raw_used_rate,
bool verbose);
int64_t get_rule_avail(OSDMap& osdmap, int ruleno);
* held by somebody trying to make use of the SimpleMessenger lock.
* So drop locks, wait, and retry. It just looks like a slow network
* to everybody else.
+ *
+ * We take a ref to existing here since it might get reaped before we
+ * wake up (see bug #15870). We can be confident that it lived until we
+ * locked it, since we held the msgr lock from _lookup_pipe through to
+ * locking existing->lock and checking reader_dispatching.
*/
+ existing->get();
pipe_lock.Unlock();
msgr->lock.Unlock();
existing->notify_on_dispatch_done = true;
while (existing->reader_dispatching)
existing->cond.Wait(existing->pipe_lock);
existing->pipe_lock.Unlock();
+ existing->put();
+ existing = 0;
goto retry_existing_lookup;
}
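The get()/put() pair added above is the usual take-a-reference-before-dropping-the-lock idiom; expressed with std::shared_ptr instead of Pipe's intrusive refcounts, the shape is (hypothetical names):

#include <memory>
#include <mutex>

struct Pipe { /* ... */ };

std::mutex msgr_lock;
std::shared_ptr<Pipe> lookup;  // protected by msgr_lock

void wait_on_pipe()
{
  std::unique_lock<std::mutex> l(msgr_lock);
  // copy the shared_ptr while the lock pins the object alive;
  // the copy keeps it alive after we unlock
  std::shared_ptr<Pipe> existing = lookup;
  l.unlock();
  // ... safe to block on existing here; it cannot be reaped under us
}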
{
int flags, ret;
- if (aio && !directio) {
- derr << "FileJournal::_open: aio not supported without directio; disabling aio" << dendl;
- aio = false;
- }
-#ifndef HAVE_LIBAIO
- if (aio) {
- derr << "FileJournal::_open: libaio not compiled in; disabling aio" << dendl;
- aio = false;
- }
-#endif
-
if (forwrite) {
flags = O_RDWR;
if (directio)
return 0;
}
+// This cannot be used on an active journal
int FileJournal::check()
{
int ret;
+ assert(fd == -1);
ret = _open(false, false);
if (ret)
- goto done;
+ return ret;
- ret = read_header();
+ ret = read_header(&header);
if (ret < 0)
goto done;
ret = 0;
done:
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- fd = -1;
+ close();
return ret;
}
header.start = get_top();
header.start_seq = 0;
- print_header();
+ print_header(header);
// static zeroed buffer for alignment padding
delete [] zero_buf;
return ret;
}
+// This cannot be used on an active journal
int FileJournal::peek_fsid(uuid_d& fsid)
{
+ assert(fd == -1);
int r = _open(false, false);
if (r)
return r;
- r = read_header();
+ r = read_header(&header);
if (r < 0)
- return r;
+ goto out;
fsid = header.fsid;
- return 0;
+out:
+ close();
+ return r;
}
int FileJournal::open(uint64_t fs_op_seq)
write_pos = get_top();
// read header?
- err = read_header();
+ err = read_header(&header);
if (err < 0)
return err;
return 0;
}
+void FileJournal::_close(int fd) const
+{
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
void FileJournal::close()
{
dout(1) << "close " << fn << dendl;
assert(writeq_empty());
assert(!must_write_header);
assert(fd >= 0);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
+ _close(fd);
fd = -1;
}
int FileJournal::dump(ostream& out)
{
- int err = 0;
+ return _dump(out, false);
+}
+
+int FileJournal::simple_dump(ostream& out)
+{
+ return _dump(out, true);
+}
+
+int FileJournal::_dump(ostream& out, bool simple)
+{
+ JSONFormatter f(true);
+ int ret = _fdump(f, simple);
+ f.flush(out);
+ return ret;
+}
+
+int FileJournal::_fdump(Formatter &f, bool simple)
+{
+ dout(10) << "_fdump" << dendl;
- dout(10) << "dump" << dendl;
- err = _open(false, false);
+ assert(fd == -1);
+ int err = _open(false, false);
if (err)
return err;
- err = read_header();
- if (err < 0)
+ err = read_header(&header);
+ if (err < 0) {
+ close();
return err;
+ }
- read_pos = header.start;
+ off64_t next_pos = header.start;
- JSONFormatter f(true);
+ f.open_object_section("journal");
- f.open_array_section("journal");
- uint64_t seq = 0;
+ f.open_object_section("header");
+ f.dump_unsigned("flags", header.flags);
+ ostringstream os;
+ os << header.fsid;
+ f.dump_string("fsid", os.str());
+ f.dump_unsigned("block_size", header.block_size);
+ f.dump_unsigned("alignment", header.alignment);
+ f.dump_int("max_size", header.max_size);
+ f.dump_int("start", header.start);
+ f.dump_unsigned("committed_up_to", header.committed_up_to);
+ f.dump_unsigned("start_seq", header.start_seq);
+ f.close_section();
+
+ f.open_array_section("entries");
+ uint64_t seq = header.start_seq;
while (1) {
bufferlist bl;
- uint64_t pos = read_pos;
- if (!read_entry(bl, seq)) {
- dout(3) << "journal_replay: end of journal, done." << dendl;
+ off64_t pos = next_pos;
+
+ if (!pos) {
+ dout(2) << "_dump -- not readable" << dendl;
+ err = -EINVAL;
+ break;
+ }
+ stringstream ss;
+ read_entry_result result = do_read_entry(
+ pos,
+ &next_pos,
+ &bl,
+ &seq,
+ &ss);
+ if (result != SUCCESS) {
+ if (seq < header.committed_up_to) {
+ dout(2) << "Unable to read past sequence " << seq
+ << " but header indicates the journal has committed up through "
+ << header.committed_up_to << ", journal is corrupt" << dendl;
+ err = -EINVAL;
+ }
+ dout(25) << ss.str() << dendl;
+ dout(25) << "No further valid entries found, journal is most likely valid"
+ << dendl;
break;
}
f.open_object_section("entry");
f.dump_unsigned("offset", pos);
f.dump_unsigned("seq", seq);
- f.open_array_section("transactions");
- bufferlist::iterator p = bl.begin();
- int trans_num = 0;
- while (!p.end()) {
- ObjectStore::Transaction *t = new ObjectStore::Transaction(p);
- f.open_object_section("transaction");
- f.dump_unsigned("trans_num", trans_num);
- t->dump(&f);
+ if (simple) {
+ f.dump_unsigned("bl.length", bl.length());
+ } else {
+ f.open_array_section("transactions");
+ bufferlist::iterator p = bl.begin();
+ int trans_num = 0;
+ while (!p.end()) {
+ ObjectStore::Transaction *t = new ObjectStore::Transaction(p);
+ f.open_object_section("transaction");
+ f.dump_unsigned("trans_num", trans_num);
+ t->dump(&f);
+ f.close_section();
+ delete t;
+ trans_num++;
+ }
f.close_section();
- delete t;
- trans_num++;
}
f.close_section();
- f.close_section();
- f.flush(cout);
}
+ f.close_section();
f.close_section();
dout(10) << "dump finish" << dendl;
- return 0;
+
+ close();
+ return err;
}
void FileJournal::stop_writer()
{
+ // Do nothing if writer already stopped or never started
+ if (!write_stop)
{
- Mutex::Locker l(write_lock);
- Mutex::Locker p(writeq_lock);
- write_stop = true;
- writeq_cond.Signal();
- // Doesn't hurt to signal commit_cond in case thread is waiting there
- // and caller didn't use committed_thru() first.
- commit_cond.Signal();
+ {
+ Mutex::Locker l(write_lock);
+ Mutex::Locker p(writeq_lock);
+ write_stop = true;
+ writeq_cond.Signal();
+ // Doesn't hurt to signal commit_cond in case thread is waiting there
+ // and caller didn't use committed_thru() first.
+ commit_cond.Signal();
+ }
+ write_thread.join();
+
+ // write journal header now so that we have less to replay on remount
+ write_header_sync();
}
- write_thread.join();
#ifdef HAVE_LIBAIO
// stop aio completion thread *after* writer thread has stopped
// and has submitted all of its io
- if (aio) {
+ if (aio && !aio_stop) {
aio_lock.Lock();
aio_stop = true;
aio_cond.Signal();
-void FileJournal::print_header()
+void FileJournal::print_header(const header_t &header) const
{
dout(10) << "header: block_size " << header.block_size
<< " alignment " << header.alignment
dout(10) << " write_pos " << write_pos << dendl;
}
-int FileJournal::read_header()
+int FileJournal::read_header(header_t *hdr) const
{
dout(10) << "read_header" << dendl;
bufferlist bl;
try {
bufferlist::iterator p = bl.begin();
- ::decode(header, p);
+ ::decode(*hdr, p);
}
catch (buffer::error& e) {
derr << "read_header error decoding journal header" << dendl;
* remove this or else this (eventually old) code will clobber newer
* code's flags.
*/
- if (header.flags > 3) {
+ if (hdr->flags > 3) {
derr << "read_header appears to have gibberish flags; assuming 0" << dendl;
- header.flags = 0;
+ hdr->flags = 0;
}
- print_header();
+ print_header(*hdr);
return 0;
}
return bp;
}
-
+void FileJournal::write_header_sync()
+{
+ Mutex::Locker locker(write_lock);
+ must_write_header = true;
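+  // with must_write_header set, do_write() on an empty bufferlist
+  // flushes just the header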
+ bufferlist bl;
+ do_write(bl);
+ dout(20) << __func__ << " finish" << dendl;
+}
int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size)
{
put_throttle(1, peek_write().bl.length());
pop_write();
}
- print_header();
+ print_header(header);
}
return -ENOSPC; // hrm, full on first op
put_throttle(1, peek_write().bl.length());
pop_write();
}
- print_header();
+ print_header(header);
r = 0;
} else {
dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl;
}
must_write_header = true;
- print_header();
+ print_header(header);
// committed but unjournaled items
while (!writeq_empty() && peek_write().seq <= seq) {
int64_t olen,
bufferlist* bl,
off64_t *out_pos
- )
+ ) const
{
while (olen > 0) {
while (pos >= header.max_size)
&seq,
&ss);
if (result == SUCCESS) {
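+    // do_read_entry() is const now and no longer records entries itself,
+    // so maintain journalq here in the caller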
+    journalq.push_back(pair<uint64_t,off64_t>(seq, pos));
if (next_seq > seq) {
return false;
} else {
}
stringstream errss;
- if (seq < header.committed_up_to) {
+ if (seq && seq < header.committed_up_to) {
derr << "Unable to read past sequence " << seq
<< " but header indicates the journal has committed up through "
<< header.committed_up_to << ", journal is corrupt" << dendl;
bufferlist *bl,
uint64_t *seq,
ostream *ss,
- entry_header_t *_h)
+ entry_header_t *_h) const
{
off64_t cur_pos = init_pos;
bufferlist _bl;
if (seq)
*seq = h->seq;
- // works around an apparent GCC 4.8(?) compiler bug about unaligned
- // bind by reference to (packed) h->seq
- journalq.push_back(
- pair<uint64_t,off64_t>(static_cast<uint64_t>(h->seq),
- static_cast<off64_t>(init_pos)));
if (next_pos)
*next_pos = cur_pos;
start = block_size;
}
- uint64_t get_fsid64() {
+ uint64_t get_fsid64() const {
return *(uint64_t*)&fsid.uuid[0];
}
}
} __attribute__((__packed__, aligned(4)));
+ bool journalq_empty() { return journalq.empty(); }
+
private:
string fn;
int _open(bool wr, bool create=false);
int _open_block_device();
+ void _close(int fd) const;
void _check_disk_write_cache() const;
int _open_file(int64_t oldsize, blksize_t blksize, bool create);
- void print_header();
- int read_header();
+ int _dump(ostream& out, bool simple);
+ void print_header(const header_t &hdr) const;
+ int read_header(header_t *hdr) const;
bufferptr prepare_header();
void start_writer();
void stop_writer();
int64_t len, ///< [in] length to read
bufferlist* bl, ///< [out] result
off64_t *out_pos ///< [out] next position to read, will be wrapped
- );
+ ) const;
void do_discard(int64_t offset, int64_t end);
}
} write_finish_thread;
- off64_t get_top() {
+ off64_t get_top() const {
return ROUND_UP_TO(sizeof(header), block_size);
}
throttle_ops(g_ceph_context, "filestore_ops", g_conf->journal_queue_max_ops),
throttle_bytes(g_ceph_context, "filestore_bytes", g_conf->journal_queue_max_bytes),
write_lock("FileJournal::write_lock", false, true, false, g_ceph_context),
- write_stop(false),
- aio_stop(false),
+ write_stop(true),
+ aio_stop(true),
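+    // threads start out "stopped"; start_writer() re-arms these flags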
write_thread(this),
- write_finish_thread(this) { }
+ write_finish_thread(this) {
+
+ if (aio && !directio) {
+    derr << "FileJournal: aio not supported without directio; disabling aio" << dendl;
+ aio = false;
+ }
+#ifndef HAVE_LIBAIO
+ if (aio) {
+    derr << "FileJournal: libaio not compiled in; disabling aio" << dendl;
+ aio = false;
+ }
+#endif
+ }
~FileJournal() {
+ assert(fd == -1);
delete[] zero_buf;
}
int peek_fsid(uuid_d& fsid);
int dump(ostream& out);
+ int simple_dump(ostream& out);
+ int _fdump(Formatter &f, bool simple);
void flush();
return full_state != FULL_NOTFULL && !write_stop;
}
+ void write_header_sync();
+
void set_wait_on_full(bool b) { wait_on_full = b; }
// reads
uint64_t *seq, ///< [out] seq of successful read
ostream *ss, ///< [out] error output
entry_header_t *h = 0 ///< [out] header
- ); ///< @return result code
+ ) const; ///< @return result code
bool read_entry(
bufferlist &bl,
if (r < 0)
goto out;
r = lfn_created(path_comp, oid, short_name);
- if (r < 0)
+ if (r < 0) {
+ if (failed) {
+ /* This is hacky, but the only way we get ENOENT from lfn_created here is
+ * if we did a failure injection in _created below AND actually started the
+     * split or merge. In that case, lfn_created already succeeded, and
+ * WRAP_RETRY already cleaned it up and we are actually done. In a real
+ * failure, the filestore itself would have ended up calling this with
+ * the new path, not the old one, so we'd find it.
+ */
+ r = 0;
+ }
goto out;
+ }
r = _created(path_comp, oid, short_name);
if (r < 0)
goto out;
if (!lfn_is_hashed_filename(short_name)) {
return lfn_parse_object_name(short_name, out);
}
- // Get lfn_attr
string full_path = get_full_path(path, short_name);
char attr[PATH_MAX];
- int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(), attr, sizeof(attr) - 1);
+ // First, check alt attr
+ int r = chain_getxattr(
+ full_path.c_str(),
+ get_alt_lfn_attr().c_str(),
+ attr,
+ sizeof(attr) - 1);
+ if (r >= 0) {
+ // There is an alt attr, does it match?
+ if (r < (int)sizeof(attr))
+ attr[r] = '\0';
+ if (short_name_matches(short_name.c_str(), attr)) {
+ string long_name(attr);
+ return lfn_parse_object_name(long_name, out);
+ }
+ }
+
+ // Get lfn_attr
+ r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(), attr, sizeof(attr) - 1);
if (r < 0)
return -errno;
if (r < (int)sizeof(attr))
}
}
+bool LFNIndex::short_name_matches(const char *short_name, const char *cand_long_name)
+{
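+  // walk back from the end of short_name to isolate its trailing
+  // "_<index>_<cookie>" suffix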
+ const char *end = short_name;
+ while (*end) ++end;
+ const char *suffix = end;
+ if (suffix > short_name) --suffix; // last char
+ while (suffix > short_name && *suffix != '_') --suffix; // back to first _
+ if (suffix > short_name) --suffix; // one behind that
+ while (suffix > short_name && *suffix != '_') --suffix; // back to second _
+
+ int index = -1;
+ char buf[FILENAME_SHORT_LEN + 4];
+ assert((end - suffix) < (int)sizeof(buf));
+ int r = sscanf(suffix, "_%d_%s", &index, buf);
+ if (r < 2)
+ return false;
+ if (strcmp(buf, FILENAME_COOKIE.c_str()) != 0)
+ return false;
+ build_filename(cand_long_name, index, buf, sizeof(buf));
+ return strcmp(short_name, buf) == 0;
+}
+
string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i)
{
string long_name = lfn_generate_object_name(oid);
const string &attr ///< [in] Attribute to mangle.
); ///< @return Mangled attribute name.
+ /// checks whether long_name could hash to short_name
+ bool short_name_matches(
+ const char *short_name, ///< [in] name to check against
+ const char *cand_long_name ///< [in] candidate long name
+ );
+
/// Builds hashed filename
void build_filename(
const char *old_filename, ///< [in] Filename to convert.
return -EINVAL;
}
- if (g_conf->leveldb_compact_on_mount) {
- derr << "Compacting leveldb store..." << dendl;
- compact();
- derr << "Finished compacting leveldb store" << dendl;
- }
-
PerfCountersBuilder plb(g_ceph_context, "leveldb", l_leveldb_first, l_leveldb_last);
plb.add_u64_counter(l_leveldb_gets, "leveldb_get");
plb.add_u64_counter(l_leveldb_txns, "leveldb_transaction");
plb.add_u64(l_leveldb_compact_queue_len, "leveldb_compact_queue_len");
logger = plb.create_perf_counters();
cct->get_perfcounters_collection()->add(logger);
+
+ if (g_conf->leveldb_compact_on_mount) {
+ derr << "Compacting leveldb store..." << dendl;
+ compact();
+ derr << "Finished compacting leveldb store" << dendl;
+ }
return 0;
}
from[i->first.shard].claim(i->second);
}
dout(10) << __func__ << ": " << from << dendl;
- ECUtil::decode(sinfo, ec_impl, from, target);
+ if (ECUtil::decode(sinfo, ec_impl, from, target) != 0) {
+    derr << __func__ << ": inconsistent shard sizes " << hoid
+         << "; the offending shard must be manually removed"
+         << " after verifying there are enough shards to recover "
+ << "(" << to_read.get<0>()
+ << ", " << to_read.get<1>()
+ << ", " << to_read.get<2>()
+ << ")"
+ << dendl;
+ assert(0);
+ }
if (attrs) {
op.xattrs.swap(*attrs);
op->on_local_applied_sync = 0;
} else {
MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop);
- r->set_priority(cct->_conf->osd_client_op_priority);
r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard);
r->map_epoch = get_parent()->get_epoch();
get_parent()->send_message_osd_cluster(
c)));
start_read_op(
- cct->_conf->osd_client_op_priority,
+ CEPH_MSG_PRIO_DEFAULT,
for_read_op,
OpRequestRef());
return;
for (map<int, bufferlist>::iterator i = to_decode.begin();
i != to_decode.end();
++i) {
- assert(i->second.length() == total_chunk_size);
+ if (i->second.length() != total_chunk_size)
+ return -EINVAL;
}
if (total_chunk_size == 0)
pg_epoch_lock("OSDService::pg_epoch_lock"),
publish_lock("OSDService::publish_lock"),
pre_publish_lock("OSDService::pre_publish_lock"),
+ max_oldest_map(0),
peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
scrubs_active(0),
OSDSuperblock& sblock)
{
MOSDMap *m = new MOSDMap(monc->get_fsid());
- m->oldest_map = sblock.oldest_map;
+ m->oldest_map = max_oldest_map.read();
m->newest_map = sblock.newest_map;
for (epoch_t e = to; e > since; e--) {
if (since < sblock.oldest_map) {
// just send latest full map
MOSDMap *m = new MOSDMap(monc->get_fsid());
- m->oldest_map = sblock.oldest_map;
+ m->oldest_map = max_oldest_map.read();
m->newest_map = sblock.newest_map;
get_map_bl(to, m->maps[to]);
send_map(m, con);
dout(2) << "boot" << dendl;
+ int rotating_auth_attempts = 0;
+ const int max_rotating_auth_attempts = 10;
+
// read superblock
r = read_superblock();
if (r < 0) {
service.init();
service.publish_map(osdmap);
service.publish_superblock(superblock);
+ service.max_oldest_map.set(superblock.oldest_map);
osd_lock.Unlock();
while (monc->wait_auth_rotating(30.0) < 0) {
derr << "unable to obtain rotating service keys; retrying" << dendl;
+ ++rotating_auth_attempts;
+    if (rotating_auth_attempts > max_rotating_auth_attempts) {
+      osd_lock.Lock(); // make locker happy
+      if (!is_stopping()) {
+        r = -ETIMEDOUT;
+      }
+      goto monout;
+    }
}
osd_lock.Lock();
test_ops_hook,
"inject metadata error");
assert(r == 0);
+ r = admin_socket->register_command(
+ "set_recovery_delay",
+ "set_recovery_delay " \
+ "name=utime,type=CephInt,req=false",
+ test_ops_hook,
+ "Delay osd recovery by specified seconds");
+ assert(r == 0);
}
void OSD::create_logger()
cct->get_admin_socket()->unregister_command("truncobj");
cct->get_admin_socket()->unregister_command("injectdataerr");
cct->get_admin_socket()->unregister_command("injectmdataerr");
+ cct->get_admin_socket()->unregister_command("set_recovery_delay");
delete test_ops_hook;
test_ops_hook = NULL;
derr << __func__ << ": could not find map for epoch " << map_epoch
<< " on pg " << pgid << ", but the pool is not present in the "
<< "current map, so this is probably a result of bug 10617. "
- << "Skipping the pg for now, you can use ceph_objectstore_tool "
+ << "Skipping the pg for now, you can use ceph-objectstore-tool "
<< "to clean it up later." << dendl;
continue;
} else {
PG *pg = i->second;
epoch_t start, end;
- if (!pg->_calc_past_interval_range(&start, &end, superblock.oldest_map))
+ if (!pg->_calc_past_interval_range(&start, &end, superblock.oldest_map)) {
+ if (pg->info.history.same_interval_since == 0)
+ pg->info.history.same_interval_since = end;
continue;
+ }
dout(10) << pg->info.pgid << " needs " << start << "-" << end << dendl;
pistate& p = pis[pg];
}
}
+ // Now that past_intervals have been recomputed let's fix the same_interval_since
+ // if it was cleared by import.
+ for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
+ PG *pg = i->first;
+ pistate& p = i->second;
+
+ if (pg->info.history.same_interval_since == 0) {
+ assert(p.same_interval_since);
+ dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
+ dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
+ // Fix it
+ pg->info.history.same_interval_since = p.same_interval_since;
+ }
+ }
+
// write info only at the end. this is necessary because we check
// whether the past_intervals go far enough back or forward in time,
// but we don't check for holes. we could avoid it by discarding
// truncobj <pool-id> [namespace/]<obj-name> <newlen>
// injectmdataerr [namespace/]<obj-name>
// injectdataerr [namespace/]<obj-name>
+//
+// set_recovery_delay [utime]
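+//   (an admin socket command, e.g. ceph daemon osd.0 set_recovery_delay 60)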
void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
std::string command, cmdmap_t& cmdmap, ostream &ss)
{
}
return;
}
+ if (command == "set_recovery_delay") {
+ int64_t delay;
+ cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
+ ostringstream oss;
+ oss << delay;
+ int r = service->cct->_conf->set_val("osd_recovery_delay_start",
+ oss.str().c_str());
+ if (r != 0) {
+ ss << "set_recovery_delay: error setting "
+ << "osd_recovery_delay_start to '" << delay << "': error "
+ << r;
+ return;
+ }
+ service->cct->_conf->apply_changes(NULL);
+ ss << "set_recovery_delay: set osd_recovery_delay_start "
+ << "to " << service->cct->_conf->osd_recovery_delay_start;
+ return;
+ }
ss << "Internal error - command=" << command;
return;
}
} else {
register_session_waiting_on_map(session);
}
+ session->maybe_reset_osdmap();
}
assert(session->session_dispatch_lock.is_locked());
update_waiting_for_pg(session, osdmap);
session->waiting_for_pg.erase(pgid);
+ session->maybe_reset_osdmap();
clear_session_waiting_on_pg(session, pgid);
}
}
}
+void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
+{
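+  // never trim maps that the OSDService map cache may still hand out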
+ epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
+ if (superblock.oldest_map >= min)
+ return;
+
+ int num = 0;
+ ObjectStore::Transaction *t = NULL;
+ for (epoch_t e = superblock.oldest_map; e < min; ++e) {
+ dout(20) << " removing old osdmap epoch " << e << dendl;
+ if (!t) {
+ t = new ObjectStore::Transaction;
+ }
+ t->remove(META_COLL, get_osdmap_pobject_name(e));
+ t->remove(META_COLL, get_inc_osdmap_pobject_name(e));
+ superblock.oldest_map = e + 1;
+ num++;
+ if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
+ service.publish_superblock(superblock);
+ write_superblock(*t);
+ store->queue_transaction_and_cleanup(NULL, t);
+ t = NULL;
+ num = 0;
+ if (!skip_maps) {
+        // skip_maps leaves us with a range of old maps if we fail to remove all
+        // of them before moving superblock.oldest_map forward to the first map
+        // in the incoming MOSDMap msg. so we should keep removing them in that
+        // case, even though it may mean a huge series of delete transactions
+        // all at once.
+ break;
+ }
+ }
+ }
+ if (num > 0) {
+ service.publish_superblock(superblock);
+ write_superblock(*t);
+ store->queue_transaction_and_cleanup(NULL, t);
+ }
+ // we should not remove the cached maps
+ assert(min <= service.map_cache.cached_key_lower_bound());
+}
+
void OSD::handle_osd_map(MOSDMap *m)
{
assert(osd_lock.is_locked());
logger->inc(l_osd_mape, last - first + 1);
if (first <= osdmap->get_epoch())
logger->inc(l_osd_mape_dup, osdmap->get_epoch() - first + 1);
+ if (service.max_oldest_map.read() < m->oldest_map) {
+ service.max_oldest_map.set(m->oldest_map);
+ assert(service.max_oldest_map.read() >= superblock.oldest_map);
+ }
// make sure there is something new, here, before we bother flushing the queues and such
if (last <= osdmap->get_epoch()) {
<< " but failed to encode full with correct crc; requesting"
<< dendl;
clog->warn() << "failed to encode map e" << e << " with expected crc\n";
+ dout(20) << "my encoded map was:\n";
+ fbl.hexdump(*_dout);
+ *_dout << dendl;
delete o;
MMonGetOSDMap *req = new MMonGetOSDMap;
req->request_full(e, last);
}
if (superblock.oldest_map) {
- int num = 0;
- epoch_t min(
- MIN(m->oldest_map,
- service.map_cache.cached_key_lower_bound()));
- for (epoch_t e = superblock.oldest_map; e < min; ++e) {
- dout(20) << " removing old osdmap epoch " << e << dendl;
- t.remove(META_COLL, get_osdmap_pobject_name(e));
- t.remove(META_COLL, get_inc_osdmap_pobject_name(e));
- superblock.oldest_map = e+1;
- num++;
- if (num >= cct->_conf->osd_target_transaction_size &&
- (uint64_t)num > (last - first)) // make sure we at least keep pace with incoming maps
- break;
- }
+ // make sure we at least keep pace with incoming maps
+ trim_maps(m->oldest_map, last - first + 1, skip_maps);
}
if (!superblock.oldest_map || skip_maps)
for (set<spg_t>::iterator p = pgs_to_check.begin();
p != pgs_to_check.end();
++p) {
- vector<int> acting;
- int nrep = osdmap->pg_to_acting_osds(p->pgid, acting);
- int role = osdmap->calc_pg_role(whoami, acting, nrep);
-
- if (role < 0) {
+ if (!(osdmap->is_acting_osd_shard(p->pgid, whoami, p->shard))) {
set<Session*> concerned_sessions;
get_sessions_possibly_interested_in_pg(*p, &concerned_sessions);
for (set<Session*>::iterator i = concerned_sessions.begin();
int get_nodeid() const { return whoami; }
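+  // the highest oldest_map we have seen (from our superblock or from the
+  // monitor); advertised to peers instead of superblock.oldest_map, since
+  // local trimming may lag behind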
+ atomic_t max_oldest_map;
OSDMapRef osdmap;
OSDMapRef get_osdmap() {
Mutex::Locker l(publish_lock);
sent_epoch_lock("Session::sent_epoch_lock"), last_sent_epoch(0),
received_map_lock("Session::received_map_lock"), received_map_epoch(0)
{}
-
-
+ void maybe_reset_osdmap() {
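+    // drop the session's osdmap reference once no pgs are waiting, so the
+    // session does not pin an old map in memory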
+ if (waiting_for_pg.empty()) {
+ osdmap.reset();
+ }
+ }
};
void update_waiting_for_pg(Session *session, OSDMapRef osdmap);
void session_notify_pg_create(Session *session, OSDMapRef osdmap, spg_t pgid);
*/
session->waiting_on_map.clear();
session->waiting_for_pg.clear();
+ session->osdmap.reset();
}
void register_session_waiting_on_pg(Session *session, spg_t pgid) {
Mutex::Locker l(session_waiting_lock);
void wait_for_new_map(OpRequestRef op);
void handle_osd_map(class MOSDMap *m);
+ void trim_maps(epoch_t oldest, int nreceived, bool skip_maps);
void note_down_osd(int osd);
void note_up_osd(int osd);
// require the crush_v2 feature of clients
return 0;
}
+
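+// Compare PG mappings in this map against newmap (if non-NULL) and summarize
+// how many PG instances move and how balanced the per-OSD PG counts are,
+// reporting as plain text in *out and/or through the Formatter f.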
+int OSDMap::summarize_mapping_stats(
+ OSDMap *newmap,
+ const set<int64_t> *pools,
+ std::string *out,
+ Formatter *f) const
+{
+ set<int64_t> ls;
+ if (pools) {
+ ls = *pools;
+ } else {
+ for (map<int64_t, pg_pool_t>::const_iterator i = get_pools().begin();
+ i != get_pools().end();
+ ++i) {
+ ls.insert(i->first);
+ }
+ }
+
+ unsigned total_pg = 0;
+ unsigned moved_pg = 0;
+ vector<unsigned> base_by_osd(get_max_osd(), 0);
+ vector<unsigned> new_by_osd(get_max_osd(), 0);
+ for (set<int64_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ int64_t pool_id = *p;
+ const pg_pool_t *pi = get_pg_pool(pool_id);
+ vector<int> up, up2, acting;
+ int up_primary, acting_primary;
+ for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
+ pg_t pgid(ps, pool_id, -1);
+ total_pg += pi->get_size();
+ pg_to_up_acting_osds(pgid, &up, &up_primary,
+ &acting, &acting_primary);
+ for (vector<int>::iterator q = up.begin(); q != up.end(); ++q) {
+ int osd = *q;
+ if (osd >= 0 && osd < get_max_osd())
+ ++base_by_osd[osd];
+ }
+ if (newmap) {
+ newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary,
+ &acting, &acting_primary);
+ for (vector<int>::iterator q = up2.begin(); q != up2.end(); ++q) {
+ int osd = *q;
+ if (osd >= 0 && osd < get_max_osd())
+ ++new_by_osd[osd];
+ }
+ if (pi->type == pg_pool_t::TYPE_ERASURE) {
+ for (unsigned i=0; i<up.size(); ++i) {
+ if (up[i] != up2[i]) {
+ ++moved_pg;
+ }
+ }
+ } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
+ for (vector<int>::iterator q = up.begin(); q != up.end(); ++q) {
+ int osd = *q;
+ if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
+ ++moved_pg;
+ }
+ }
+ } else {
+ assert(0 == "unhandled pool type");
+ }
+ }
+ }
+ }
+
+ unsigned num_up_in = 0;
+ for (int osd = 0; osd < get_max_osd(); ++osd) {
+ if (is_up(osd) && is_in(osd))
+ ++num_up_in;
+ }
+ if (!num_up_in) {
+ return -EINVAL;
+ }
+
+ float avg_pg = (float)total_pg / (float)num_up_in;
+ float base_stddev = 0, new_stddev = 0;
+ int min = -1, max = -1;
+ unsigned min_base_pg = 0, max_base_pg = 0;
+ unsigned min_new_pg = 0, max_new_pg = 0;
+ for (int osd = 0; osd < get_max_osd(); ++osd) {
+ if (is_up(osd) && is_in(osd)) {
+ float base_diff = (float)base_by_osd[osd] - avg_pg;
+ base_stddev += base_diff * base_diff;
+ float new_diff = (float)new_by_osd[osd] - avg_pg;
+ new_stddev += new_diff * new_diff;
+    if (min < 0 || base_by_osd[osd] < min_base_pg) {
+ min = osd;
+ min_base_pg = base_by_osd[osd];
+ min_new_pg = new_by_osd[osd];
+ }
+    if (max < 0 || base_by_osd[osd] > max_base_pg) {
+ max = osd;
+ max_base_pg = base_by_osd[osd];
+ max_new_pg = new_by_osd[osd];
+ }
+ }
+ }
+ base_stddev = sqrt(base_stddev / num_up_in);
+ new_stddev = sqrt(new_stddev / num_up_in);
+
+ float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
+
+ ostringstream ss;
+ if (f)
+ f->open_object_section("utilization");
+ if (newmap) {
+ if (f) {
+ f->dump_unsigned("moved_pgs", moved_pg);
+ f->dump_unsigned("total_pgs", total_pg);
+ } else {
+ ss << "moved " << moved_pg << " / " << total_pg
+ << " (" << ((float)moved_pg * 100.0 / (float)total_pg) << "%)\n";
+ }
+ }
+ if (f) {
+ f->dump_float("avg_pgs", avg_pg);
+ f->dump_float("std_dev", base_stddev);
+ f->dump_float("expected_baseline_std_dev", edev);
+ if (newmap)
+ f->dump_float("new_std_dev", new_stddev);
+ } else {
+ ss << "avg " << avg_pg << "\n";
+ ss << "stddev " << base_stddev;
+ if (newmap)
+ ss << " -> " << new_stddev;
+ ss << " (expected baseline " << edev << ")\n";
+ }
+ if (min >= 0) {
+ if (f) {
+ f->dump_unsigned("min_osd", min);
+ f->dump_unsigned("min_osd_pgs", min_base_pg);
+ if (newmap)
+ f->dump_unsigned("new_min_osd_pgs", min_new_pg);
+ } else {
+ ss << "min osd." << min << " with " << min_base_pg;
+ if (newmap)
+ ss << " -> " << min_new_pg;
+ ss << " pgs (" << (float)min_base_pg / avg_pg;
+ if (newmap)
+ ss << " -> " << (float)min_new_pg / avg_pg;
+ ss << " * mean)\n";
+ }
+ }
+ if (max >= 0) {
+ if (f) {
+ f->dump_unsigned("max_osd", max);
+ f->dump_unsigned("max_osd_pgs", max_base_pg);
+ if (newmap)
+ f->dump_unsigned("new_max_osd_pgs", max_new_pg);
+ } else {
+ ss << "max osd." << max << " with " << max_base_pg;
+ if (newmap)
+ ss << " -> " << max_new_pg;
+ ss << " pgs (" << (float)max_base_pg / avg_pg;
+ if (newmap)
+ ss << " -> " << (float)max_new_pg / avg_pg;
+ ss << " * mean)\n";
+ }
+ }
+ if (f)
+ f->close_section();
+ if (out)
+ *out = ss.str();
+ return 0;
+}
pg_temp.reset(new map<pg_t,vector<int32_t> >(*o.pg_temp));
osd_uuid.reset(new vector<uuid_d>(*o.osd_uuid));
+ if (o.osd_primary_affinity)
+ osd_primary_affinity.reset(new vector<__u32>(*o.osd_primary_affinity));
+
// NOTE: this still references shared entity_addr_t's.
osd_addrs.reset(new addrs_s(*o.osd_addrs));
return group[group.size()-1];
return -1; // we fail!
}
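+  // is osd in the acting set for pg?  for EC pgs the given shard must match
+  // the osd's position in the acting set; NO_SHARD falls back to the
+  // replicated role check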
+ bool is_acting_osd_shard(pg_t pg, int osd, shard_id_t shard) const {
+ vector<int> acting;
+ int nrep = pg_to_acting_osds(pg, acting);
+ if (shard == shard_id_t::NO_SHARD)
+ return calc_pg_role(osd, acting, nrep) >= 0;
+ if (shard >= (int)acting.size())
+ return false;
+ return acting[shard] == osd;
+ }
/* what replica # is a given osd? 0 primary, -1 for none. */
void print_oneline_summary(ostream& out) const;
void print_tree(ostream *out, Formatter *f) const;
+ int summarize_mapping_stats(
+ OSDMap *newmap,
+ const set<int64_t> *pools,
+ std::string *out,
+ Formatter *f) const;
+
string get_flag_string() const;
static string get_flag_string(unsigned flags);
static void dump_erasure_code_profiles(const map<string,map<string,string> > &profiles,
name = map->get_pool_name(id);
if (pi->get_snap_epoch() == map->get_epoch()) {
pi->build_removed_snaps(newly_removed_snaps);
- newly_removed_snaps.subtract(cached_removed_snaps);
- cached_removed_snaps.union_of(newly_removed_snaps);
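+      // removed snaps should only ever accumulate; if our cache is not a
+      // subset of the pool's current set, resync from the pool rather than
+      // subtracting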
+ interval_set<snapid_t> intersection;
+ intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
+ if (intersection == cached_removed_snaps) {
+ newly_removed_snaps.subtract(cached_removed_snaps);
+ cached_removed_snaps.union_of(newly_removed_snaps);
+ } else {
+ lgeneric_subdout(g_ceph_context, osd, 0) << __func__
+ << " cached_removed_snaps shrank from " << cached_removed_snaps
+ << " to " << newly_removed_snaps << dendl;
+ cached_removed_snaps = newly_removed_snaps;
+ newly_removed_snaps.clear();
+ }
snapc = pi->get_snap_context();
} else {
newly_removed_snaps.clear();
bool PG::_calc_past_interval_range(epoch_t *start, epoch_t *end, epoch_t oldest_map)
{
- *end = info.history.same_interval_since;
+ if (info.history.same_interval_since) {
+ *end = info.history.same_interval_since;
+ } else {
+ // PG must be imported, so let's calculate the whole range.
+ *end = osdmap_ref->get_epoch();
+ }
// Do we already have the intervals we want?
map<epoch_t,pg_interval_t>::const_iterator pif = past_intervals.begin();
epoch_t cur_epoch, end_epoch;
if (!_calc_past_interval_range(&cur_epoch, &end_epoch,
osd->get_superblock().oldest_map)) {
+ if (info.history.same_interval_since == 0)
+ info.history.same_interval_since = end_epoch;
return;
}
}
}
+ // PG import needs recalculated same_interval_since
+ if (info.history.same_interval_since == 0) {
+ assert(same_interval_since);
+ dout(10) << __func__ << " fix same_interval_since " << same_interval_since << " pg " << *this << dendl;
+ dout(10) << __func__ << " past_intervals " << past_intervals << dendl;
+ // Fix it
+ info.history.same_interval_since = same_interval_since;
+ }
+
// record our work.
dirty_info = true;
dirty_big_info = true;
dout(20) << "activate - purged_snaps " << info.purged_snaps
<< " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
snap_trimq = pool.cached_removed_snaps;
- snap_trimq.subtract(info.purged_snaps);
+ interval_set<snapid_t> intersection;
+ intersection.intersection_of(snap_trimq, info.purged_snaps);
+ if (intersection == info.purged_snaps) {
+ snap_trimq.subtract(info.purged_snaps);
+ } else {
+ dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
+ << ") is not a subset of pool.cached_removed_snaps ("
+ << pool.cached_removed_snaps << ")" << dendl;
+ snap_trimq.subtract(intersection);
+ }
dout(10) << "activate - snap_trimq " << snap_trimq << dendl;
if (!snap_trimq.empty() && is_clean())
queue_snap_trim();
// Info
child->info.history = info.history;
+ child->info.history.epoch_created = get_osdmap()->get_epoch();
child->info.purged_snaps = info.purged_snaps;
child->info.last_backfill = info.last_backfill;
child->info.stats = info.stats;
+ child->info.stats.parent_split_bits = split_bits;
info.stats.stats_invalid = true;
child->info.stats.stats_invalid = true;
child->info.last_epoch_started = info.last_epoch_started;
if (get_primary() != child->get_primary())
child->info.history.same_primary_since = get_osdmap()->get_epoch();
+ child->info.stats.up = up;
+ child->info.stats.up_primary = up_primary;
+ child->info.stats.acting = acting;
+ child->info.stats.acting_primary = primary;
+ child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
+
// History
child->past_intervals = past_intervals;
if (hoid.snap < CEPH_MAXSNAP) {
// fake nlinks for old primaries
bufferlist bl;
+ if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
+ o.nlinks = 0;
+ continue;
+ }
bl.push_back(o.attrs[OI_ATTR]);
- object_info_t oi(bl);
+ object_info_t oi;
+ try {
+ oi = bl;
+ } catch(...) {
+ o.nlinks = 0;
+ continue;
+ }
if (oi.snaps.empty()) {
// Just head
o.nlinks = 1;
assert(waiting_for_unreadable_object.empty());
pg_log.missing_add(soid, oi.version, eversion_t());
+
+ pg_log.set_last_requested(0);
+ dout(10) << __func__ << ": primary = " << primary << dendl;
+ }
+
+ if (is_ec_pg() || bad_peer == primary) {
+    // we'd better collect all shards for an EC pg, and prepare good peers as
+    // the source of the pull in the case of a replicated pg.
missing_loc.add_missing(soid, oi.version, eversion_t());
list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
for (i = ok_peers->begin();
- i != ok_peers->end();
- ++i)
+ i != ok_peers->end();
+ ++i)
missing_loc.add_location(soid, i->second);
-
- pg_log.set_last_requested(0);
- dout(10) << __func__ << ": primary = " << primary << dendl;
}
}
info.history.same_interval_since = osdmap->get_epoch();
} else {
std::stringstream debug;
+ assert(info.history.same_interval_since != 0);
boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
get_is_recoverable_predicate());
bool new_interval = pg_interval_t::check_new_interval(
return ret;
}
- const map<hobject_t, pg_missing_t::item> &get_all_missing() const {
- return needs_recovery_map;
- }
-
void clear() {
needs_recovery_map.clear();
missing_loc.clear();
}
if (auth_object.digest_present && auth_object.omap_digest_present &&
- (!auth_oi.is_data_digest() || !auth_oi.is_omap_digest())) {
+ (!auth_oi.is_data_digest() || (!auth_oi.is_omap_digest() && auth_oi.is_omap()))) {
dout(20) << __func__ << " missing digest on " << *k << dendl;
update = MAYBE;
}
if (info.last_complete > newhead)
info.last_complete = newhead;
+ if (log.rollback_info_trimmed_to > newhead)
+ log.rollback_info_trimmed_to = newhead;
+
log.index();
map<eversion_t, hobject_t> new_priors;
ObjectStore::Transaction& t, const coll_t& coll, const ghobject_t &log_oid)
{
if (is_dirty()) {
- dout(10) << "write_log with: "
+ dout(5) << "write_log with: "
<< "dirty_to: " << dirty_to
<< ", dirty_from: " << dirty_from
- << ", dirty_divergent_priors: " << dirty_divergent_priors
+ << ", dirty_divergent_priors: "
+ << (dirty_divergent_priors ? "true" : "false")
+ << ", divergent_priors: " << divergent_priors.size()
<< ", writeout_from: " << writeout_from
<< ", trimmed: " << trimmed
<< dendl;
rm->bytes_written = rm->opt.get_encoded_bytes();
- op->mark_started();
-
rm->localt.append(rm->opt);
rm->localt.register_on_commit(
parent->bless_context(
p != info.hit_set.history.end();
++p) {
if (stamp >= p->begin && stamp <= p->end) {
- oid = get_hit_set_archive_object(p->begin, p->end);
+ oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
break;
}
}
bool can_proxy_read = get_osdmap()->get_up_osd_features() &
CEPH_FEATURE_OSD_PROXY_FEATURES;
OpRequestRef promote_op;
+ bool did_proxy_read = false;
switch (pool.info.cache_mode) {
case pg_pool_t::CACHEMODE_WRITEBACK:
return true;
}
- if (can_proxy_read)
+ if (can_proxy_read) {
do_proxy_read(op);
- else
+ did_proxy_read = true;
+ } else {
promote_op = op; // for non-proxy case promote_object needs this
+ }
// Avoid duplicate promotion
if (obc.get() && obc->is_blocked()) {
promote_object(obc, missing_oid, oloc, promote_op);
} else {
// not promoting
- return false;
+ return did_proxy_read;
}
break;
}
object_info_t &coi = obc->obs.oi;
set<snapid_t> old_snaps(coi.snaps.begin(), coi.snaps.end());
- assert(old_snaps.size());
+ if (old_snaps.empty()) {
+ osd->clog->error() << __func__ << " No object info snaps for " << coid << "\n";
+ return NULL;
+ }
SnapSet& snapset = obc->ssc->snapset;
dout(10) << coid << " old_snaps " << old_snaps
<< " old snapset " << snapset << dendl;
- assert(snapset.seq);
+ if (snapset.seq == 0) {
+ osd->clog->error() << __func__ << " No snapset.seq for " << coid << "\n";
+ return NULL;
+ }
RepGather *repop = simple_repop_create(obc);
OpContext *ctx = repop->ctx;
for (p = snapset.clones.begin(); p != snapset.clones.end(); ++p)
if (*p == last)
break;
- assert(p != snapset.clones.end());
+ if (p == snapset.clones.end()) {
+ osd->clog->error() << __func__ << " Snap " << coid.snap << " not in clones" << "\n";
+ return NULL;
+ }
object_stat_sum_t delta;
delta.num_bytes -= snapset.get_clone_bytes(last);
write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
op.extent.offset, op.extent.length, true);
maybe_create_new_object(ctx);
- if (op.extent.offset == 0 && op.extent.length == oi.size)
+ if (op.extent.offset == 0 && op.extent.length >= oi.size)
obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
+ else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
+ obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
else
obs.oi.clear_data_digest();
}
return;
}
- if (cop->omap_data.length())
+ if (cop->omap_data.length() || cop->omap_header.length())
cop->results.has_omap = true;
- if (r >= 0 && pool.info.require_rollback() && cop->omap_data.length()) {
+ if (r >= 0 && pool.info.require_rollback() &&
+ (cop->omap_data.length() || cop->omap_header.length())) {
r = -EOPNOTSUPP;
}
cop->objecter_tid = 0;
ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
dout(10) << "handle_watch_timeout obc " << obc << dendl;
+ if (!is_active()) {
+ dout(10) << "handle_watch_timeout not active, no-op" << dendl;
+ return;
+ }
if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
watch->get_delayed_cb()
info.last_update.epoch = get_osdmap()->get_epoch();
const pg_missing_t &missing = pg_log.get_missing();
map<hobject_t, pg_missing_t::item>::const_iterator m =
- missing_loc.get_all_missing().begin();
+ missing_loc.get_needs_recovery().begin();
map<hobject_t, pg_missing_t::item>::const_iterator mend =
- missing_loc.get_all_missing().end();
+ missing_loc.get_needs_recovery().end();
while (m != mend) {
const hobject_t &oid(m->first);
if (!missing_loc.is_unfound(oid)) {
return hoid;
}
-hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
+hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start,
+ utime_t end,
+ bool using_gmt)
{
ostringstream ss;
- ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
+ ss << "hit_set_" << info.pgid.pgid << "_archive_";
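+  // format the stamps in GMT when the pool asks for it, so OSDs in different
+  // timezones generate identical archive object names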
+ if (using_gmt) {
+ start.gmtime(ss) << "_";
+ end.gmtime(ss);
+ } else {
+ start.localtime(ss) << "_";
+ end.localtime(ss);
+ }
hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
info.pgid.ps(), info.pgid.pool(),
cct->_conf->osd_hit_set_namespace);
for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
p != info.hit_set.history.end();
++p) {
- hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
+ hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
// Once we hit a degraded object just skip further trim
if (is_degraded_or_backfilling_object(aoid))
return;
}
- oid = get_hit_set_archive_object(start, now);
+ oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset);
-  // If the current object is degraded we skip this persist request
- if (is_degraded_or_backfilling_object(oid))
- return;
if (scrubber.write_blocked_by_scrub(oid))
return;
updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
hit_set_create();
- updated_hit_set_hist.current_info = pg_hit_set_info_t();
+ updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset);
updated_hit_set_hist.current_last_stamp = utime_t();
// fabricate an object_info_t and SnapSet
for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
assert(p != updated_hit_set_hist.history.end());
- hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
assert(!is_degraded_or_backfilling_object(oid));
continue;
}
- hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
if (is_unreadable_object(oid)) {
dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
break;
}
}
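+// true while there are still clones to walk in the current head's snapset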
+static bool doing_clones(const boost::optional<SnapSet> &snapset,
+ const vector<snapid_t>::reverse_iterator &curclone) {
+ return snapset && curclone != snapset.get().clones.rend();
+}
+
+void ReplicatedPG::log_missing(unsigned missing,
+ const boost::optional<hobject_t> &head,
+ LogChannelRef clog,
+ const spg_t &pgid,
+ const char *func,
+ const char *mode,
+ bool allow_incomplete_clones)
+{
+ assert(head);
+ if (allow_incomplete_clones) {
+ dout(20) << func << " " << mode << " " << pgid << " " << head.get()
+ << " skipped " << missing << " clone(s) in cache tier" << dendl;
+ } else {
+ clog->info() << mode << " " << pgid << " " << head.get()
+ << " " << missing << " missing clone(s)";
+ }
+}
+
+unsigned ReplicatedPG::process_clones_to(const boost::optional<hobject_t> &head,
+ const boost::optional<SnapSet> &snapset,
+ LogChannelRef clog,
+ const spg_t &pgid,
+ const char *mode,
+ bool allow_incomplete_clones,
+ boost::optional<snapid_t> target,
+ vector<snapid_t>::reverse_iterator *curclone)
+{
+ assert(head);
+ assert(snapset);
+ unsigned missing = 0;
+
+ // NOTE: clones are in descending order, thus **curclone > target test here
+ hobject_t next_clone(head.get());
+  while (doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
+ ++missing;
+ // it is okay to be missing one or more clones in a cache tier.
+ // skip higher-numbered clones in the list.
+ if (!allow_incomplete_clones) {
+ next_clone.snap = **curclone;
+ clog->error() << mode << " " << pgid << " " << head.get()
+ << " expected clone " << next_clone;
+ ++scrubber.shallow_errors;
+ }
+ // Clones are descending
+ ++(*curclone);
+ }
+ return missing;
+}
+
+/*
+ * Validate consistency of the object info and snap sets.
+ *
+ * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
+ * the comparison of the objects is against multiple snapset.clones. There are
+ * multiple clone lists and in between lists we expect head or snapdir.
+ *
+ * Example
+ *
+ * objects expected
+ * ======= =======
+ * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
+ * obj2 head head/snapdir, head ok
+ * [SnapSet clones 6 4 2 1]
+ * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
+ * obj2 snap 6 obj2 snap 6, match
+ * obj2 snap 4 obj2 snap 4, match
+ * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
+ * [Snapset clones 3 1]
+ * obj3 snap 3 obj3 snap 3 match
+ * obj3 snap 1 obj3 snap 1 match
+ * obj4 snapdir head/snapdir, snapdir ok
+ * [Snapset clones 4]
+ * EOL obj4 snap 4, (expected)
+ */
void ReplicatedPG::_scrub(
ScrubMap &scrubmap,
const map<hobject_t, pair<uint32_t, uint32_t> > &missing_digest)
bool repair = state_test(PG_STATE_REPAIR);
bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
+ boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
// traverse in reverse order.
- hobject_t head;
- SnapSet snapset;
- vector<snapid_t>::reverse_iterator curclone;
- hobject_t next_clone;
+ boost::optional<hobject_t> head;
+  boost::optional<SnapSet> snapset; // if snapset is initialized, head is too (above)
+ vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
+ bool missing = false;
bufferlist last_data;
- for (map<hobject_t,ScrubMap::object>::reverse_iterator p = scrubmap.objects.rbegin();
- p != scrubmap.objects.rend();
- ++p) {
+ for (map<hobject_t,ScrubMap::object>::reverse_iterator
+ p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
const hobject_t& soid = p->first;
object_stat_sum_t stat;
- if (soid.snap != CEPH_SNAPDIR)
+ boost::optional<object_info_t> oi;
+
+ if (!soid.is_snapdir())
stat.num_objects++;
if (soid.nspace == cct->_conf->osd_hit_set_namespace)
stat.num_objects_hit_set_archive++;
- // new snapset?
- if (soid.snap == CEPH_SNAPDIR ||
- soid.snap == CEPH_NOSNAP) {
- if (p->second.attrs.count(SS_ATTR) == 0) {
- osd->clog->error() << mode << " " << info.pgid << " " << soid
- << " no '" << SS_ATTR << "' attr";
- ++scrubber.shallow_errors;
- continue;
- }
- bufferlist bl;
- bl.push_back(p->second.attrs[SS_ATTR]);
- bufferlist::iterator blp = bl.begin();
- ::decode(snapset, blp);
-
- // did we finish the last oid?
- if (head != hobject_t() &&
- !pool.info.allow_incomplete_clones()) {
- osd->clog->error() << mode << " " << info.pgid << " " << head
- << " missing clones";
- ++scrubber.shallow_errors;
- }
-
- // what will be next?
- if (snapset.clones.empty())
- head = hobject_t(); // no clones.
- else {
- curclone = snapset.clones.rbegin();
- head = p->first;
- next_clone = hobject_t();
- dout(20) << " snapset " << snapset << dendl;
- }
+ if (soid.is_snap()) {
+ // it's a clone
+ stat.num_object_clones++;
}
// basic checks.
if (p->second.attrs.count(OI_ATTR) == 0) {
+ oi = boost::none;
osd->clog->error() << mode << " " << info.pgid << " " << soid
<< " no '" << OI_ATTR << "' attr";
++scrubber.shallow_errors;
- continue;
+ } else {
+ bufferlist bv;
+ bv.push_back(p->second.attrs[OI_ATTR]);
+ try {
+ oi = object_info_t(); // Initialize optional<> before decode into it
+ oi.get().decode(bv);
+ } catch (buffer::error& e) {
+ oi = boost::none;
+ osd->clog->error() << mode << " " << info.pgid << " " << soid
+ << " can't decode '" << OI_ATTR << "' attr " << e.what();
+ ++scrubber.shallow_errors;
+ }
}
- bufferlist bv;
- bv.push_back(p->second.attrs[OI_ATTR]);
- object_info_t oi(bv);
- if (pgbackend->be_get_ondisk_size(oi.size) != p->second.size) {
- osd->clog->error() << mode << " " << info.pgid << " " << soid
- << " on disk size (" << p->second.size
- << ") does not match object info size ("
- << oi.size << ") adjusted for ondisk to ("
- << pgbackend->be_get_ondisk_size(oi.size)
- << ")";
- ++scrubber.shallow_errors;
- }
+ if (oi) {
+ if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
+ osd->clog->error() << mode << " " << info.pgid << " " << soid
+ << " on disk size (" << p->second.size
+ << ") does not match object info size ("
+ << oi->size << ") adjusted for ondisk to ("
+ << pgbackend->be_get_ondisk_size(oi->size)
+ << ")";
+ ++scrubber.shallow_errors;
+ }
- dout(20) << mode << " " << soid << " " << oi << dendl;
+ dout(20) << mode << " " << soid << " " << oi.get() << dendl;
- if (soid.is_snap()) {
- stat.num_bytes += snapset.get_clone_bytes(soid.snap);
- } else {
- stat.num_bytes += oi.size;
+ // A clone num_bytes will be added later when we have snapset
+ if (!soid.is_snap()) {
+ stat.num_bytes += oi->size;
+ }
+ if (soid.nspace == cct->_conf->osd_hit_set_namespace)
+ stat.num_bytes_hit_set_archive += oi->size;
+
+ if (!soid.is_snapdir()) {
+ if (oi->is_dirty())
+ ++stat.num_objects_dirty;
+ if (oi->is_whiteout())
+ ++stat.num_whiteouts;
+ if (oi->is_omap())
+ ++stat.num_objects_omap;
+ }
}
- if (soid.nspace == cct->_conf->osd_hit_set_namespace)
- stat.num_bytes_hit_set_archive += oi.size;
-
- if (!soid.is_snapdir()) {
- if (oi.is_dirty())
- ++stat.num_objects_dirty;
- if (oi.is_whiteout())
- ++stat.num_whiteouts;
- if (oi.is_omap())
- ++stat.num_objects_omap;
- }
-
- if (!next_clone.is_min() && next_clone != soid &&
- pool.info.allow_incomplete_clones()) {
- // it is okay to be missing one or more clones in a cache tier.
- // skip higher-numbered clones in the list.
- while (curclone != snapset.clones.rend() &&
- soid.snap < *curclone)
- ++curclone;
- if (curclone != snapset.clones.rend() &&
- soid.snap == *curclone) {
- dout(20) << __func__ << " skipped some clones in cache tier" << dendl;
- next_clone.snap = *curclone;
- }
- if (curclone == snapset.clones.rend() ||
- soid.snap == CEPH_NOSNAP) {
- dout(20) << __func__ << " skipped remaining clones in cache tier"
- << dendl;
- next_clone = hobject_t();
- head = hobject_t();
+
+ // Check for any problems while processing clones
+ if (doing_clones(snapset, curclone)) {
+ boost::optional<snapid_t> target;
+ // Expecting an object with snap for current head
+ if (soid.has_snapset() || soid.get_head() != head->get_head()) {
+
+ dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
+ << soid << " while processing " << head.get() << dendl;
+
+ target = all_clones;
+ } else {
+ assert(soid.is_snap());
+ target = soid.snap;
}
+
+ // Log any clones we were expecting to be there up to target
+    // This will set missing, but will be a no-op if soid.snap == *curclone.
+ missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
+ pool.info.allow_incomplete_clones(), target, &curclone);
}
- if (!next_clone.is_min() && next_clone != soid) {
+ bool expected;
+ // Check doing_clones() again in case we ran process_clones_to()
+ if (doing_clones(snapset, curclone)) {
+ // A head/snapdir would have processed all clones above
+ // or all greater than *curclone.
+ assert(soid.is_snap() && *curclone <= soid.snap);
+
+ // After processing above clone snap should match the expected curclone
+ expected = (*curclone == soid.snap);
+ } else {
+ // If we aren't doing clones any longer, then expecting head/snapdir
+ expected = soid.has_snapset();
+ }
+ if (!expected) {
+ // If we couldn't read the head's snapset, then just ignore clones and
+ // don't count as an error.
+ if (head && !snapset) {
+ osd->clog->info() << mode << " " << info.pgid << " " << soid
+ << " clone ignored due to missing snapset";
+ continue;
+ }
osd->clog->error() << mode << " " << info.pgid << " " << soid
- << " expected clone " << next_clone;
+ << " is an unexpected clone";
++scrubber.shallow_errors;
+ continue;
}
- if (soid.snap == CEPH_NOSNAP || soid.snap == CEPH_SNAPDIR) {
- if (soid.snap == CEPH_NOSNAP && !snapset.head_exists) {
- osd->clog->error() << mode << " " << info.pgid << " " << soid
- << " snapset.head_exists=false, but head exists";
- ++scrubber.shallow_errors;
+ // new snapset?
+ if (soid.has_snapset()) {
+
+ if (missing) {
+ log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
+ pool.info.allow_incomplete_clones());
}
- if (soid.snap == CEPH_SNAPDIR && snapset.head_exists) {
+
+ // Set this as a new head object
+ head = soid;
+ missing = false;
+
+ dout(20) << __func__ << " " << mode << " new head " << head << dendl;
+
+ if (p->second.attrs.count(SS_ATTR) == 0) {
osd->clog->error() << mode << " " << info.pgid << " " << soid
- << " snapset.head_exists=true, but snapdir exists";
+ << " no '" << SS_ATTR << "' attr";
++scrubber.shallow_errors;
- }
- if (curclone == snapset.clones.rend()) {
- next_clone = hobject_t();
+ snapset = boost::none;
} else {
- next_clone = soid;
- next_clone.snap = *curclone;
- }
- } else if (soid.snap) {
- // it's a clone
- stat.num_object_clones++;
-
- if (head == hobject_t()) {
- osd->clog->error() << mode << " " << info.pgid << " " << soid
- << " found clone without head";
- ++scrubber.shallow_errors;
- continue;
+ bufferlist bl;
+ bl.push_back(p->second.attrs[SS_ATTR]);
+ bufferlist::iterator blp = bl.begin();
+ try {
+ snapset = SnapSet(); // Initialize optional<> before decoding into it
+ ::decode(snapset.get(), blp);
+ } catch (buffer::error& e) {
+ snapset = boost::none;
+ osd->clog->error() << mode << " " << info.pgid << " " << soid
+ << " can't decode '" << SS_ATTR << "' attr " << e.what();
+ ++scrubber.shallow_errors;
+ }
}
- if (soid.snap != *curclone) {
- continue; // we warn above. we could do better here...
+ if (snapset) {
+ // what will be next?
+ curclone = snapset->clones.rbegin();
+
+ if (!snapset->clones.empty()) {
+ dout(20) << " snapset " << snapset.get() << dendl;
+ if (snapset->seq == 0) {
+ osd->clog->error() << mode << " " << info.pgid << " " << soid
+              << " snapset.seq not set";
+ ++scrubber.shallow_errors;
+ }
+ }
+
+ if (soid.is_head() && !snapset->head_exists) {
+ osd->clog->error() << mode << " " << info.pgid << " " << soid
+ << " snapset.head_exists=false, but head exists";
+ ++scrubber.shallow_errors;
+ }
+ if (soid.is_snapdir() && snapset->head_exists) {
+ osd->clog->error() << mode << " " << info.pgid << " " << soid
+ << " snapset.head_exists=true, but snapdir exists";
+ ++scrubber.shallow_errors;
+ }
}
+ } else {
+ assert(soid.is_snap());
+ assert(head);
+ assert(snapset);
+ assert(soid.snap == *curclone);
- if (oi.size != snapset.clone_size[*curclone]) {
+ dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
+
+ if (snapset->clone_size.count(soid.snap) == 0) {
osd->clog->error() << mode << " " << info.pgid << " " << soid
- << " size " << oi.size << " != clone_size "
- << snapset.clone_size[*curclone];
+ << " is missing in clone_size";
++scrubber.shallow_errors;
- }
+ } else {
+ if (oi && oi->size != snapset->clone_size[soid.snap]) {
+ osd->clog->error() << mode << " " << info.pgid << " " << soid
+ << " size " << oi->size << " != clone_size "
+ << snapset->clone_size[*curclone];
+ ++scrubber.shallow_errors;
+ }
- // verify overlap?
- // ...
+ if (snapset->clone_overlap.count(soid.snap) == 0) {
+ osd->clog->error() << mode << " " << info.pgid << " " << soid
+ << " is missing in clone_overlap";
+ ++scrubber.shallow_errors;
+ } else {
+ // This checking is based on get_clone_bytes(). The first 2 asserts
+ // can't happen because we know we have a clone_size and
+ // a clone_overlap. Now we check that the interval_set won't
+ // cause the last assert.
+ uint64_t size = snapset->clone_size.find(soid.snap)->second;
+ const interval_set<uint64_t> &overlap =
+ snapset->clone_overlap.find(soid.snap)->second;
+ bool bad_interval_set = false;
+ for (interval_set<uint64_t>::const_iterator i = overlap.begin();
+ i != overlap.end(); ++i) {
+ if (size < i.get_len()) {
+ bad_interval_set = true;
+ break;
+ }
+ size -= i.get_len();
+ }
- // what's next?
- if (curclone != snapset.clones.rend()) {
- ++curclone;
- }
- if (curclone == snapset.clones.rend()) {
- head = hobject_t();
- next_clone = hobject_t();
- } else {
- next_clone.snap = *curclone;
+ if (bad_interval_set) {
+ osd->clog->error() << mode << " " << info.pgid << " " << soid
+ << " bad interval_set in clone_overlap";
+ ++scrubber.shallow_errors;
+ } else {
+ stat.num_bytes += snapset->get_clone_bytes(soid.snap);
+ }
+ }
}
- } else {
- // it's unversioned.
- next_clone = hobject_t();
+ // what's next?
+ ++curclone;
}
scrub_cstat.add(stat);
}
- if (!next_clone.is_min() &&
- !pool.info.allow_incomplete_clones()) {
- osd->clog->error() << mode << " " << info.pgid
- << " expected clone " << next_clone;
- ++scrubber.shallow_errors;
+ if (doing_clones(snapset, curclone)) {
+ dout(10) << __func__ << " " << mode << " " << info.pgid
+ << " No more objects while processing " << head.get() << dendl;
+
+ missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
+ pool.info.allow_incomplete_clones(), all_clones, &curclone);
+
+ }
+ // There could be missing found by the test above or even
+ // before dropping out of the loop for the last head.
+ if (missing) {
+ log_missing(missing, head, osd->clog, info.pgid, __func__,
+ mode, pool.info.allow_incomplete_clones());
}
for (map<hobject_t,pair<uint32_t,uint32_t> >::const_iterator p =
simple_repop_submit(repop);
++scrubber.num_digest_updates_pending;
}
-
+
dout(10) << "_scrub (" << mode << ") finish" << dendl;
}
void hit_set_in_memory_trim(); ///< discard old in memory HitSets
hobject_t get_hit_set_current_object(utime_t stamp);
- hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
+ hobject_t get_hit_set_archive_object(utime_t start,
+ utime_t end,
+ bool using_gmt);
// agent
boost::scoped_ptr<TierAgentState> agent_state;
uint64_t temp_seq; ///< last id for naming temp objects
coll_t get_temp_coll(ObjectStore::Transaction *t);
hobject_t generate_temp_object(); ///< generate a new temp object name
+ void log_missing(unsigned missing,
+ const boost::optional<hobject_t> &head,
+ LogChannelRef clog,
+ const spg_t &pgid,
+ const char *func,
+ const char *mode,
+ bool allow_incomplete_clones);
+ unsigned process_clones_to(const boost::optional<hobject_t> &head,
+ const boost::optional<SnapSet> &snapset,
+ LogChannelRef clog,
+ const spg_t &pgid,
+ const char *mode,
+ bool allow_incomplete_clones,
+ boost::optional<snapid_t> target,
+ vector<snapid_t>::reverse_iterator *curclone);
+
public:
void get_colls(list<coll_t> *out) {
out->push_back(coll);
f->close_section(); // hit_set_params
f->dump_unsigned("hit_set_period", hit_set_period);
f->dump_unsigned("hit_set_count", hit_set_count);
+ f->dump_bool("use_gmt_hitset", use_gmt_hitset);
f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
f->dump_unsigned("stripe_width", get_stripe_width());
f->dump_unsigned("expected_num_objects", expected_num_objects);
return;
}
- ENCODE_START(17, 5, bl);
+ if ((features & CEPH_FEATURE_OSD_HITSET_GMT) == 0) {
+    // CEPH_FEATURE_OSD_HITSET_GMT requires pg_pool_t v21, which has
+    // use_gmt_hitset and two fields added before v21. it's backward
+    // compatible, but re-encoding the same osdmap with different ceph
+    // versions causes a CRC mismatch on the OSD side, and tracker #12410
+    // prevents the monitor from sending the single full map requested
+    // by the OSD. so we need a way to encode pg_pool_t the same old way.
+ ENCODE_START(17, 5, bl);
+ ::encode(type, bl);
+ ::encode(size, bl);
+ ::encode(crush_ruleset, bl);
+ ::encode(object_hash, bl);
+ ::encode(pg_num, bl);
+ ::encode(pgp_num, bl);
+ __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
+ ::encode(lpg_num, bl);
+ ::encode(lpgp_num, bl);
+ ::encode(last_change, bl);
+ ::encode(snap_seq, bl);
+ ::encode(snap_epoch, bl);
+ ::encode(snaps, bl, features);
+ ::encode(removed_snaps, bl);
+ ::encode(auid, bl);
+ ::encode(flags, bl);
+ ::encode(crash_replay_interval, bl);
+ ::encode(min_size, bl);
+ ::encode(quota_max_bytes, bl);
+ ::encode(quota_max_objects, bl);
+ ::encode(tiers, bl);
+ ::encode(tier_of, bl);
+ __u8 c = cache_mode;
+ ::encode(c, bl);
+ ::encode(read_tier, bl);
+ ::encode(write_tier, bl);
+ ::encode(properties, bl);
+ ::encode(hit_set_params, bl);
+ ::encode(hit_set_period, bl);
+ ::encode(hit_set_count, bl);
+ ::encode(stripe_width, bl);
+ ::encode(target_max_bytes, bl);
+ ::encode(target_max_objects, bl);
+ ::encode(cache_target_dirty_ratio_micro, bl);
+ ::encode(cache_target_full_ratio_micro, bl);
+ ::encode(cache_min_flush_age, bl);
+ ::encode(cache_min_evict_age, bl);
+ ::encode(erasure_code_profile, bl);
+ ::encode(last_force_op_resend, bl);
+ ::encode(min_read_recency_for_promote, bl);
+ ::encode(expected_num_objects, bl);
+ ENCODE_FINISH(bl);
+ return;
+ }
+
+ ENCODE_START(21, 5, bl);
::encode(type, bl);
::encode(size, bl);
::encode(crush_ruleset, bl);
::encode(last_force_op_resend, bl);
::encode(min_read_recency_for_promote, bl);
::encode(expected_num_objects, bl);
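+  // fixed placeholder values (presumably the defaults) for the two fields
+  // added before v21; the decoder skips them as dummies, they are encoded
+  // here only to keep the v21 layout consistent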
+ ::encode(uint32_t(.6 * 1e6), bl);
+ ::encode(uint32_t(1), bl);
+ ::encode(use_gmt_hitset, bl);
ENCODE_FINISH(bl);
}
void pg_pool_t::decode(bufferlist::iterator& bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(17, 5, 5, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(21, 5, 5, bl);
::decode(type, bl);
::decode(size, bl);
::decode(crush_ruleset, bl);
} else {
expected_num_objects = 0;
}
+ if (struct_v >= 19) {
+ uint32_t dummy;
+ ::decode(dummy, bl);
+ }
+ if (struct_v >= 20) {
+ uint32_t dummy;
+ ::decode(dummy, bl);
+ }
+ if (struct_v >= 21) {
+ ::decode(use_gmt_hitset, bl);
+ } else {
+ use_gmt_hitset = false;
+ }
DECODE_FINISH(bl);
calc_pg_masks();
}
pg_interval_t& i = (*past_intervals)[same_interval_since];
i.first = same_interval_since;
i.last = osdmap->get_epoch() - 1;
+ assert(i.first <= i.last);
i.acting = old_acting;
i.up = old_up;
i.primary = old_acting_primary;
for (list<pg_log_entry_t>::const_iterator i = in.log.begin();
i != in.log.end(); ++i) {
+ // Reject pg log entries for temporary objects
+ if (i->soid.is_temp()) {
+ reject.log.push_back(*i);
+ continue;
+ }
+
if (i->soid.nspace != hit_set_namespace) {
object_t oid = i->soid.oid;
object_locator_t loc(i->soid);
void pg_hit_set_info_t::encode(bufferlist& bl) const
{
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(begin, bl);
::encode(end, bl);
::encode(version, bl);
+ ::encode(using_gmt, bl);
ENCODE_FINISH(bl);
}
void pg_hit_set_info_t::decode(bufferlist::iterator& p)
{
- DECODE_START(1, p);
+ DECODE_START(2, p);
::decode(begin, p);
::decode(end, p);
::decode(version, p);
+ if (struct_v >= 2) {
+ ::decode(using_gmt, p);
+ } else {
+ using_gmt = false;
+ }
DECODE_FINISH(p);
}
f->dump_stream("begin") << begin;
f->dump_stream("end") << end;
f->dump_stream("version") << version;
+ f->dump_stream("using_gmt") << using_gmt;
}
void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
uint32_t hit_set_count; ///< number of periods to retain
+ bool use_gmt_hitset; ///< use gmt to name the hitset archive object
uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote
uint32_t stripe_width; ///< erasure coded stripe size in bytes
hit_set_params(),
hit_set_period(0),
hit_set_count(0),
+ use_gmt_hitset(true),
min_read_recency_for_promote(0),
stripe_width(0),
expected_num_objects(0)
struct pg_hit_set_info_t {
utime_t begin, end; ///< time interval
eversion_t version; ///< version this HitSet object was written
-
- pg_hit_set_info_t() {}
- pg_hit_set_info_t(utime_t b)
- : begin(b) {}
+ bool using_gmt; ///< use gmt for creating the hit_set archive object name
+ pg_hit_set_info_t(bool using_gmt = true)
+ : using_gmt(using_gmt) {}
+ pg_hit_set_info_t(utime_t b, bool using_gmt)
+ : begin(b), using_gmt(using_gmt) {}
void encode(bufferlist &bl) const;
void decode(bufferlist::iterator &bl);
object_info_t(bufferlist& bl) {
decode(bl);
}
+  object_info_t& operator=(bufferlist& bl) {
+    decode(bl);
+    return *this;
+  }
};
WRITE_CLASS_ENCODER(object_info_t)
<< " age " << age << dendl;
if (info->last_error)
return info->last_error;
- return age.to_msec();
+ // return a safe upper bound (we are truncating to ms)
+ return 1 + age.to_msec();
}
void Objecter::linger_cancel(LingerOp *info)
info->notify_id != m->notify_id) {
ldout(cct, 10) << __func__ << " reply notify " << m->notify_id
<< " != " << info->notify_id << ", ignoring" << dendl;
- } else {
- assert(info->on_notify_finish);
+ } else if (info->on_notify_finish) {
info->notify_result_bl->claim(m->get_data());
info->on_notify_finish->complete(m->return_code);
+
+ // if we race with reconnect we might get a second notify; only
+ // notify the caller once!
+ info->on_notify_finish = NULL;
}
} else {
finisher->queue(new C_DoWatchNotify(this, info, m));
cerr << " replicalog get get replica metadata log entry\n";
cerr << " replicalog update update replica metadata log entry\n";
cerr << " replicalog delete delete replica metadata log entry\n";
- cout << " orphans find init and run search for leaked rados objects\n";
- cout << " orphans finish clean up search for leaked rados objects\n";
+ cerr << " orphans find init and run search for leaked rados objects\n";
+ cerr << " orphans finish clean up search for leaked rados objects\n";
cerr << "options:\n";
cerr << " --uid=<id> user id\n";
cerr << " --subuser=<name> subuser name\n";
cerr << " --categories=<list> comma separated list of categories, used in usage show\n";
cerr << " --caps=<caps> list of caps (e.g., \"usage=read, write; user=read\"\n";
cerr << " --yes-i-really-mean-it required for certain operations\n";
- cerr << " --reset-regions reset regionmap when regionmap update";
+ cerr << " --reset-regions reset regionmap when regionmap update\n";
+ cerr << " --bypass-gc when specified with bucket deletion, triggers\n";
+ cerr << " object deletions by not involving GC\n";
+ cerr << " --inconsistent-index when specified with bucket deletion and bypass-gc set to true,\n";
+ cerr << " ignores bucket index consistency\n";
cerr << "\n";
cerr << "<date> := \"YYYY-MM-DD[ hh:mm:ss]\"\n";
cerr << "\nQuota options:\n";
cerr << " --max-objects specify max objects (negative value to disable)\n";
cerr << " --max-size specify max size (in bytes, negative value to disable)\n";
cerr << " --quota-scope scope of quota (bucket, user)\n";
- cout << "\nOrphans search options:\n";
- cout << " --pool data pool to scan for leaked rados objects in\n";
- cout << " --num-shards num of shards to use for keeping the temporary scan info\n";
+ cerr << "\nOrphans search options:\n";
+ cerr << " --pool data pool to scan for leaked rados objects in\n";
+ cerr << " --num-shards num of shards to use for keeping the temporary scan info\n";
cerr << "\n";
generic_client_usage();
}
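Note that --bypass-gc only takes effect when objects are also purged (see the -EINVAL path in RGWBucket::remove further below); a typical invocation would be radosgw-admin bucket rm --bucket=mybucket --purge-objects --bypass-gc.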
int max_concurrent_ios = 32;
uint64_t orphan_stale_secs = (24 * 3600);
+ int bypass_gc = false;
+ int inconsistent_index = false;
+
std::string val;
std::ostringstream errs;
string err;
// do nothing
} else if (ceph_argparse_binary_flag(args, i, &reset_regions, NULL, "--reset-regions", (char*)NULL)) {
// do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &bypass_gc, NULL, "--bypass-gc", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &inconsistent_index, NULL, "--inconsistent-index", (char*)NULL)) {
+ // do nothing
} else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) {
caps = val;
} else if (ceph_argparse_witharg(args, i, &val, "-i", "--infile", (char*)NULL)) {
bucket_op.set_object(object);
bucket_op.set_check_objects(check_objects);
bucket_op.set_delete_children(delete_child_objects);
+ bucket_op.set_fix_index(fix);
+ bucket_op.set_max_aio(max_concurrent_ios);
// required to gather errors from operations
std::string err_msg;
}
if (opt_cmd == OPT_BUCKET_RM) {
- RGWBucketAdminOp::remove_bucket(store, bucket_op);
+    RGWBucketAdminOp::remove_bucket(store, bucket_op, bypass_gc, !inconsistent_index);
}
if (opt_cmd == OPT_GC_LIST) {
#include "rgw_user.h"
#include "rgw_string.h"
+#include "include/rados/librados.hpp"
// until everything is moved from rgw_common
#include "rgw_common.h"
ret = store->get_bucket_entrypoint_info(obj_ctx, bucket_name, ep, &ot, NULL, &attrs);
if (ret < 0 && ret != -ENOENT) {
ldout(store->ctx(), 0) << "ERROR: store->get_bucket_entrypoint_info() returned " << ret << dendl;
- } else if (ret >= 0 && ep.linked && ep.owner != user_id) {
- ldout(store->ctx(), 0) << "can't link bucket, already linked to a different user: " << ep.owner << dendl;
- return -EINVAL;
}
}
map<RGWObjCategory, RGWStorageStats> stats;
std::vector<RGWObjEnt> objs;
map<string, bool> common_prefixes;
- rgw_obj obj;
RGWBucketInfo info;
- bufferlist bl;
RGWObjectCtx obj_ctx(store);
string bucket_ver, master_ver;
if (ret < 0)
return ret;
- obj.bucket = bucket;
-
ret = store->get_bucket_info(obj_ctx, bucket.name, info, NULL);
if (ret < 0)
return ret;
while (!objs.empty()) {
std::vector<RGWObjEnt>::iterator it = objs.begin();
- for (it = objs.begin(); it != objs.end(); ++it) {
+ for (; it != objs.end(); ++it) {
ret = rgw_remove_object(store, info, bucket, (*it).key);
if (ret < 0)
return ret;
}
}
+  ret = rgw_bucket_sync_user_stats(store, bucket.name);
+  if (ret < 0) {
+     dout(1) << "WARNING: failed to sync user stats before bucket delete. ret=" << ret << dendl;
+ }
+
RGWObjVersionTracker objv_tracker;
ret = store->delete_bucket(bucket, objv_tracker);
return ret;
}
+static int aio_wait(librados::AioCompletion *handle)
+{
+  librados::AioCompletion *c = handle;
+ c->wait_for_complete();
+ int ret = c->get_return_value();
+ c->release();
+ return ret;
+}
+
+static int drain_handles(list<librados::AioCompletion *>& pending)
+{
+ int ret = 0;
+ while (!pending.empty()) {
+ librados::AioCompletion *handle = pending.front();
+ pending.pop_front();
+ int r = aio_wait(handle);
+ if (r < 0) {
+ ret = r;
+ }
+ }
+ return ret;
+}
+
+int rgw_remove_bucket_bypass_gc(RGWRados *store, rgw_bucket& bucket,
+ int concurrent_max, bool keep_index_consistent)
+{
+ int ret;
+ map<RGWObjCategory, RGWStorageStats> stats;
+ std::vector<RGWObjEnt> objs;
+ map<string, bool> common_prefixes;
+ RGWBucketInfo info;
+ RGWObjectCtx obj_ctx(store);
+
+ string bucket_ver, master_ver;
+
+ ret = store->get_bucket_stats(bucket, &bucket_ver, &master_ver, stats, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = store->get_bucket_info(obj_ctx, bucket.name, info, NULL);
+ if (ret < 0)
+ return ret;
+
+ RGWRados::Bucket target(store, info.bucket);
+ RGWRados::Bucket::List list_op(&target);
+
+ list_op.params.list_versions = true;
+
+ std::list<librados::AioCompletion*> handles;
+
+ int max = 1000;
+ int max_aio = concurrent_max;
+ ret = list_op.list_objects(max, &objs, &common_prefixes, NULL);
+ if (ret < 0)
+ return ret;
+
+ while (!objs.empty()) {
+ std::vector<RGWObjEnt>::iterator it = objs.begin();
+ for (; it != objs.end(); ++it) {
+ RGWObjState *astate = NULL;
+ rgw_obj obj(bucket, (*it).key.name);
+ obj.set_instance((*it).key.instance);
+
+ ret = store->get_obj_state(&obj_ctx, obj, &astate, NULL);
+ if (ret == -ENOENT) {
+ dout(1) << "WARNING: cannot find obj state for obj " << obj.get_object() << dendl;
+ continue;
+ }
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: get obj state returned with error " << ret << dendl;
+ return ret;
+ }
+
+ if (astate->has_manifest) {
+ rgw_obj head_obj;
+ RGWObjManifest& manifest = astate->manifest;
+ RGWObjManifest::obj_iterator miter = manifest.obj_begin();
+
+ if (miter.get_location().ns.empty()) {
+ head_obj = miter.get_location();
+ }
+
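+      // walk the manifest's shadow objects, spending one unit of the max_aio
+      // budget per delete; when the budget runs out the handles are drained
+      // below and the budget refilled to concurrent_max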
+ for (; miter != manifest.obj_end() && max_aio--; ++miter) {
+ if (!max_aio) {
+ ret = drain_handles(handles);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+ return ret;
+ }
+ max_aio = concurrent_max;
+ }
+
+ rgw_obj last_obj = miter.get_location();
+ if (last_obj == head_obj) {
+ // have the head obj deleted at the end
+ continue;
+ }
+
+ ret = store->delete_obj_aio(last_obj, bucket, info, astate, handles, keep_index_consistent);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: delete obj aio failed with " << ret << dendl;
+ return ret;
+ }
+ } // for all shadow objs
+
+ ret = store->delete_obj_aio(head_obj, bucket, info, astate, handles, keep_index_consistent);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: delete obj aio failed with " << ret << dendl;
+ return ret;
+ }
+ }
+
+ if (!max_aio) {
+ ret = drain_handles(handles);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+ return ret;
+ }
+ max_aio = concurrent_max;
+ }
+ } // for all RGW objects
+ objs.clear();
+
+ ret = list_op.list_objects(max, &objs, &common_prefixes, NULL);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = drain_handles(handles);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+ return ret;
+ }
+
+ ret = rgw_bucket_sync_user_stats(store, bucket.name);
+ if (ret < 0) {
+ dout(1) << "WARNING: failed sync user stats before bucket delete. ret=" << ret << dendl;
+ }
+
+ RGWObjVersionTracker objv_tracker;
+
+ ret = rgw_bucket_delete_bucket_obj(store, bucket.name, objv_tracker);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: could not remove bucket " << bucket.name << "with ret as " << ret << dendl;
+ return ret;
+ }
+
+ if (!store->is_syncing_bucket_meta(bucket)) {
+ RGWObjVersionTracker objv_tracker;
+ string entry;
+ store->get_bucket_instance_entry(bucket, entry);
+ ret = rgw_bucket_instance_remove_entry(store, entry, &objv_tracker);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: could not remove bucket instance entry" << bucket.name << "with ret as " << ret << dendl;
+ return ret;
+ }
+ }
+
+ ret = rgw_unlink_bucket(store, info.owner, bucket.name, false);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: unable to remove user bucket information" << dendl;
+ }
+
+ return ret;
+}
+
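rgw_remove_bucket_bypass_gc() above bounds the number of in-flight deletes: it spends a budget of concurrent_max AIO handles, drains them all once the budget is exhausted, then refills it. A self-contained sketch of that throttling pattern, with std::future standing in for librados::AioCompletion (names illustrative, not the RGW API itself):

#include <future>
#include <list>

// Wait for every pending operation; remember the last error but keep
// draining, mirroring drain_handles() above.
static int drain(std::list<std::future<int>>& pending) {
  int ret = 0;
  while (!pending.empty()) {
    int r = pending.front().get();
    pending.pop_front();
    if (r < 0)
      ret = r;
  }
  return ret;
}

int delete_all(int num_objs, int concurrent_max) {
  std::list<std::future<int>> pending;
  int budget = concurrent_max;
  for (int i = 0; i < num_objs; ++i) {
    if (budget == 0) {                 // window full: drain, then refill
      int r = drain(pending);
      if (r < 0)
        return r;
      budget = concurrent_max;
    }
    // issue one async "delete"; a real caller would start an AIO here
    pending.push_back(std::async(std::launch::async, [] { return 0; }));
    --budget;
  }
  return drain(pending);               // wait for the tail of the window
}

int main() { return delete_all(100, 8) < 0 ? 1 : 0; }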
int rgw_bucket_delete_bucket_obj(RGWRados *store, string& bucket_name, RGWObjVersionTracker& objv_tracker)
{
return store->meta_mgr->remove_entry(bucket_meta_handler, bucket_name, &objv_tracker);
return -EIO;
}
- r = rgw_unlink_bucket(store, owner.get_id(), bucket.name);
+ r = rgw_unlink_bucket(store, owner.get_id(), bucket.name, false);
if (r < 0) {
set_err_msg(err_msg, "could not unlink policy from user " + owner.get_id());
return r;
if (r < 0)
return r;
+ RGWAccessControlPolicy policy_instance;
+ policy_instance.create_default(user_info.user_id, display_name);
+ aclbl.clear();
+ policy_instance.encode(aclbl);
+
+ string oid_bucket_instance = RGW_BUCKET_INSTANCE_MD_PREFIX + key;
+ rgw_bucket bucket_instance;
+ bucket_instance.name = oid_bucket_instance;
+ rgw_obj obj_bucket_instance(bucket_instance, no_oid);
+  r = store->set_attr(NULL, obj_bucket_instance, RGW_ATTR_ACL, aclbl, &objv_tracker);
+  if (r < 0)
+    return r;
+
r = rgw_link_bucket(store, user_info.user_id, bucket, 0);
if (r < 0)
return r;
return r;
}
-int RGWBucket::remove(RGWBucketAdminOpState& op_state, std::string *err_msg)
+int RGWBucket::remove(RGWBucketAdminOpState& op_state, bool bypass_gc,
+ bool keep_index_consistent, std::string *err_msg)
{
bool delete_children = op_state.will_delete_children();
rgw_bucket bucket = op_state.get_bucket();
+ int ret;
+
+ if (bypass_gc) {
+ if (delete_children) {
+ ret = rgw_remove_bucket_bypass_gc(store, bucket, op_state.get_max_aio(), keep_index_consistent);
+ } else {
+ set_err_msg(err_msg, "purge objects should be set for gc to be bypassed");
+ return -EINVAL;
+ }
+ } else {
+ ret = rgw_remove_bucket(store, bucket_info.owner, bucket, delete_children);
+ }
- int ret = rgw_remove_bucket(store, bucket_info.owner, bucket, delete_children);
if (ret < 0) {
set_err_msg(err_msg, "unable to remove bucket" + cpp_strerror(-ret));
return ret;
return 0;
}
-int RGWBucketAdminOp::remove_bucket(RGWRados *store, RGWBucketAdminOpState& op_state)
+int RGWBucketAdminOp::remove_bucket(RGWRados *store, RGWBucketAdminOpState& op_state,
+ bool bypass_gc, bool keep_index_consistent)
{
RGWBucket bucket;
if (ret < 0)
return ret;
- return bucket.remove(op_state);
+ return bucket.remove(op_state, bypass_gc, keep_index_consistent);
}
int RGWBucketAdminOp::remove_object(RGWRados *store, RGWBucketAdminOpState& op_state)
int put(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker,
time_t mtime, JSONObj *obj, sync_type_t sync_type) {
RGWBucketEntryPoint be, old_be;
- decode_json_obj(be, obj);
+ try {
+ decode_json_obj(be, obj);
+ } catch (JSONDecoder::err& e) {
+ return -EINVAL;
+ }
time_t orig_mtime;
map<string, bufferlist> attrs;
int put(RGWRados *store, string& oid, RGWObjVersionTracker& objv_tracker,
time_t mtime, JSONObj *obj, sync_type_t sync_type) {
RGWBucketCompleteInfo bci, old_bci;
- decode_json_obj(bci, obj);
+ try {
+ decode_json_obj(bci, obj);
+ } catch (JSONDecoder::err& e) {
+ return -EINVAL;
+ }
time_t orig_mtime;
RGWObjectCtx obj_ctx(store);
extern int rgw_remove_object(RGWRados *store, RGWBucketInfo& bucket_info, rgw_bucket& bucket, rgw_obj_key& key);
extern int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& bucket, bool delete_children);
+extern int rgw_remove_bucket_bypass_gc(RGWRados *store, rgw_bucket& bucket, int concurrent_max, bool keep_index_consistent);
extern int rgw_bucket_set_attrs(RGWRados *store, RGWBucketInfo& bucket_info,
map<string, bufferlist>& attrs,
bool fix_index;
bool delete_child_objects;
bool bucket_stored;
+ int max_aio;
rgw_bucket bucket;
void set_fix_index(bool value) { fix_index = value; }
void set_delete_children(bool value) { delete_child_objects = value; }
+ void set_max_aio(int value) { max_aio = value; }
+
void set_user_id(std::string& user_id) {
if (!user_id.empty())
uid = user_id;
bool is_user_op() { return !uid.empty(); }
bool is_system_op() { return uid.empty(); }
bool has_bucket_stored() { return bucket_stored; }
+ int get_max_aio() { return max_aio; }
RGWBucketAdminOpState() : list_buckets(false), stat_buckets(false), check_objects(false),
fix_index(false), delete_child_objects(false),
map<RGWObjCategory, RGWStorageStats>& calculated_stats,
std::string *err_msg = NULL);
- int remove(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
+ int remove(RGWBucketAdminOpState& op_state, bool bypass_gc = false, bool keep_index_consistent = true, std::string *err_msg = NULL);
int link(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
int unlink(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
static int check_index(RGWRados *store, RGWBucketAdminOpState& op_state,
RGWFormatterFlusher& flusher);
- static int remove_bucket(RGWRados *store, RGWBucketAdminOpState& op_state);
+ static int remove_bucket(RGWRados *store, RGWBucketAdminOpState& op_state, bool bypass_gc = false, bool keep_index_consistent = true);
static int remove_object(RGWRados *store, RGWBucketAdminOpState& op_state);
static int info(RGWRados *store, RGWBucketAdminOpState& op_state, RGWFormatterFlusher& flusher);
};
return r;
}
-RGWMongoose::RGWMongoose(mg_connection *_conn, int _port) : conn(_conn), port(_port), header_done(false), sent_header(false), has_content_length(false),
+RGWMongoose::RGWMongoose(mg_connection *_conn, int _port) : conn(_conn), port(_port), status_num(0), header_done(false),
+ sent_header(false), has_content_length(false),
explicit_keepalive(false), explicit_conn_close(false)
{
}
{
if (!sent_header) {
if (!has_content_length) {
+
header_done = false; /* let's go back to writing the header */
- if (0 && data.length() == 0) {
+      /*
+       * Status 204 must not include a Content-Length header;
+       * RFC 7230 says so.
+       *
+       * The same goes for status 304 (Not Modified):
+       *
+       * "If a cache uses a received 304 response to update a cache entry,
+       *  the cache MUST update the entry to reflect any new field values
+       *  given in the response."
+       */
+ if (status_num == 204 || status_num == 304) {
+ has_content_length = true;
+ } else if (0 && data.length() == 0) {
has_content_length = true;
print("Transfer-Enconding: %s\r\n", "chunked");
data.append("0\r\n\r\n", sizeof("0\r\n\r\n")-1);
{
env.init(cct);
struct mg_request_info *info = mg_get_request_info(conn);
+
if (!info)
return;
*dest = c;
}
*dest = '\0';
-
+
env.set(buf, header->value);
}
}
}
-int RGWMongoose::send_status(const char *status, const char *status_name)
+int RGWMongoose::send_status(int status, const char *status_name)
{
char buf[128];
if (!status_name)
status_name = "";
- snprintf(buf, sizeof(buf), "HTTP/1.1 %s %s\r\n", status, status_name);
+ snprintf(buf, sizeof(buf), "HTTP/1.1 %d %s\r\n", status, status_name);
bufferlist bl;
bl.append(buf);
bl.append(header_data);
header_data = bl;
- int status_num = atoi(status);
+ status_num = status;
mg_set_http_status(conn, status_num);
return 0;
bufferlist data;
int port;
+ int status_num;
bool header_done;
bool sent_header;
int write_data(const char *buf, int len);
int read_data(char *buf, int len);
- int send_status(const char *status, const char *status_name);
+ int send_status(int status, const char *status_name);
int send_100_continue();
int complete_header();
int complete_request();
virtual void flush() = 0;
int read(char *buf, int max, int *actual);
- virtual int send_status(const char *status, const char *status_name) = 0;
+ virtual int send_status(int status, const char *status_name) = 0;
virtual int send_100_continue() = 0;
virtual int complete_header() = 0;
virtual int complete_request() = 0;
return false;
}
+bool RGWCORSRule::has_wildcard_origin() {
+  return allowed_origins.find("*") != allowed_origins.end();
+}
+
bool RGWCORSRule::is_origin_present(const char *o) {
string origin = o;
return is_string_in_set(allowed_origins, origin);
::decode(exposable_hdrs, bl);
DECODE_FINISH(bl);
}
+ bool has_wildcard_origin();
bool is_origin_present(const char *o);
void format_exp_headers(std::string& s);
void erase_origin_if_present(std::string& origin, bool *rule_empty);
env.init(cct, (char **)fcgx->envp);
}
-int RGWFCGX::send_status(const char *status, const char *status_name)
+int RGWFCGX::send_status(int status, const char *status_name)
{
- return print("Status: %s %s\r\n", status, status_name);
+ status_num = status;
+ return print("Status: %d %s\r\n", status, status_name);
}
int RGWFCGX::send_100_continue()
{
- int r = send_status("100", "Continue");
+ int r = send_status(100, "Continue");
if (r >= 0) {
flush();
}
int RGWFCGX::send_content_length(uint64_t len)
{
+  /*
+   * Status 204 must not include a Content-Length header;
+   * RFC 7230 says so.
+   */
+ if (status_num == 204)
+ return 0;
+
char buf[21];
snprintf(buf, sizeof(buf), "%" PRIu64, len);
return print("Content-Length: %s\r\n", buf);
{
return print("\r\n");
}
-
class RGWFCGX : public RGWClientIO
{
FCGX_Request *fcgx;
+
+ int status_num;
+
protected:
void init_env(CephContext *cct);
int write_data(const char *buf, int len);
int read_data(char *buf, int len);
- int send_status(const char *status, const char *status_name);
+ int send_status(int status, const char *status_name);
int send_100_continue();
int complete_header();
int complete_request() { return 0; }
int send_content_length(uint64_t len);
public:
- RGWFCGX(FCGX_Request *_fcgx) : fcgx(_fcgx) {}
+ RGWFCGX(FCGX_Request *_fcgx) : fcgx(_fcgx), status_num(0) {}
void flush();
};
if (http_err >= 200 && http_err <= 299)
return 0;
switch (http_err) {
+ case 304:
+ return -ERR_NOT_MODIFIED;
case 400:
return -EINVAL;
case 401:
{
JSONDecoder::decode_json("regions", regions, obj);
JSONDecoder::decode_json("master_region", master_region, obj);
+ JSONDecoder::decode_json("bucket_quota", bucket_quota, obj);
JSONDecoder::decode_json("user_quota", user_quota, obj);
}
env.set("SERVER_PORT", port_buf);
}
-int RGWLoadGenIO::send_status(const char *status, const char *status_name)
+int RGWLoadGenIO::send_status(int status, const char *status_name)
{
return 0;
}
int write_data(const char *buf, int len);
int read_data(char *buf, int len);
- int send_status(const char *status, const char *status_name);
+ int send_status(int status, const char *status_name);
int send_100_continue();
int complete_header();
int complete_request();
for (list<string>::iterator iter = frontends.begin(); iter != frontends.end(); ++iter) {
string& f = *iter;
+ if (f.find("civetweb") != string::npos) {
+ if (f.find("port") != string::npos) {
+      // check for the most common whitespace problems
+ if ((f.find("port=") == string::npos) ||
+ (f.find("port= ") != string::npos)) {
+ derr << "WARNING: civetweb frontend config found unexpected spacing around 'port' (ensure civetweb port parameter has the form 'port=80' with no spaces before or after '=')" << dendl;
+ }
+ }
+ }
+
RGWFrontendConfig *config = new RGWFrontendConfig(f);
int r = config->init();
if (r < 0) {
time_t mtime = 0;
- JSONDecoder::decode_json("key", metadata_key, &parser);
- JSONDecoder::decode_json("ver", *objv, &parser);
- JSONDecoder::decode_json("mtime", mtime, &parser);
+ try {
+ JSONDecoder::decode_json("key", metadata_key, &parser);
+ JSONDecoder::decode_json("ver", *objv, &parser);
+ JSONDecoder::decode_json("mtime", mtime, &parser);
+ } catch (JSONDecoder::err& e) {
+ return -EINVAL;
+ }
JSONObj *jo = parser.find_obj("data");
if (!jo) {
if (!rule)
return false;
+  /*
+   * Set the Access-Control-Allow-Origin header to an asterisk if the rule
+   * allows it and the client sent no Authorization header.
+   *
+   * The Origin request header specifies a URI that may access the resource;
+   * the browser must enforce this. For requests without credentials the
+   * server may respond with "*" as a wildcard, thereby allowing any origin
+   * to access the resource.
+   */
+ const char *authorization = s->info.env->get("HTTP_AUTHORIZATION");
+ if (!authorization && rule->has_wildcard_origin())
+ origin = "*";
+
/* CORS 6.2.3. */
const char *req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD");
if (!req_meth) {
ret = handle_user_manifest(attr_iter->second.c_str());
if (ret < 0) {
ldout(s->cct, 0) << "ERROR: failed to handle user manifest ret=" << ret << dendl;
+ goto done_err;
}
return;
}
}
}
+ ret = rgw_bucket_sync_user_stats(store, s->user.user_id, s->bucket);
+  if (ret < 0) {
+    ldout(s->cct, 1) << "WARNING: failed to sync user stats before bucket delete: ret=" << ret << dendl;
+ }
+
ret = store->delete_bucket(s->bucket, ot);
if (ret == 0) {
do {
void *handle;
+ rgw_obj obj;
- int ret = processor->handle_data(data, ofs, hash, &handle, &again);
+ int ret = processor->handle_data(data, ofs, hash, &handle, &obj, &again);
if (ret < 0)
return ret;
- ret = processor->throttle_data(handle, need_to_wait);
+ ret = processor->throttle_data(handle, obj, need_to_wait);
if (ret < 0)
return ret;
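The handle_data/throttle_data call sites here and in the hunks below now also thread the written rgw_obj through, so that once an AIO completes the drain path can record the object via add_written_obj() (see the put_obj_aio_info changes further down) and failed uploads can be cleaned up reliably.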
if_match,
if_nomatch,
attrs_mod,
+ copy_if_newer,
attrs, RGW_OBJ_CATEGORY_MAIN,
olh_epoch,
(version_id.empty() ? NULL : &version_id),
string version_id;
uint64_t olh_epoch;
+ bool copy_if_newer;
int init_common();
attrs_mod = RGWRados::ATTRSMOD_NONE;
last_ofs = 0;
olh_epoch = 0;
+ copy_if_newer = false;
}
static bool parse_copy_location(const string& src, string& bucket_name, rgw_obj_key& object);
RGWRados *store;
RGWBucketStatsCache bucket_stats_cache;
RGWUserStatsCache user_stats_cache;
+ RGWQuotaInfo def_bucket_quota;
+ RGWQuotaInfo def_user_quota;
int check_quota(const char *entity, RGWQuotaInfo& quota, RGWStorageStats& stats,
uint64_t num_objs, uint64_t size_kb) {
+ if (!quota.enabled)
+ return 0;
+
ldout(store->ctx(), 20) << entity << " quota: max_objects=" << quota.max_objects
<< " max_size_kb=" << quota.max_size_kb << dendl;
return 0;
}
public:
- RGWQuotaHandlerImpl(RGWRados *_store, bool quota_threads) : store(_store), bucket_stats_cache(_store), user_stats_cache(_store, quota_threads) {}
+ RGWQuotaHandlerImpl(RGWRados *_store, bool quota_threads) : store(_store), bucket_stats_cache(_store), user_stats_cache(_store, quota_threads) {
+ if (store->ctx()->_conf->rgw_bucket_default_quota_max_objects >= 0) {
+ def_bucket_quota.max_objects = store->ctx()->_conf->rgw_bucket_default_quota_max_objects;
+ def_bucket_quota.enabled = true;
+ }
+ if (store->ctx()->_conf->rgw_bucket_default_quota_max_size >= 0) {
+ def_bucket_quota.max_size_kb = store->ctx()->_conf->rgw_bucket_default_quota_max_size;
+ def_bucket_quota.enabled = true;
+ }
+ if (store->ctx()->_conf->rgw_user_default_quota_max_objects >= 0) {
+ def_user_quota.max_objects = store->ctx()->_conf->rgw_user_default_quota_max_objects;
+ def_user_quota.enabled = true;
+ }
+ if (store->ctx()->_conf->rgw_user_default_quota_max_size >= 0) {
+ def_user_quota.max_size_kb = store->ctx()->_conf->rgw_user_default_quota_max_size;
+ def_user_quota.enabled = true;
+ }
+ }
virtual int check_quota(const string& user, rgw_bucket& bucket,
RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota,
uint64_t num_objs, uint64_t size) {
- if (!bucket_quota.enabled && !user_quota.enabled)
+ if (!bucket_quota.enabled && !user_quota.enabled && !def_bucket_quota.enabled && !def_user_quota.enabled)
return 0;
uint64_t size_kb = rgw_rounded_objsize_kb(size);
return ret;
}
- if (user_quota.enabled) {
+ if (def_bucket_quota.enabled) {
+ ret = check_quota("def_bucket", def_bucket_quota, bucket_stats, num_objs, size_kb);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (user_quota.enabled || def_user_quota.enabled) {
RGWStorageStats user_stats;
ret = user_stats_cache.get_stats(user, bucket, user_stats, user_quota);
if (ret < 0)
return ret;
- ret = check_quota("user", user_quota, user_stats, num_objs, size_kb);
- if (ret < 0)
- return ret;
+ if (user_quota.enabled) {
+ ret = check_quota("user", user_quota, user_stats, num_objs, size_kb);
+ if (ret < 0)
+ return ret;
+ } else if (def_user_quota.enabled) {
+ ret = check_quota("def_user", def_user_quota, user_stats, num_objs, size_kb);
+ if (ret < 0)
+ return ret;
+ }
}
return 0;
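The new rgw_bucket_default_quota_* and rgw_user_default_quota_* options read in the constructor above supply site-wide default quotas that apply whenever no explicit per-bucket or per-user quota is enabled; a negative value leaves that default disabled. A hypothetical ceph.conf fragment (section name and values illustrative; sizes are in KB, matching the max_size_kb field):

[client.rgw]
rgw bucket default quota max objects = 100000
rgw bucket default quota max size = -1
rgw user default quota max objects = -1
rgw user default quota max size = 10485760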
if (r < 0)
return r;
- is_complete = true;
+ is_complete = !canceled;
return 0;
}
if (is_complete)
return;
- list<rgw_obj>::iterator iter;
+ set<rgw_obj>::iterator iter;
bool is_multipart_obj = false;
rgw_obj multipart_obj;
   * details are described in #11749
*/
for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
- rgw_obj &obj = *iter;
+ const rgw_obj &obj = *iter;
if (RGW_OBJ_NS_MULTIPART == obj.ns) {
ldout(store->ctx(), 5) << "NOTE: we should not process the multipart object (" << obj << ") here" << dendl;
multipart_obj = *iter;
obj_len = abs_ofs + bl.length();
if (!(obj == last_written_obj)) {
- add_written_obj(obj);
last_written_obj = obj;
}
bl,
((ofs != 0) ? ofs : -1),
exclusive, phandle);
-
return r;
}
}
struct put_obj_aio_info info = pop_pending();
int ret = store->aio_wait(info.handle);
+
+ if (ret >= 0) {
+ add_written_obj(info.obj);
+ }
+
return ret;
}
return ret;
}
-int RGWPutObjProcessor_Aio::throttle_data(void *handle, bool need_to_wait)
+int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_obj& obj, bool need_to_wait)
{
if (handle) {
struct put_obj_aio_info info;
info.handle = handle;
+ info.obj = obj;
pending.push_back(info);
}
size_t orig_size = pending.size();
return 0;
}
-int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, bool exclusive)
+int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_obj *pobj, bool exclusive)
{
if (ofs >= next_part_ofs) {
int r = prepare_next_part(ofs);
}
}
+ *pobj = cur_obj;
+
return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
}
-int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, bool *again)
+int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, rgw_obj *pobj, bool *again)
{
*again = false;
bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
we could be racing with another upload, to the same
object and cleanup can be messy */
- int ret = write_data(bl, write_ofs, phandle, exclusive);
+ int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
if (hash) {
hash->Update((const byte *)bl.c_str(), bl.length());
first_chunk.claim(pending_data_bl);
obj_len = (uint64_t)first_chunk.length();
}
- if (pending_data_bl.length()) {
+ while (pending_data_bl.length()) {
void *handle;
- int r = write_data(pending_data_bl, data_ofs, &handle, false);
+ uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
+ if (max_write_size > pending_data_bl.length()) {
+ max_write_size = pending_data_bl.length();
+ }
+ bufferlist bl;
+ pending_data_bl.splice(0, max_write_size, &bl);
+ rgw_obj obj;
+ int r = write_data(bl, data_ofs, &handle, &obj, false);
if (r < 0) {
ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
return r;
}
- r = throttle_data(handle, false);
+ data_ofs += bl.length();
+ r = throttle_data(handle, obj, false);
if (r < 0) {
ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
return r;
}
+
+ if (data_ofs >= next_part_ofs) {
+ r = prepare_next_part(data_ofs);
+ if (r < 0) {
+ ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
+ return r;
+ }
+ }
}
int r = complete_parts();
if (r < 0) {
return r;
}
+ canceled = obj_op.meta.canceled;
+
return 0;
}
}
*alignment = ioctx.pool_required_alignment();
+ if (*alignment != 0) {
+ ldout(cct, 20) << "required alignment=" << *alignment << dendl;
+ }
+
return 0;
}
*max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
+ ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
+
return 0;
}
}
}
+ meta.canceled = false;
+
/* update quota cache */
store->quota_handler->update_stats(meta.owner, bucket, (orig_exists ? 0 : 1), size, orig_size);
ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
}
+ meta.canceled = true;
+
/* we lost in a race. There are a few options:
* - existing object was rewritten (ECANCELED)
* - non existing object was created (EEXIST)
do {
void *handle;
- int ret = processor->handle_data(bl, ofs, NULL, &handle, &again);
+ rgw_obj obj;
+ int ret = processor->handle_data(bl, ofs, NULL, &handle, &obj, &again);
if (ret < 0)
return ret;
ret = opstate->renew_state();
if (ret < 0) {
ldout(processor->ctx(), 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
- int r = processor->throttle_data(handle, false);
+ int r = processor->throttle_data(handle, obj, false);
if (r < 0) {
ldout(processor->ctx(), 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
}
need_opstate = false;
}
- ret = processor->throttle_data(handle, false);
+ ret = processor->throttle_data(handle, obj, false);
if (ret < 0)
return ret;
} while (again);
int complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs) {
return processor->complete(etag, mtime, set_mtime, attrs);
}
+
+ bool is_canceled() {
+ return processor->is_canceled();
+ }
};
/*
const char *if_match,
const char *if_nomatch,
AttrsMod attrs_mod,
+ bool copy_if_newer,
map<string, bufferlist>& attrs,
RGWObjCategory category,
uint64_t olh_epoch,
RGWRESTStreamReadRequest *in_stream_req;
string tag;
map<string, bufferlist> src_attrs;
+ int i;
append_rand_alpha(cct, tag, tag, 32);
RGWPutObjProcessor_Atomic processor(obj_ctx,
string etag;
map<string, string> req_headers;
time_t set_mtime;
+
+ RGWObjState *dest_state = NULL;
+
+ time_t dest_mtime;
+ const time_t *pmod = mod_ptr;
+
+ if (copy_if_newer) {
+ /* need to get mtime for destination */
+ ret = get_obj_state(&obj_ctx, dest_obj, &dest_state, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (dest_state->exists) {
+ dest_mtime = dest_state->mtime;
+ pmod = &dest_mtime;
+ }
+ }
+
- ret = conn->get_obj(user_id, info, src_obj, true, &cb, &in_stream_req);
+ ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr, true, &cb, &in_stream_req);
if (ret < 0) {
goto set_err_state;
}
attrs = src_attrs;
}
- ret = cb.complete(etag, mtime, set_mtime, attrs);
- if (ret < 0) {
+#define MAX_COMPLETE_RETRY 100
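+  /* complete() can lose a race with a concurrent writer to the same
+   * object; with copy-if-newer set, re-read the destination state and
+   * retry while the copied object is still newer than the destination */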
+ for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
+ ret = cb.complete(etag, mtime, set_mtime, attrs);
+ if (ret < 0) {
+ goto set_err_state;
+ }
+ if (copy_if_newer && cb.is_canceled()) {
+ ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
+ obj_ctx.invalidate(dest_obj); /* object was overwritten */
+ ret = get_obj_state(&obj_ctx, dest_obj, &dest_state, NULL);
+ if (ret < 0) {
+ ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
+ goto set_err_state;
+ }
+ if (!dest_state->exists ||
+ dest_state->mtime < set_mtime) {
+ ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
+ continue;
+ } else {
+ ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
+ }
+ }
+ break;
+ }
+
+ if (i == MAX_COMPLETE_RETRY) {
+ ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
+ ret = -EIO;
goto set_err_state;
}
return 0;
set_err_state:
- int r = opstate.set_state(RGWOpState::OPSTATE_ERROR);
+ RGWOpState::OpState state = RGWOpState::OPSTATE_ERROR;
+ if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
+ state = RGWOpState::OPSTATE_COMPLETE;
+ ret = 0;
+ }
+ int r = opstate.set_state(state);
if (r < 0) {
ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
}
const char *if_match,
const char *if_nomatch,
AttrsMod attrs_mod,
+ bool copy_if_newer,
map<string, bufferlist>& attrs,
RGWObjCategory category,
uint64_t olh_epoch,
if (remote_src || !source_zone.empty()) {
return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, info, source_zone,
dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
- unmod_ptr, if_match, if_nomatch, attrs_mod, attrs, category,
+ unmod_ptr, if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
olh_epoch, version_id, ptag, petag, err, progress_cb, progress_data);
}
do {
void *handle;
+ rgw_obj obj;
- ret = processor.handle_data(bl, ofs, NULL, &handle, &again);
+ ret = processor.handle_data(bl, ofs, NULL, &handle, &obj, &again);
if (ret < 0) {
return ret;
}
- ret = processor.throttle_data(handle, false);
+ ret = processor.throttle_data(handle, obj, false);
if (ret < 0)
return ret;
} while (again);
int RGWRados::delete_bucket(rgw_bucket& bucket, RGWObjVersionTracker& objv_tracker)
{
librados::IoCtx index_ctx;
- string oid;
- int r = open_bucket_index(bucket, index_ctx, oid);
+ map<int, string> bucket_objs;
+ int r = open_bucket_index(bucket, index_ctx, bucket_objs);
if (r < 0)
return r;
if (r < 0) {
return r;
}
+    /* remove bucket index objects */
+ map<int, string>::const_iterator biter;
+ for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
+ index_ctx.remove(biter->second);
+ }
}
return 0;
}
return 0;
}
-int RGWRados::delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw_obj& obj,
+int RGWRados::delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj,
int versioning_status, uint16_t bilog_flags)
{
RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
if (conds.mod_ptr) {
ldout(cct, 10) << "If-Modified-Since: " << *conds.mod_ptr << " Last-Modified: " << ctime << dendl;
- if (ctime < *conds.mod_ptr) {
+ if (ctime <= *conds.mod_ptr) {
return -ERR_NOT_MODIFIED;
}
}
op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
}
-int RGWRados::bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, uint64_t olh_epoch)
+int RGWRados::bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, const string& olh_tag, uint64_t olh_epoch)
{
rgw_rados_ref ref;
rgw_bucket bucket;
}
cls_rgw_obj_key key(obj_instance.get_index_key_name(), obj_instance.get_instance());
- ret = cls_rgw_bucket_unlink_instance(bs.index_ctx, bs.bucket_obj, key, op_tag, olh_epoch, zone_public_config.log_data);
+ ret = cls_rgw_bucket_unlink_instance(bs.index_ctx, bs.bucket_obj, key, op_tag, olh_tag, olh_epoch, zone_public_config.log_data);
if (ret < 0) {
return ret;
}
return ret;
}
- ret = bucket_index_unlink_instance(target_obj, op_tag, olh_epoch);
+ string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
+
+ ret = bucket_index_unlink_instance(target_obj, op_tag, olh_tag, olh_epoch);
if (ret < 0) {
ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " returned " << ret << dendl;
if (ret == -ECANCELED) {
map<string, bufferlist> updates;
uint32_t count = 0;
while (count < num_entries && !candidates.empty()) {
+ r = 0;
// Select the next one
int pos = candidates.begin()->second;
const string& name = vcurrents[pos]->first;
}
}
+int RGWRados::delete_obj_aio(rgw_obj& obj, rgw_bucket& bucket,
+ RGWBucketInfo& bucket_info, RGWObjState *astate,
+ list<librados::AioCompletion *>& handles, bool keep_index_consistent)
+{
+ rgw_rados_ref ref;
+ int ret = get_obj_ref(obj, &ref, &bucket);
+ if (ret < 0) {
+ lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
+ return ret;
+ }
+
+ if (keep_index_consistent) {
+ RGWRados::Bucket bop(this, bucket_info.bucket);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, obj, astate);
+
+ ret = index_op.prepare(CLS_RGW_OP_DEL);
+ if (ret < 0) {
+ lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
+ return ret;
+ }
+ }
+
+ ObjectWriteOperation op;
+ list<string> prefixes;
+ cls_rgw_remove_obj(op, prefixes);
+
+ AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
+ ret = ref.ioctx.aio_operate(ref.oid, c, &op);
+  if (ret < 0) {
+    lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
+    c->release();  // the completion is not handed to the drain list on failure
+    return ret;
+  }
+
+ handles.push_back(c);
+
+ if (keep_index_consistent) {
+ ret = delete_obj_index(obj);
+ if (ret < 0) {
+ lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
+ return ret;
+ }
+ }
+ return ret;
+}
+
int complete_atomic_modification();
public:
- Object(RGWRados *_store, RGWBucketInfo& _bucket_info, RGWObjectCtx& _ctx, rgw_obj& _obj) : store(_store), bucket_info(_bucket_info),
+ Object(RGWRados *_store, RGWBucketInfo& _bucket_info, RGWObjectCtx& _ctx, const rgw_obj& _obj) : store(_store), bucket_info(_bucket_info),
ctx(_ctx), obj(_obj), bs(store),
state(NULL), versioning_disabled(false),
bs_initialized(false) {}
const char *if_match;
const char *if_nomatch;
uint64_t olh_epoch;
+ bool canceled;
MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
remove_objs(NULL), set_mtime(0), category(RGW_OBJ_CATEGORY_MAIN), flags(0),
- if_match(NULL), if_nomatch(NULL), olh_epoch(0) {}
+ if_match(NULL), if_nomatch(NULL), olh_epoch(0), canceled(false) {}
} meta;
Write(RGWRados::Object *_target) : target(_target) {}
const char *if_match,
const char *if_nomatch,
AttrsMod attrs_mod,
+ bool copy_if_newer,
map<string, bufferlist>& attrs,
RGWObjCategory category,
uint64_t olh_epoch,
const char *if_match,
const char *if_nomatch,
AttrsMod attrs_mod,
+ bool copy_if_newer,
map<std::string, bufferlist>& attrs,
RGWObjCategory category,
uint64_t olh_epoch,
int bucket_suspended(rgw_bucket& bucket, bool *suspended);
/** Delete an object.*/
- virtual int delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_owner, rgw_obj& src_obj,
+ virtual int delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_owner, const rgw_obj& src_obj,
int versioning_status, uint16_t bilog_flags = 0);
/* Delete a system object */
int bucket_index_link_olh(RGWObjState& olh_state, rgw_obj& obj_instance, bool delete_marker,
const string& op_tag, struct rgw_bucket_dir_entry_meta *meta,
uint64_t olh_epoch);
- int bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, uint64_t olh_epoch);
+ int bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, const string& olh_tag, uint64_t olh_epoch);
int bucket_index_read_olh_log(RGWObjState& state, rgw_obj& obj_instance, uint64_t ver_marker,
map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log, bool *is_truncated);
int bucket_index_trim_olh_log(RGWObjState& obj_state, rgw_obj& obj_instance, uint64_t ver);
librados::Rados* get_rados_handle();
+ int delete_obj_aio(rgw_obj& obj, rgw_bucket& bucket, RGWBucketInfo& info, RGWObjState *astate,
+ list<librados::AioCompletion *>& handles, bool keep_index_consistent);
private:
/**
* This is a helper method, it generates a list of bucket index objects with the given
RGWObjectCtx& obj_ctx;
bool is_complete;
RGWBucketInfo bucket_info;
+ bool canceled;
virtual int do_complete(string& etag, time_t *mtime, time_t set_mtime,
map<string, bufferlist>& attrs,
const char *if_match = NULL, const char *if_nomatch = NULL) = 0;
public:
- RGWPutObjProcessor(RGWObjectCtx& _obj_ctx, RGWBucketInfo& _bi) : store(NULL), obj_ctx(_obj_ctx), is_complete(false), bucket_info(_bi) {}
+ RGWPutObjProcessor(RGWObjectCtx& _obj_ctx, RGWBucketInfo& _bi) : store(NULL), obj_ctx(_obj_ctx), is_complete(false), bucket_info(_bi), canceled(false) {}
virtual ~RGWPutObjProcessor() {}
virtual int prepare(RGWRados *_store, string *oid_rand) {
store = _store;
return 0;
}
- virtual int handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, bool *again) = 0;
- virtual int throttle_data(void *handle, bool need_to_wait) = 0;
+ virtual int handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, rgw_obj *pobj, bool *again) = 0;
+ virtual int throttle_data(void *handle, const rgw_obj& obj, bool need_to_wait) = 0;
virtual void complete_hash(MD5 *hash) {
assert(0);
}
const char *if_match = NULL, const char *if_nomatch = NULL);
CephContext *ctx();
+
+ bool is_canceled() { return canceled; }
};
struct put_obj_aio_info {
void *handle;
+ rgw_obj obj;
};
class RGWPutObjProcessor_Aio : public RGWPutObjProcessor
protected:
uint64_t obj_len;
- list<rgw_obj> written_objs;
+ set<rgw_obj> written_objs;
void add_written_obj(const rgw_obj& obj) {
- written_objs.push_back(obj);
+ written_objs.insert(obj);
}
int drain_pending();
int handle_obj_data(rgw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive);
public:
- int throttle_data(void *handle, bool need_to_wait);
+ int throttle_data(void *handle, const rgw_obj& obj, bool need_to_wait);
RGWPutObjProcessor_Aio(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info) : RGWPutObjProcessor(obj_ctx, bucket_info), max_chunks(RGW_MAX_PENDING_CHUNKS), obj_len(0) {}
virtual ~RGWPutObjProcessor_Aio();
RGWObjManifest manifest;
RGWObjManifest::generator manifest_gen;
- int write_data(bufferlist& bl, off_t ofs, void **phandle, bool exclusive);
+ int write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_obj *pobj, bool exclusive);
virtual int do_complete(string& etag, time_t *mtime, time_t set_mtime,
map<string, bufferlist>& attrs,
const char *if_match = NULL, const char *if_nomatch = NULL);
void set_extra_data_len(uint64_t len) {
extra_data_len = len;
}
- virtual int handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, bool *again);
+ virtual int handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, rgw_obj *pobj, bool *again);
virtual void complete_hash(MD5 *hash);
bufferlist& get_extra_data() { return extra_data_bl; }
return false;
}
-static void dump_status(struct req_state *s, const char *status, const char *status_name)
+static void dump_status(struct req_state *s, int status, const char *status_name)
{
int r = s->cio->send_status(status, status_name);
if (r < 0) {
void dump_errno(struct req_state *s)
{
- char buf[32];
- snprintf(buf, sizeof(buf), "%d", s->err.http_ret);
- dump_status(s, buf, http_status_names[s->err.http_ret]);
+ dump_status(s, s->err.http_ret, http_status_names[s->err.http_ret]);
}
-void dump_errno(struct req_state *s, int err)
+void dump_errno(struct req_state *s, int http_ret)
{
- char buf[32];
- snprintf(buf, sizeof(buf), "%d", err);
- dump_status(s, buf, http_status_names[s->err.http_ret]);
+ dump_status(s, http_ret, http_status_names[http_ret]);
}
void dump_string_header(struct req_state *s, const char *name, const char *val)
const char *hdr, const char *exp_hdr, uint32_t max_age) {
if (origin && (origin[0] != '\0')) {
s->cio->print("Access-Control-Allow-Origin: %s\r\n", origin);
+
+ /* If the server specifies an origin host rather than "*",
+ * then it must also include Origin in the Vary response header
+ * to indicate to clients that server responses will differ
+ * based on the value of the Origin request header.
+ */
+ if (strcmp(origin, "*") != 0)
+ s->cio->print("Vary: Origin\r\n");
+
if (meth && (meth[0] != '\0'))
s->cio->print("Access-Control-Allow-Methods: %s\r\n", meth);
if (hdr && (hdr[0] != '\0'))
case OP_POST:
case OP_COPY:
/* is it a 'multi-object delete' request? */
- if (s->info.request_params == "delete") {
+ if (s->info.args.exists("delete")) {
only_bucket = true;
break;
}
return -EINVAL;
}
- decode_json_obj(out, &parser);
-
free(data);
+
+ try {
+ decode_json_obj(out, &parser);
+ } catch (JSONDecoder::err& e) {
+ return -EINVAL;
+ }
+
return 0;
}
return ret;
}
-int RGWRESTConn::get_obj(const string& uid, req_info *info /* optional */, rgw_obj& obj, bool prepend_metadata,
- RGWGetDataCB *cb, RGWRESTStreamReadRequest **req)
+static void set_date_header(const time_t *t, map<string, string>& headers, const string& header_name)
+{
+ if (!t) {
+ return;
+ }
+ stringstream s;
+ utime_t tm = utime_t(*t, 0);
+ tm.asctime(s);
+ headers["HTTP_IF_MODIFIED_SINCE"] = s.str();
+}
+
+int RGWRESTConn::get_obj(const string& uid, req_info *info /* optional */, rgw_obj& obj,
+ const time_t *mod_ptr, const time_t *unmod_ptr,
+ bool prepend_metadata, RGWGetDataCB *cb, RGWRESTStreamReadRequest **req)
{
string url;
int ret = get_url(url);
extra_headers[iter->first] = iter->second;
}
}
+
+ set_date_header(mod_ptr, extra_headers, "HTTP_IF_MODIFIED_SINCE");
+ set_date_header(unmod_ptr, extra_headers, "HTTP_IF_UNMODIFIED_SINCE");
+
return (*req)->get_obj(key, extra_headers, obj);
}
map<string, bufferlist>& attrs, RGWRESTStreamWriteRequest **req);
int complete_request(RGWRESTStreamWriteRequest *req, string& etag, time_t *mtime);
- int get_obj(const string& uid, req_info *info /* optional */, rgw_obj& obj, bool prepend_metadata, RGWGetDataCB *cb, RGWRESTStreamReadRequest **req);
+ int get_obj(const string& uid, req_info *info /* optional */, rgw_obj& obj,
+ const time_t *mod_ptr, const time_t *unmod_ptr,
+ bool prepend_metadata, RGWGetDataCB *cb, RGWRESTStreamReadRequest **req);
int complete_request(RGWRESTStreamReadRequest *req, string& etag, time_t *mtime, map<string, string>& attrs);
};
s->cio->print("%s: %s\r\n", riter->first.c_str(), riter->second.c_str());
}
- if (!content_type)
- content_type = "binary/octet-stream";
+ if (ret == -ERR_NOT_MODIFIED) {
+ end_header(s, this);
+ } else {
+ if (!content_type)
+ content_type = "binary/octet-stream";
- end_header(s, this, content_type);
+ end_header(s, this, content_type);
+ }
if (metadata_bl.length()) {
s->cio->write(metadata_bl.c_str(), metadata_bl.length());
s->formatter->dump_string("IsTruncated", (max && is_truncated ? "true" : "false"));
bool encode_key = false;
- if (strcasecmp(encoding_type.c_str(), "url") == 0)
+ if (strcasecmp(encoding_type.c_str(), "url") == 0) {
+ s->formatter->dump_string("EncodingType", "url");
encode_key = true;
+ }
if (ret >= 0) {
vector<RGWObjEnt>::iterator iter;
s->formatter->dump_string("IsTruncated", (max && is_truncated ? "true" : "false"));
bool encode_key = false;
- if (strcasecmp(encoding_type.c_str(), "url") == 0)
+ if (strcasecmp(encoding_type.c_str(), "url") == 0) {
+ s->formatter->dump_string("EncodingType", "url");
encode_key = true;
+ }
if (ret >= 0) {
vector<RGWObjEnt>::iterator iter;
if (s->system_request) {
source_zone = s->info.args.get(RGW_SYS_PARAM_PREFIX "source-zone");
+    s->info.args.get_bool(RGW_SYS_PARAM_PREFIX "copy-if-newer", &copy_if_newer, false);
if (!source_zone.empty()) {
client_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "client-id");
op_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "op-id");
}
s->formatter->dump_unsigned("PartNumber", info.num);
- s->formatter->dump_string("ETag", info.etag);
+ s->formatter->dump_format("ETag", "\"%s\"", info.etag.c_str());
s->formatter->dump_unsigned("Size", info.size);
s->formatter->close_section();
}
RGWOp *RGWHandler_ObjStore_Bucket_S3::op_post()
{
- if ( s->info.request_params == "delete" ) {
+ if (s->info.args.exists("delete")) {
return new RGWDeleteMultiObj_ObjStore_S3;
}
return 0;
}
+// remove all keys associated with a subuser
+int RGWAccessKeyPool::remove_subuser_keys(RGWUserAdminOpState& op_state,
+ std::string *err_msg, bool defer_user_update)
+{
+ int ret = 0;
+
+ if (!op_state.is_populated()) {
+ set_err_msg(err_msg, "user info was not populated");
+ return -EINVAL;
+ }
+
+ if (!op_state.has_subuser()) {
+ set_err_msg(err_msg, "no subuser specified");
+ return -EINVAL;
+ }
+
+ std::string swift_kid = op_state.build_default_swift_kid();
+ if (swift_kid.empty()) {
+ set_err_msg(err_msg, "empty swift access key");
+ return -EINVAL;
+ }
+
+ map<std::string, RGWAccessKey>::iterator kiter;
+ map<std::string, RGWAccessKey> *keys_map;
+
+ // a subuser can have at most one swift key
+ keys_map = swift_keys;
+ kiter = keys_map->find(swift_kid);
+ if (kiter != keys_map->end()) {
+ rgw_remove_key_index(store, kiter->second);
+ keys_map->erase(kiter);
+ }
+
+ // a subuser may have multiple s3 key pairs
+ std::string subuser_str = op_state.get_subuser();
+ keys_map = access_keys;
+ RGWUserInfo user_info = op_state.get_user_info();
+ map<std::string, RGWAccessKey>::iterator user_kiter = user_info.access_keys.begin();
+ for (; user_kiter != user_info.access_keys.end(); ++user_kiter) {
+ if (user_kiter->second.subuser == subuser_str) {
+ kiter = keys_map->find(user_kiter->first);
+ if (kiter != keys_map->end()) {
+ rgw_remove_key_index(store, kiter->second);
+ keys_map->erase(kiter);
+ }
+ }
+ }
+
+ if (!defer_user_update)
+ ret = user->update(op_state, err_msg);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
RGWSubUserPool::RGWSubUserPool(RGWUser *usr)
{
subusers_allowed = (usr != NULL);
map<std::string, RGWSubUser>::iterator siter;
siter = subuser_map->find(subuser_str);
-
+  if (siter == subuser_map->end()) {
+ set_err_msg(err_msg, "subuser not found: " + subuser_str);
+ return -EINVAL;
+ }
if (!op_state.has_existing_subuser()) {
set_err_msg(err_msg, "subuser not found: " + subuser_str);
return -EINVAL;
}
- if (op_state.will_purge_keys()) {
- // error would be non-existance so don't check
- user->keys.remove(op_state, &subprocess_msg, true);
- }
+  // always purge all associated keys
+ user->keys.remove_subuser_keys(op_state, &subprocess_msg, true);
- //remove the subuser from the user info
+ // remove the subuser from the user info
subuser_map->erase(siter);
// attempt to save the subuser
time_t mtime, JSONObj *obj, sync_type_t sync_mode) {
RGWUserInfo info;
- decode_json_obj(info, obj);
+ try {
+ decode_json_obj(info, obj);
+ } catch (JSONDecoder::err& e) {
+ return -EINVAL;
+ }
RGWUserInfo old_info;
time_t orig_mtime;
/* API Contract Fulfilment */
int execute_add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save);
int execute_remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save);
+ int remove_subuser_keys(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save);
int add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save);
int remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save);
AC_CHECK_LIB([z], [gzread], [HAVE_LIBZ=yes], [AC_MSG_FAILURE([libz not found])])
AC_CHECK_LIB([bz2], [BZ2_bzCompressInit], [HAVE_LIBBZ2=yes], [AC_MSG_FAILURE([libbz2 not found])])
AC_CHECK_LIB([rt], [clock_gettime], [HAVE_LIBRT=yes], [AC_MSG_FAILURE([librt not found])])
-AC_CHECK_LIB([tcmalloc], [malloc], [HAVE_LIBTCMALLOC=yes],[AC_MSG_FAILURE([no tcmalloc found ])])
+AC_ARG_WITH([tcmalloc],
+ [AS_HELP_STRING([--without-tcmalloc], [disable tcmalloc for memory allocations])],
+ [],
+ [AC_CHECK_LIB([tcmalloc],
+ [malloc],
+ [HAVE_LIBTCMALLOC=yes],
+         [AC_MSG_FAILURE([tcmalloc not found])])])
OLD_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS -std=c++11"
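With this change tcmalloc remains the default: if --with/--without-tcmalloc is not given, configure still fails hard when the library is missing, but packagers can now opt out explicitly via ./configure --without-tcmalloc.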
test/mon/osd-erasure-code-profile.sh \
test/mon/mkfs.sh \
test/osd/osd-scrub-repair.sh \
+ test/osd/osd-scrub-snaps.sh \
test/osd/osd-config.sh \
test/osd/osd-bench.sh \
test/osd/osd-copy-from.sh \
{
bufferlist bl;
bufferptr ptr(buffer::create_page_aligned(2));
+ ptr[0] = 'X';
+ ptr[1] = 'Y';
ptr.set_offset(1);
ptr.set_length(1);
bl.append(ptr);
EXPECT_FALSE(bl.is_page_aligned());
bl.rebuild();
- EXPECT_FALSE(bl.is_page_aligned());
+ EXPECT_EQ(1U, bl.length());
+ EXPECT_EQ('Y', *bl.begin());
}
{
bufferlist bl;
%bcond_with ocf
%bcond_without cephfs_java
+# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315
+%bcond_without lttng
+%endif
+
%if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
%{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
-# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
-%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
-%global _with_lttng 1
-%endif
-
Name: ceph
Version: @VERSION@
Release: @RPM_RELEASE@%{?dist}
BuildRequires: mozilla-nss-devel
BuildRequires: keyutils-devel
BuildRequires: libatomic-ops-devel
-%else
+Requires: lsb-release
+BuildRequires: lsb-release
+%endif
+%if 0%{?fedora} || 0%{?rhel}
Requires: gdisk
BuildRequires: nss-devel
BuildRequires: keyutils-libs-devel
Requires(preun):initscripts
BuildRequires: gperftools-devel
Requires: python-flask
+Requires: redhat-lsb-core
+BuildRequires: redhat-lsb-core
%endif
# lttng and babeltrace for rbd-replay-prep
-%if 0%{?_with_lttng}
+%if %{with lttng}
%if 0%{?fedora} || 0%{?rhel}
BuildRequires: lttng-ust-devel
BuildRequires: libbabeltrace-devel
%endif
./autogen.sh
-MY_CONF_OPT=""
+MY_CONF_OPT="$CEPH_EXTRA_CONFIGURE_ARGS"
MY_CONF_OPT="$MY_CONF_OPT --with-radosgw"
%endif
--with-librocksdb-static=check \
$MY_CONF_OPT \
+%if %{without lttng}
+ --without-lttng \
+ --without-babeltrace \
+%endif
%{?_with_ocf} \
CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
%{_libdir}/rados-classes/libcls_version.so*
%dir %{_libdir}/ceph/erasure-code
%{_libdir}/ceph/erasure-code/libec_*.so*
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/libos_tp.so*
%{_libdir}/libosd_tp.so*
%endif
%{_bindir}/rbd
%{_bindir}/rbd-replay
%{_bindir}/rbd-replay-many
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_bindir}/rbd-replay-prep
%endif
%{_bindir}/ceph-post-file
%files -n librados2
%defattr(-,root,root,-)
%{_libdir}/librados.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librados_tp.so.*
%endif
%{_includedir}/rados/rados_types.hpp
%{_includedir}/rados/memory.h
%{_libdir}/librados.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librados_tp.so
%endif
%files -n librbd1
%defattr(-,root,root,-)
%{_libdir}/librbd.so.*
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librbd_tp.so.*
%endif
%{_includedir}/rbd/librbd.hpp
%{_includedir}/rbd/features.h
%{_libdir}/librbd.so
-%if 0%{?_with_lttng}
+%if %{with lttng}
%{_libdir}/librbd_tp.so
%endif
import subprocess
# backported from python 2.7 stdlib
process = subprocess.Popen(
- stdout=subprocess.PIPE, *popenargs, **kwargs)
+ stdout=subprocess.PIPE, *popenargs, **kwargs)
output, unused_err = process.communicate()
retcode = process.poll()
if retcode:
raise error
return output
-import subprocess
+import filecmp
import os
+import subprocess
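+# subprocess.DEVNULL exists only on Python 3.3+; provide a fallback on Python 2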
+try:
+ from subprocess import DEVNULL
+except ImportError:
+ subprocess.DEVNULL = open(os.devnull, "w")
+
+import math
import time
import sys
import re
import string
import logging
import json
+import tempfile
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
def wait_for_health():
print "Wait for health_ok...",
+ tries = 0
while call("./ceph health 2> /dev/null | grep -v 'HEALTH_OK\|HEALTH_WARN' > /dev/null", shell=True) == 0:
+ tries += 1
+ if tries == 30:
+ raise Exception("Timed out waiting for HEALTH_OK")
time.sleep(5)
print "DONE"
if ID:
endhead = re.compile("{id}.*_head$".format(id=ID))
DIR = os.path.join(SUBDIR, "current")
- PGS += [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and (ID == None or endhead.match(f))]
+ PGS += [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and (ID is None or endhead.match(f))]
PGS = [re.sub("_head", "", p) for p in PGS if "_head" in p]
return PGS
continue
FINALDIR = os.path.join(SUBDIR, PGDIR)
# See if there are any objects there
- if [ f for f in [ val for _, _, fl in os.walk(FINALDIR) for val in fl ] if string.find(f, prefix) == 0 ]:
+ if any(f for f in [val for _, _, fl in os.walk(FINALDIR) for val in fl] if f.startswith(prefix)):
PGS += [p]
return sorted(set(PGS))
print "<EOF>"
-def vstart(new):
+def vstart(new, opt=""):
print "vstarting....",
- OPT = new and "-n" or ""
- call("MON=1 OSD=4 CEPH_PORT=7400 ./vstart.sh -l {opt} -d mon osd > /dev/null 2>&1".format(opt=OPT), shell=True)
+ NEW = new and "-n" or ""
+ call("MON=1 OSD=4 CEPH_PORT=7400 ./vstart.sh -l {new} -d mon osd {opt} > /dev/null 2>&1".format(new=NEW, opt=opt), shell=True)
print "DONE"
-def test_failure_tty(cmd, errmsg):
- try:
- ttyfd = open("/dev/tty", "rw")
- except Exception, e:
- logging.info(str(e))
- logging.info("SKIP " + cmd)
- return 0
+
+def test_failure(cmd, errmsg, tty=False):
+ if tty:
+ try:
+ ttyfd = open("/dev/tty", "rw")
+ except Exception, e:
+ logging.info(str(e))
+ logging.info("SKIP " + cmd)
+ return 0
TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
tmpfd = open(TMPFILE, "w")
logging.debug(cmd)
- ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd)
- ttyfd.close()
+ if tty:
+ ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd)
+ ttyfd.close()
+ else:
+ ret = call(cmd, shell=True, stderr=tmpfd)
tmpfd.close()
if ret == 0:
+ logging.error(cmd)
logging.error("Should have failed, but got exit 0")
return 1
lines = get_lines(TMPFILE)
- line = lines[0]
- if line == errmsg:
- logging.info("Correctly failed with message \"" + line + "\"")
+ matched = [ l for l in lines if errmsg in l ]
+ if any(matched):
+ logging.info("Correctly failed with message \"" + matched[0] + "\"")
return 0
else:
- logging.error("Bad message to stderr \"" + line + "\"")
+ logging.error("Bad messages to stderr \"" + str(lines) + "\"")
return 1
-def test_failure(cmd, errmsg):
- logging.debug(cmd)
- try:
- out = check_output(cmd, stderr=subprocess.STDOUT, shell=True)
- logging.error("Should have failed, but got exit 0")
- return 1
- except subprocess.CalledProcessError, e:
- if errmsg in e.output:
- logging.info("Correctly failed with message \"" + errmsg + "\"")
- return 0
- else:
- logging.error("Bad message to stderr \"" + e.output + "\"")
- return 1
def get_nspace(num):
if num == 0:
return "ns{num}".format(num=num)
-def verify(DATADIR, POOL, NAME_PREFIX):
+def verify(DATADIR, POOL, NAME_PREFIX, db):
TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
nullfd = open(os.devnull, "w")
ERRORS = 0
- for nsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(NAME_PREFIX) == 0]:
+ for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(NAME_PREFIX) == 0]:
+ nsfile = rawnsfile.split("__")[0]
+ clone = rawnsfile.split("__")[1]
nspace = nsfile.split("-")[0]
file = nsfile.split("-")[1]
- path = os.path.join(DATADIR, nsfile)
+ # Skip clones
+ if clone != "head":
+ continue
+ path = os.path.join(DATADIR, rawnsfile)
try:
os.unlink(TMPFILE)
except:
os.unlink(TMPFILE)
except:
pass
+ for key, val in db[nspace][file]["xattr"].iteritems():
+ cmd = "./rados -p {pool} -N '{nspace}' getxattr {name} {key}".format(pool=POOL, name=file, key=key, nspace=nspace)
+ logging.debug(cmd)
+ getval = check_output(cmd, shell=True, stderr=nullfd)
+ logging.debug("getxattr {key} {val}".format(key=key, val=getval))
+ if getval != val:
+ logging.error("getxattr of key {key} returned wrong val: {get} instead of {orig}".format(key=key, get=getval, orig=val))
+ ERRORS += 1
+ continue
+ hdr = db[nspace][file].get("omapheader", "")
+ cmd = "./rados -p {pool} -N '{nspace}' getomapheader {name} {file}".format(pool=POOL, name=file, nspace=nspace, file=TMPFILE)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd)
+ if ret != 0:
+ logging.error("rados getomapheader returned {ret}".format(ret=ret))
+ ERRORS += 1
+ else:
+ getlines = get_lines(TMPFILE)
+ assert(len(getlines) == 0 or len(getlines) == 1)
+ if len(getlines) == 0:
+ gethdr = ""
+ else:
+ gethdr = getlines[0]
+ logging.debug("header: {hdr}".format(hdr=gethdr))
+ if gethdr != hdr:
+ logging.error("getomapheader returned wrong val: {get} instead of {orig}".format(get=gethdr, orig=hdr))
+ ERRORS += 1
+ for key, val in db[nspace][file]["omap"].iteritems():
+ cmd = "./rados -p {pool} -N '{nspace}' getomapval {name} {key} {file}".format(pool=POOL, name=file, key=key, nspace=nspace, file=TMPFILE)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd)
+ if ret != 0:
+ logging.error("getomapval returned {ret}".format(ret=ret))
+ ERRORS += 1
+ continue
+ getlines = get_lines(TMPFILE)
+ if len(getlines) != 1:
+ logging.error("Bad data from getomapval {lines}".format(lines=getlines))
+ ERRORS += 1
+ continue
+ getval = getlines[0]
+ logging.debug("getomapval {key} {val}".format(key=key, val=getval))
+ if getval != val:
+ logging.error("getomapval returned wrong val: {get} instead of {orig}".format(get=getval, orig=val))
+ ERRORS += 1
+ try:
+ os.unlink(TMPFILE)
+ except:
+ pass
+ return ERRORS
+
+
+def check_journal(jsondict):
+ errors = 0
+ if 'header' not in jsondict:
+ logging.error("Key 'header' not in dump-journal")
+ errors += 1
+ elif 'max_size' not in jsondict['header']:
+ logging.error("Key 'max_size' not in dump-journal header")
+ errors += 1
+ else:
+ print "\tJournal max_size = {size}".format(size=jsondict['header']['max_size'])
+ if 'entries' not in jsondict:
+ logging.error("Key 'entries' not in dump-journal output")
+ errors += 1
+ elif len(jsondict['entries']) == 0:
+ logging.info("No entries in journal found")
+ else:
+ errors += check_journal_entries(jsondict['entries'])
+ return errors
+
+
+def check_journal_entries(entries):
+ errors = 0
+ for enum in range(len(entries)):
+ if 'offset' not in entries[enum]:
+ logging.error("No 'offset' key in entry {e}".format(e=enum))
+ errors += 1
+ if 'seq' not in entries[enum]:
+ logging.error("No 'seq' key in entry {e}".format(e=enum))
+ errors += 1
+ if 'transactions' not in entries[enum]:
+ logging.error("No 'transactions' key in entry {e}".format(e=enum))
+ errors += 1
+ elif len(entries[enum]['transactions']) == 0:
+ logging.error("No transactions found in entry {e}".format(e=enum))
+ errors += 1
+ else:
+ errors += check_entry_transactions(entries[enum], enum)
+ return errors
+
+
+def check_entry_transactions(entry, enum):
+ errors = 0
+ for tnum in range(len(entry['transactions'])):
+ if 'trans_num' not in entry['transactions'][tnum]:
+ logging.error("Key 'trans_num' missing from entry {e} trans {t}".format(e=enum, t=tnum))
+ errors += 1
+ elif entry['transactions'][tnum]['trans_num'] != tnum:
+ ft = entry['transactions'][tnum]['trans_num']
+ logging.error("Bad trans_num ({ft}) entry {e} trans {t}".format(ft=ft, e=enum, t=tnum))
+ errors += 1
+ if 'ops' not in entry['transactions'][tnum]:
+ logging.error("Key 'ops' missing from entry {e} trans {t}".format(e=enum, t=tnum))
+ errors += 1
+ else:
+ errors += check_transaction_ops(entry['transactions'][tnum]['ops'], enum, tnum)
+ return errors
+
+
+def check_transaction_ops(ops, enum, tnum):
+ if len(ops) == 0:
+ logging.warning("No ops found in entry {e} trans {t}".format(e=enum, t=tnum))
+ errors = 0
+ for onum in range(len(ops)):
+ if 'op_num' not in ops[onum]:
+ logging.error("Key 'op_num' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
+ errors += 1
+ elif ops[onum]['op_num'] != onum:
+ fo = ops[onum]['op_num']
+ logging.error("Bad op_num ({fo}) from entry {e} trans {t} op {o}".format(fo=fo, e=enum, t=tnum, o=onum))
+ errors += 1
+ if 'op_name' not in ops[onum]:
+ logging.error("Key 'op_name' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
+ errors += 1
+ return errors
+
+
+def test_dump_journal(CFSD_PREFIX, osds):
+ ERRORS = 0
+ pid = os.getpid()
+ TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
+
+ for osd in osds:
+ # Test --op dump-journal by loading json
+ cmd = (CFSD_PREFIX + "--op dump-journal --format json").format(osd=osd)
+ logging.debug(cmd)
+ tmpfd = open(TMPFILE, "w")
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+ ERRORS += 1
+ continue
+ tmpfd.close()
+ tmpfd = open(TMPFILE, "r")
+ jsondict = json.load(tmpfd)
+ tmpfd.close()
+ os.unlink(TMPFILE)
+
+ journal_errors = check_journal(jsondict)
+ if journal_errors != 0:
+ logging.error(jsondict)
+ ERRORS += journal_errors
+
return ERRORS
+
CEPH_DIR = "ceph_objectstore_tool_dir"
CEPH_CONF = os.path.join(CEPH_DIR, 'ceph.conf')
+
def kill_daemons():
call("./init-ceph -c {conf} stop osd mon > /dev/null 2>&1".format(conf=CEPH_CONF), shell=True)
+
+def check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME):
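+ # Compare each exported data file against every on-disk replica found
+ # under OSDDIR; returns (error count, replica count).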
+ repcount = 0
+ ERRORS = 0
+ for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(SPLIT_NAME) == 0]:
+ nsfile = rawnsfile.split("__")[0]
+ clone = rawnsfile.split("__")[1]
+ nspace = nsfile.split("-")[0]
+ file = nsfile.split("-")[1] + "__" + clone
+ # Skip clones
+ if clone != "head":
+ continue
+ path = os.path.join(DATADIR, rawnsfile)
+ tmpfd = open(TMPFILE, "w")
+ cmd = "find {dir} -name '{file}_*_{nspace}_*'".format(dir=OSDDIR, file=file, nspace=nspace)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret:
+ logging.critical("INTERNAL ERROR")
+ return 1
+ tmpfd.close()
+ obj_locs = get_lines(TMPFILE)
+ if len(obj_locs) == 0:
+ logging.error("Can't find imported object {name}".format(name=file))
+ ERRORS += 1
+ for obj_loc in obj_locs:
+ repcount += 1
+ cmd = "diff -q {src} {obj_loc}".format(src=path, obj_loc=obj_loc)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("{file} data not imported properly into {obj}".format(file=file, obj=obj_loc))
+ ERRORS += 1
+ return ERRORS, repcount
+
+
+def set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
+ # change the weight of the given osds to the given weight in the newest osdmap stored at osd_path
+ osdmap_file = tempfile.NamedTemporaryFile()
+ cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
+ osdmap_file=osdmap_file.name)
+ output = check_output(cmd, shell=True)
+ epoch = int(re.findall('#(\d+)', output)[0])
+
+ new_crush_file = tempfile.NamedTemporaryFile(delete=False)
+ old_crush_file = tempfile.NamedTemporaryFile(delete=False)
+ ret = call("./osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+ crush_file=old_crush_file.name),
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ shell=True)
+ assert(ret == 0)
+
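+ # reweight each osd in turn, swapping the crush files so this round's output feeds the next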
+ for osd_id in osd_ids:
+ cmd = "./crushtool -i {crush_file} --reweight-item osd.{osd} {weight} -o {new_crush_file}".format(osd=osd_id,
+ crush_file=old_crush_file.name,
+ weight=weight,
+ new_crush_file=new_crush_file.name)
+ ret = call(cmd, stdout=subprocess.DEVNULL, shell=True)
+ assert(ret == 0)
+ old_crush_file, new_crush_file = new_crush_file, old_crush_file
+
+ # swap them back, since we don't need to prepare for another round
+ old_crush_file, new_crush_file = new_crush_file, old_crush_file
+ old_crush_file.close()
+
+ ret = call("./osdmaptool --import-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+ crush_file=new_crush_file.name),
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ shell=True)
+ assert(ret == 0)
+
+ # Minimum test of --dry-run by using it, but not checking anything
+ cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force --dry-run"
+ cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
+ ret = call(cmd, stdout=subprocess.DEVNULL, shell=True)
+ assert(ret == 0)
+
+ # osdmaptool increases the epoch of the changed osdmap, so we need to force the tool
+ # to use a different epoch than the one in the osdmap
+ cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force"
+ cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
+ ret = call(cmd, stdout=subprocess.DEVNULL, shell=True)
+ return ret == 0
+
+def get_osd_weights(CFSD_PREFIX, osd_ids, osd_path):
+ osdmap_file = tempfile.NamedTemporaryFile()
+ cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
+ osdmap_file=osdmap_file.name)
+ ret = call(cmd, stdout=subprocess.DEVNULL, shell=True)
+ if ret != 0:
+ return None
+ # we have to read the weights from the crush map; we could also query them
+ # with osdmaptool, but keep in mind they are different:
+ # item weights in the crush map versus the weight associated with each osd in the osdmap
+ crush_file = tempfile.NamedTemporaryFile(delete=False)
+ ret = call("./osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+ crush_file=crush_file.name),
+ stdout=subprocess.DEVNULL,
+ shell=True)
+ assert(ret == 0)
+ output = check_output("./crushtool --tree -i {crush_file} | tail -n {num_osd}".format(crush_file=crush_file.name,
+ num_osd=len(osd_ids)),
+ stderr=subprocess.DEVNULL,
+ shell=True)
+ weights = []
+ for line in output.strip().split('\n'):
+ osd_id, weight, osd_name = re.split('\s+', line)
+ weights.append(float(weight))
+ return weights
+
+
+def test_get_set_osdmap(CFSD_PREFIX, osd_ids, osd_paths):
+ print "Testing get-osdmap and set-osdmap"
+ errors = 0
+ kill_daemons()
+ weight = 1 / math.e # just some magic number in [0, 1]
+ changed = []
+ for osd_path in osd_paths:
+ if set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
+ changed.append(osd_path)
+ else:
+ logging.warning("Failed to change the weights: {0}".format(osd_path))
+ # it is an error if none of the stores was changed
+ if not changed:
+ errors += 1
+
+ for osd_path in changed:
+ weights = get_osd_weights(CFSD_PREFIX, osd_ids, osd_path)
+ if not weights:
+ errors += 1
+ continue
+ if any(abs(w - weight) > 1e-5 for w in weights):
+ logging.warning("Weight is not changed: {0} != {1}".format(weights, weight))
+ errors += 1
+ return errors
+
+def test_get_set_inc_osdmap(CFSD_PREFIX, osd_path):
+ # incrementals are not used unless we need to build an MOSDMap to update an
+ # OSD's peers, so an obvious way to test this is to simply overwrite an epoch
+ # with a different copy and read it back to see if it matches.
+ kill_daemons()
+ file_e2 = tempfile.NamedTemporaryFile()
+ cmd = (CFSD_PREFIX + "--op get-inc-osdmap --file {file}").format(osd=osd_path,
+ file=file_e2.name)
+ output = check_output(cmd, shell=True)
+ epoch = int(re.findall('#(\d+)', output)[0])
+ # backup e1 incremental before overwriting it
+ epoch -= 1
+ file_e1_backup = tempfile.NamedTemporaryFile()
+ cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
+ ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+ if ret: return 1
+ # overwrite e1 with e2
+ cmd = CFSD_PREFIX + "--op set-inc-osdmap --force --epoch {epoch} --file {file}"
+ ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e2.name), shell=True)
+ if ret: return 1
+ # Use dry-run to set back to e1 which shouldn't happen
+ cmd = CFSD_PREFIX + "--op set-inc-osdmap --dry-run --epoch {epoch} --file {file}"
+ ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+ if ret: return 1
+ # read from e1
+ file_e1_read = tempfile.NamedTemporaryFile(delete=False)
+ cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
+ ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_read.name), shell=True)
+ if ret: return 1
+ errors = 0
+ try:
+ if not filecmp.cmp(file_e2.name, file_e1_read.name, shallow=False):
+ logging.error("{{get,set}}-inc-osdmap mismatch {0} != {1}".format(file_e2.name, file_e1_read.name))
+ errors += 1
+ finally:
+ # revert the change with file_e1_backup
+ cmd = CFSD_PREFIX + "--op set-inc-osdmap --epoch {epoch} --file {file}"
+ ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+ if ret:
+ logging.error("Failed to revert the changed inc-osdmap")
+ errors += 1
+ return errors
+
+
def main(argv):
sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
- nullfd = open(os.devnull, "w")
+ if len(argv) > 1 and argv[1] == "debug":
+ nullfd = sys.stdout
+ else:
+ nullfd = open(os.devnull, "w")
- call("rm -fr ceph_objectstore_tool_dir ; mkdir ceph_objectstore_tool_dir", shell=True)
- os.environ["CEPH_DIR"] = CEPH_DIR;
+ call("rm -fr {dir}; mkdir {dir}".format(dir=CEPH_DIR), shell=True)
+ os.environ["CEPH_DIR"] = CEPH_DIR
OSDDIR = os.path.join(CEPH_DIR, "dev")
REP_POOL = "rep_pool"
REP_NAME = "REPobject"
NAME = REP_NAME + "{num}".format(num=i)
LNAME = nspace + "-" + NAME
DDNAME = os.path.join(DATADIR, LNAME)
+ DDNAME += "__head"
cmd = "rm -f " + DDNAME
logging.debug(cmd)
logging.debug(cmd)
ret = call(cmd, shell=True, stderr=nullfd)
if ret != 0:
- logging.critical("Replicated pool object creation failed with {ret}".format(ret=ret))
+ logging.critical("Rados put command failed with {ret}".format(ret=ret))
return 1
db[nspace][NAME] = {}
logging.critical("setomapval failed with {ret}".format(ret=ret))
db[nspace][NAME]["omap"][mykey] = myval
+ # Create some clones
+ cmd = "./rados -p {pool} mksnap snap1".format(pool=REP_POOL)
+ logging.debug(cmd)
+ call(cmd, shell=True)
+
+ objects = range(1, NUM_REP_OBJECTS + 1)
+ nspaces = range(NUM_NSPACES)
+ for n in nspaces:
+ nspace = get_nspace(n)
+
+ for i in objects:
+ NAME = REP_NAME + "{num}".format(num=i)
+ LNAME = nspace + "-" + NAME
+ DDNAME = os.path.join(DATADIR, LNAME)
+ # First clone
+ CLONENAME = DDNAME + "__1"
+ DDNAME += "__head"
+
+ cmd = "mv -f " + DDNAME + " " + CLONENAME
+ logging.debug(cmd)
+ call(cmd, shell=True)
+
+ if i == 1:
+ dataline = range(DATALINECOUNT)
+ else:
+ dataline = range(1)
+ fd = open(DDNAME, "w")
+ data = "This is the replicated data after a snapshot for " + LNAME + "\n"
+ for _ in dataline:
+ fd.write(data)
+ fd.close()
+
+ cmd = "./rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=REP_POOL, name=NAME, ddname=DDNAME, nspace=nspace)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd)
+ if ret != 0:
+ logging.critical("Rados put command failed with {ret}".format(ret=ret))
+ return 1
+
print "Creating {objs} objects in erasure coded pool".format(objs=(NUM_EC_OBJECTS*NUM_NSPACES))
objects = range(1, NUM_EC_OBJECTS + 1)
NAME = EC_NAME + "{num}".format(num=i)
LNAME = nspace + "-" + NAME
DDNAME = os.path.join(DATADIR, LNAME)
+ DDNAME += "__head"
cmd = "rm -f " + DDNAME
logging.debug(cmd)
print "Test invalid parameters"
# On export can't use stdout to a terminal
cmd = (CFSD_PREFIX + "--op export --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure_tty(cmd, "stdout is a tty and no --file filename specified")
+ ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
# On export can't use stdout to a terminal
cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure_tty(cmd, "stdout is a tty and no --file filename specified")
+ ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
+ # Prep a valid ec export file for import failure tests
+ ONEECPG = ALLECPGS[0]
+ osds = get_osds(ONEECPG, OSDDIR)
+ ONEECOSD = osds[0]
OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
- foofd = open(OTHERFILE, "w")
- foofd.close()
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=ONEECPG, file=OTHERFILE)
+ logging.debug(cmd)
+ call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- # On import can't specify a PG
- cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {FOO}").format(osd=ONEOSD, pg=ONEPG, FOO=OTHERFILE)
- ERRORS += test_failure(cmd, "--pgid option invalid with import")
+ # On import can't specify a different shard
+ BADPG = ONEECPG.split('s')[0] + "s10"
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=BADPG, file=OTHERFILE)
+ ERRORS += test_failure(cmd, "Can't specify a different shard, must be")
+
+ os.unlink(OTHERFILE)
+
+ # Prep a valid export file for import failure tests
+ OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
+ logging.debug(cmd)
+ call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+
+ # On import can't specify a PG with a non-existent pool
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg="10.0", file=OTHERFILE)
+ ERRORS += test_failure(cmd, "Can't specify a different pgid pool, must be")
+
+ # On import can't specify shard for a replicated export
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg}s0 --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
+ ERRORS += test_failure(cmd, "Can't specify a sharded pgid with a non-sharded export")
+
+ # On import can't specify a PG with a bad seed
+ TMPPG="{pool}.80".format(pool=REPID)
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg=TMPPG, file=OTHERFILE)
+ ERRORS += test_failure(cmd, "Illegal pgid, the seed is larger than current pg_num")
os.unlink(OTHERFILE)
cmd = (CFSD_PREFIX + "--op import --file {FOO}").format(osd=ONEOSD, FOO=OTHERFILE)
- ERRORS += test_failure(cmd, "open: No such file or directory")
+ ERRORS += test_failure(cmd, "file: {FOO}: No such file or directory".format(FOO=OTHERFILE))
+
+ cmd = "./ceph-objectstore-tool --data-path BAD_DATA_PATH --journal-path " + OSDDIR + "/{osd}.journal --op list".format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "data-path: BAD_DATA_PATH: No such file or directory")
+
+ cmd = "./ceph-objectstore-tool --journal-path BAD_JOURNAL_PATH --op dump-journal"
+ ERRORS += test_failure(cmd, "journal-path: BAD_JOURNAL_PATH: (2) No such file or directory")
# On import can't use stdin from a terminal
cmd = (CFSD_PREFIX + "--op import --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure_tty(cmd, "stdin is a tty and no --file filename specified")
+ ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
# On import can't use stdin from a terminal
cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure_tty(cmd, "stdin is a tty and no --file filename specified")
+ ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
# Specify a bad --type
cmd = (CFSD_PREFIX + "--type foobar --op list --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
cmd = "./ceph-objectstore-tool --type filestore --data-path {dir}/{osd} --op list --pgid {pg}".format(dir=OSDDIR, osd=ONEOSD, pg=ONEPG)
ERRORS += test_failure(cmd, "Must provide --journal-path")
- # Test --op list and generate json for all objects
+ cmd = (CFSD_PREFIX + "--op remove").format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "Must provide pgid")
+
+ # Don't specify an --op or an object command
+ cmd = CFSD_PREFIX.format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "Must provide --op or object command...")
+
+ # Specify a bad --op command
+ cmd = (CFSD_PREFIX + "--op oops").format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)")
+
+ # Provide just the object param not a command
+ cmd = (CFSD_PREFIX + "object").format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "Invalid syntax, missing command")
+
+ # Provide an object name that doesn't exist
+ cmd = (CFSD_PREFIX + "NON_OBJECT get-bytes").format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "No object id 'NON_OBJECT' found")
+
+ # Provide an invalid object command
+ cmd = (CFSD_PREFIX + "--pgid {pg} '' notacommand").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Unknown object command 'notacommand'")
+
+ cmd = (CFSD_PREFIX + "foo list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "No object id 'foo' found or invalid JSON specified")
+
+ cmd = (CFSD_PREFIX + "'{{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}}' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Without --pgid the object '{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}' must be a JSON array")
+
+ cmd = (CFSD_PREFIX + "'[]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Object '[]' must be a JSON array with 2 elements")
+
+ cmd = (CFSD_PREFIX + "'[\"1.0\"]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Object '[\"1.0\"]' must be a JSON array with 2 elements")
+
+ cmd = (CFSD_PREFIX + "'[\"1.0\", 5, 8, 9]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Object '[\"1.0\", 5, 8, 9]' must be a JSON array with 2 elements")
+
+ cmd = (CFSD_PREFIX + "'[1, 2]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Object '[1, 2]' must be a JSON array with the first element a string")
+
+ cmd = (CFSD_PREFIX + "'[\"1.3\",{{\"snapid\":\"not an int\"}}]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Decode object JSON error: value type is 2 not 4")
+
TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
ALLPGS = OBJREPPGS + OBJECPGS
-
- print "Test --op list variants"
OSDS = get_osds(ALLPGS[0], OSDDIR)
osd = OSDS[0]
+ print "Test all --op dump-journal"
+ ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0]
+ ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
+
+ # Test --op list and generate json for all objects
+ print "Test --op list variants"
+
# retrieve all objects from all PGs
+ tmpfd = open(TMPFILE, "w")
cmd = (CFSD_PREFIX + "--op list --format json").format(osd=osd)
- logging.debug(cmd);
- tmpfd = open(TMPFILE, "a")
logging.debug(cmd)
ret = call(cmd, shell=True, stdout=tmpfd)
if ret != 0:
tmpfd.close()
lines = get_lines(TMPFILE)
JSONOBJ = sorted(set(lines))
- (pgid, jsondict) = json.loads(JSONOBJ[0])[0]
+ (pgid, coll, jsondict) = json.loads(JSONOBJ[0])[0]
# retrieve all objects in a given PG
- cmd = (CFSD_PREFIX + "--op list --pgid {pg} --format json").format(osd=osd, pg=pgid)
- logging.debug(cmd);
tmpfd = open(OTHERFILE, "a")
+ cmd = (CFSD_PREFIX + "--op list --pgid {pg} --format json").format(osd=osd, pg=pgid)
logging.debug(cmd)
ret = call(cmd, shell=True, stdout=tmpfd)
if ret != 0:
tmpfd.close()
lines = get_lines(OTHERFILE)
JSONOBJ = sorted(set(lines))
- (other_pgid, other_jsondict) = json.loads(JSONOBJ[0])[0]
+ (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0]
- if pgid != other_pgid or jsondict != other_jsondict:
+ if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
logging.error("the first line of --op list is different "
"from the first line of --op list --pgid {pg}".format(pg=pgid))
ERRORS += 1
# retrieve all objects with a given name in a given PG
+ tmpfd = open(OTHERFILE, "w")
cmd = (CFSD_PREFIX + "--op list --pgid {pg} {object} --format json").format(osd=osd, pg=pgid, object=jsondict['oid'])
- logging.debug(cmd);
- tmpfd = open(OTHERFILE, "a")
logging.debug(cmd)
ret = call(cmd, shell=True, stdout=tmpfd)
if ret != 0:
tmpfd.close()
lines = get_lines(OTHERFILE)
JSONOBJ = sorted(set(lines))
- (other_pgid, other_jsondict) in json.loads(JSONOBJ[0])[0]
+ (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0]
- if pgid != other_pgid or jsondict != other_jsondict:
+ if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
logging.error("the first line of --op list is different "
"from the first line of --op list --pgid {pg} {object}".format(pg=pgid, object=jsondict['oid']))
ERRORS += 1
for pg in ALLPGS:
OSDS = get_osds(pg, OSDDIR)
for osd in OSDS:
- cmd = (CFSD_PREFIX + "--op list --pgid {pg}").format(osd=osd, pg=pg)
tmpfd = open(TMPFILE, "a")
+ cmd = (CFSD_PREFIX + "--op list --pgid {pg}").format(osd=osd, pg=pg)
logging.debug(cmd)
ret = call(cmd, shell=True, stdout=tmpfd)
if ret != 0:
JSONOBJ = sorted(set(lines))
for JSON in JSONOBJ:
(pgid, jsondict) = json.loads(JSON)
+ # Skip clones for now
+ if jsondict['snapid'] != -2:
+ continue
db[jsondict['namespace']][jsondict['oid']]['json'] = json.dumps((pgid, jsondict))
# print db[jsondict['namespace']][jsondict['oid']]['json']
if string.find(jsondict['oid'], EC_NAME) == 0 and 'shard_id' not in jsondict:
print "Test get-bytes and set-bytes"
for nspace in db.keys():
for basename in db[nspace].keys():
- file = os.path.join(DATADIR, nspace + "-" + basename)
+ file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
JSON = db[nspace][basename]['json']
GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
TESTNAME = "/tmp/testbytes.{pid}".format(pid=pid)
if ret != 0:
logging.error("Bad exit status {ret} from set-bytes to restore object".format(ret=ret))
ERRORS += 1
+ fd.close()
try:
os.unlink(GETNAME)
except:
pass
+ # Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap
+ print "Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap"
+ for nspace in db.keys():
+ for basename in db[nspace].keys():
+ file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
+ JSON = db[nspace][basename]['json']
+ for pg in OBJREPPGS:
+ OSDS = get_osds(pg, OSDDIR)
+ for osd in OSDS:
+ DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+ fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+ and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+ if not fnames:
+ continue
+ for key, val in db[nspace][basename]["xattr"].iteritems():
+ attrkey = "_" + key
+ cmd = (CFSD_PREFIX + " '{json}' get-attr {key}").format(osd=osd, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ getval = check_output(cmd, shell=True)
+ if getval != val:
+ logging.error("get-attr of key {key} returned wrong val: {get} instead of {orig}".format(key=attrkey, get=getval, orig=val))
+ ERRORS += 1
+ continue
+ # set-attr to bogus value "foobar"
+ cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Test set-attr with dry-run
+ cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Check the set-attr
+ cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ getval = check_output(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from get-attr".format(ret=ret))
+ ERRORS += 1
+ continue
+ if getval != "foobar":
+ logging.error("Check of set-attr failed because we got {val}".format(val=getval))
+ ERRORS += 1
+ continue
+ # Test rm-attr
+ cmd = (CFSD_PREFIX + "'{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Check rm-attr with dry-run
+ cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
+ ERRORS += 1
+ continue
+ cmd = (CFSD_PREFIX + "'{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
+ if ret == 0:
+ logging.error("For rm-attr expect get-attr to fail, but it succeeded")
+ ERRORS += 1
+ # Put back value
+ cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey, val=val)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+ ERRORS += 1
+ continue
+
+ hdr = db[nspace][basename].get("omapheader", "")
+ cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, json=JSON)
+ logging.debug(cmd)
+ gethdr = check_output(cmd, shell=True)
+ if gethdr != hdr:
+ logging.error("get-omaphdr was wrong: {get} instead of {orig}".format(get=gethdr, orig=hdr))
+ ERRORS += 1
+ continue
+ # set-omaphdr to bogus value "foobar"
+ cmd = ("echo -n foobar | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Check the set-omaphdr
+ cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, pg=pg, json=JSON)
+ logging.debug(cmd)
+ gethdr = check_output(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from get-omaphdr".format(ret=ret))
+ ERRORS += 1
+ continue
+ if gethdr != "foobar":
+ logging.error("Check of set-omaphdr failed because we got {val}".format(val=getval))
+ ERRORS += 1
+ continue
+ # Test dry-run with set-omaphdr
+ cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Put back value
+ cmd = ("echo -n {val} | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON, val=hdr)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+ ERRORS += 1
+ continue
+
+ for omapkey, val in db[nspace][basename]["omap"].iteritems():
+ cmd = (CFSD_PREFIX + " '{json}' get-omap {key}").format(osd=osd, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ getval = check_output(cmd, shell=True)
+ if getval != val:
+ logging.error("get-omap of key {key} returned wrong val: {get} instead of {orig}".format(key=omapkey, get=getval, orig=val))
+ ERRORS += 1
+ continue
+ # set-omap to bogus value "foobar"
+ cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Check set-omap with dry-run
+ cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Check the set-omap
+ cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ getval = check_output(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from get-omap".format(ret=ret))
+ ERRORS += 1
+ continue
+ if getval != "foobar":
+ logging.error("Check of set-omap failed because we got {val}".format(val=getval))
+ ERRORS += 1
+ continue
+ # Test rm-omap
+ cmd = (CFSD_PREFIX + "'{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
+ ERRORS += 1
+ # Check rm-omap with dry-run
+ cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
+ ERRORS += 1
+ cmd = (CFSD_PREFIX + "'{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
+ if ret == 0:
+ logging.error("For rm-omap expect get-omap to fail, but it succeeded")
+ ERRORS += 1
+ # Put back value
+ cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey, val=val)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+ ERRORS += 1
+ continue
+
+ # Test dump
+ print "Test dump"
+ for nspace in db.keys():
+ for basename in db[nspace].keys():
+ file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
+ JSON = db[nspace][basename]['json']
+ GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
+ for pg in OBJREPPGS:
+ OSDS = get_osds(pg, OSDDIR)
+ for osd in OSDS:
+ DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+ fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+ and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+ if not fnames:
+ continue
+ cmd = (CFSD_PREFIX + " '{json}' dump | grep '\"snap\": 1,' > /dev/null").format(osd=osd, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Invalid dump for {json}".format(json=JSON))
+ ERRORS += 1
+
print "Test list-attrs get-attr"
ATTRFILE = r"/tmp/attrs.{pid}".format(pid=pid)
VALFILE = r"/tmp/val.{pid}".format(pid=pid)
logging.error("Not all keys found, remaining keys:")
print values
+ print "Test --op meta-list"
+ tmpfd = open(TMPFILE, "w")
+ cmd = (CFSD_PREFIX + "--op meta-list").format(osd=ONEOSD)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from --op meta-list request".format(ret=ret))
+ ERRORS += 1
+
+ print "Test get-bytes on meta"
+ tmpfd.close()
+ lines = get_lines(TMPFILE)
+ JSONOBJ = sorted(set(lines))
+ for JSON in JSONOBJ:
+ (pgid, jsondict) = json.loads(JSON)
+ if pgid != "meta":
+ logging.error("pgid incorrect for --op meta-list {pgid}".format(pgid=pgid))
+ ERRORS += 1
+ if jsondict['namespace'] != "":
+ logging.error("namespace non null --op meta-list {ns}".format(ns=jsondict['namespace']))
+ ERRORS += 1
+ logging.info(JSON)
+ try:
+ os.unlink(GETNAME)
+ except:
+ pass
+ cmd = (CFSD_PREFIX + "'{json}' get-bytes {fname}").format(osd=ONEOSD, json=JSON, fname=GETNAME)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret}".format(ret=ret))
+ ERRORS += 1
+
+ try:
+ os.unlink(GETNAME)
+ except:
+ pass
+ try:
+ os.unlink(TESTNAME)
+ except:
+ pass
+
print "Test pg info"
for pg in ALLREPPGS + ALLECPGS:
for osd in get_osds(pg, OSDDIR):
cmd = (CFSD_PREFIX + "--op list-pgs").format(osd=osd)
logging.debug(cmd)
TEST_PGS = check_output(cmd, shell=True).split("\n")
- TEST_PGS = sorted(TEST_PGS)[1:] # Skip extra blank line
+ TEST_PGS = sorted(TEST_PGS)[1:] # Skip extra blank line
if TEST_PGS != CHECK_PGS:
logging.error("list-pgs got wrong result for osd.{osd}".format(osd=osd))
elif pg == ALLREPPGS[1]:
cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file - > {file}").format(osd=osd, pg=pg, file=fname)
else:
- cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
logging.debug(cmd)
ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
if ret != 0:
if pg == PGS[0]:
cmd = ("cat {file} |".format(file=file) + CFSD_PREFIX + "--op import").format(osd=osd)
elif pg == PGS[1]:
- cmd = (CFSD_PREFIX + "--op import --file - < {file}").format(osd=osd, file=file)
+ cmd = (CFSD_PREFIX + "--op import --file - --pgid {pg} < {file}").format(osd=osd, file=file, pg=pg)
else:
cmd = (CFSD_PREFIX + "--op import --file {file}").format(osd=osd, file=file)
logging.debug(cmd)
if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
print "Verify replicated import data"
- for nsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(REP_NAME) == 0]:
- nspace = nsfile.split("-")[0]
- file = nsfile.split("-")[1]
- path = os.path.join(DATADIR, nsfile)
- tmpfd = open(TMPFILE, "w")
- cmd = "find {dir} -name '{file}_*_{nspace}_*'".format(dir=OSDDIR, file=file, nspace=nspace)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=tmpfd)
- if ret:
- logging.critical("INTERNAL ERROR")
- return 1
- tmpfd.close()
- obj_locs = get_lines(TMPFILE)
- if len(obj_locs) == 0:
- logging.error("Can't find imported object {name}".format(name=file))
- ERRORS += 1
- for obj_loc in obj_locs:
- cmd = "diff -q {src} {obj_loc}".format(src=path, obj_loc=obj_loc)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("{file} data not imported properly into {obj}".format(file=file, obj=obj_loc))
- ERRORS += 1
+ data_errors, _ = check_data(DATADIR, TMPFILE, OSDDIR, REP_NAME)
+ ERRORS += data_errors
else:
logging.warning("SKIPPING CHECKING IMPORT DATA DUE TO PREVIOUS FAILURES")
+ print "Test all --op dump-journal again"
+ ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0]
+ ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
+
vstart(new=False)
wait_for_health()
if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
print "Verify erasure coded import data"
- ERRORS += verify(DATADIR, EC_POOL, EC_NAME)
+ ERRORS += verify(DATADIR, EC_POOL, EC_NAME, db)
+ # Check replicated data/xattr/omap using rados
+ print "Verify replicated import data using rados"
+ ERRORS += verify(DATADIR, REP_POOL, REP_NAME, db)
if EXP_ERRORS == 0:
NEWPOOL = "import-rados-pool"
cmd = "./rados mkpool {pool}".format(pool=NEWPOOL)
logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
print "Test import-rados"
for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0]:
logging.error("Import-rados failed from {file} with {ret}".format(file=file, ret=ret))
ERRORS += 1
- ERRORS += verify(DATADIR, NEWPOOL, REP_NAME)
+ ERRORS += verify(DATADIR, NEWPOOL, REP_NAME, db)
else:
logging.warning("SKIPPING IMPORT-RADOS TESTS DUE TO PREVIOUS FAILURES")
+ # Clear directories of previous portion
+ call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
+ call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
+ os.mkdir(TESTDIR)
+ os.mkdir(DATADIR)
+
+ # Cause SPLIT_POOL to split and test import with object/log filtering
+ print "Testing import all objects after a split"
+ SPLIT_POOL = "split_pool"
+ PG_COUNT = 1
+ SPLIT_OBJ_COUNT = 5
+ SPLIT_NSPACE_COUNT = 2
+ SPLIT_NAME = "split"
+ cmd = "./ceph osd pool create {pool} {pg} {pg} replicated".format(pool=SPLIT_POOL, pg=PG_COUNT)
+ logging.debug(cmd)
+ call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ SPLITID = get_pool_id(SPLIT_POOL, nullfd)
+ pool_size = int(check_output("./ceph osd pool get {pool} size".format(pool=SPLIT_POOL), shell=True, stderr=nullfd).split(" ")[1])
+ EXP_ERRORS = 0
+ RM_ERRORS = 0
+ IMP_ERRORS = 0
+
+ objects = range(1, SPLIT_OBJ_COUNT + 1)
+ nspaces = range(SPLIT_NSPACE_COUNT)
+ for n in nspaces:
+ nspace = get_nspace(n)
+
+ for i in objects:
+ NAME = SPLIT_NAME + "{num}".format(num=i)
+ LNAME = nspace + "-" + NAME
+ DDNAME = os.path.join(DATADIR, LNAME)
+ DDNAME += "__head"
+
+ cmd = "rm -f " + DDNAME
+ logging.debug(cmd)
+ call(cmd, shell=True)
+
+ if i == 1:
+ dataline = range(DATALINECOUNT)
+ else:
+ dataline = range(1)
+ fd = open(DDNAME, "w")
+ data = "This is the split data for " + LNAME + "\n"
+ for _ in dataline:
+ fd.write(data)
+ fd.close()
+
+ cmd = "./rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=SPLIT_POOL, name=NAME, ddname=DDNAME, nspace=nspace)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd)
+ if ret != 0:
+ logging.critical("Rados put command failed with {ret}".format(ret=ret))
+ return 1
+
+ wait_for_health()
+ kill_daemons()
+
+ for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0]:
+ os.mkdir(os.path.join(TESTDIR, osd))
+
+ pg = "{pool}.0".format(pool=SPLITID)
+ EXPORT_PG = pg
+
+ export_osds = get_osds(pg, OSDDIR)
+ for osd in export_osds:
+ mydir = os.path.join(TESTDIR, osd)
+ fname = os.path.join(mydir, pg)
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ if ret != 0:
+ logging.error("Exporting failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+ EXP_ERRORS += 1
+
+ ERRORS += EXP_ERRORS
+
+ if EXP_ERRORS == 0:
+ vstart(new=False)
+ wait_for_health()
+
+ time.sleep(20)
+
+ cmd = "./ceph osd pool set {pool} pg_num 2".format(pool=SPLIT_POOL)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ time.sleep(5)
+ wait_for_health()
+
+ time.sleep(15)
+
+ kill_daemons()
+
+ # Now 2 PGs, poolid.0 and poolid.1
+ for seed in range(2):
+ pg = "{pool}.{seed}".format(pool=SPLITID, seed=seed)
+
+ which = 0
+ for osd in get_osds(pg, OSDDIR):
+ cmd = (CFSD_PREFIX + "--op remove --pgid {pg}").format(pg=pg, osd=osd)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+
+ # The export files are based only on the EXPORT_PG
+ # and where that pg was before the split. Use 'which' to cycle through all
+ # export copies during import.
+ mydir = os.path.join(TESTDIR, export_osds[which])
+ fname = os.path.join(mydir, EXPORT_PG)
+ which += 1
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
+ IMP_ERRORS += 1
+
+ ERRORS += IMP_ERRORS
+
+ # Start up again to make sure imports didn't corrupt anything
+ if IMP_ERRORS == 0:
+ print "Verify split import data"
+ data_errors, count = check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME)
+ ERRORS += data_errors
+ if count != (SPLIT_OBJ_COUNT * SPLIT_NSPACE_COUNT * pool_size):
+ logging.error("Incorrect number of replicas seen {count}".format(count=count))
+ ERRORS += 1
+ vstart(new=False)
+ wait_for_health()
+
call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
+ # vstart() starts 4 OSDs
+ ERRORS += test_get_set_osdmap(CFSD_PREFIX, range(4), ALLOSDS)
+ ERRORS += test_get_set_inc_osdmap(CFSD_PREFIX, ALLOSDS[0])
if ERRORS == 0:
print "TEST PASSED"
return 0
status = main(sys.argv[1:])
finally:
kill_daemons()
- call("/bin/rm -fr ceph_objectstore_tool_dir", shell=True)
+ call("/bin/rm -fr {dir}".format(dir=CEPH_DIR), shell=True)
sys.exit(status)
--caps=<caps> list of caps (e.g., "usage=read, write; user=read")
--yes-i-really-mean-it required for certain operations
--reset-regions reset the regionmap during a regionmap update
+ --bypass-gc when specified with bucket deletion, removes
+ objects directly without involving the GC
+ --inconsistent-index when specified with bucket deletion and bypass-gc set to true,
+ ignores bucket index consistency
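+ Illustrative example (hypothetical bucket name):
+ radosgw-admin bucket rm --bucket=mybucket --purge-objects --bypass-gc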
+
<date> := "YYYY-MM-DD[ hh:mm:ss]"
Quota options:
TYPED_TEST(BitVectorTest, get_buffer_extents) {
typename TestFixture::bit_vector_t bit_vector;
- uint64_t element_count = 2 * CEPH_PAGE_SIZE + 51;
+ uint64_t element_count = 2 * bit_vector.BLOCK_SIZE + 51;
uint64_t elements_per_byte = 8 / bit_vector.BIT_COUNT;
bit_vector.resize(element_count * elements_per_byte);
- uint64_t offset = (CEPH_PAGE_SIZE + 11) * elements_per_byte;
- uint64_t length = (CEPH_PAGE_SIZE + 31) * elements_per_byte;
+ uint64_t offset = (bit_vector.BLOCK_SIZE + 11) * elements_per_byte;
+ uint64_t length = (bit_vector.BLOCK_SIZE + 31) * elements_per_byte;
uint64_t byte_offset;
uint64_t byte_length;
bit_vector.get_data_extents(offset, length, &byte_offset, &byte_length);
- ASSERT_EQ(CEPH_PAGE_SIZE, byte_offset);
- ASSERT_EQ(CEPH_PAGE_SIZE + (element_count % CEPH_PAGE_SIZE), byte_length);
+ ASSERT_EQ(bit_vector.BLOCK_SIZE, byte_offset);
+ ASSERT_EQ(bit_vector.BLOCK_SIZE + (element_count % bit_vector.BLOCK_SIZE),
+ byte_length);
bit_vector.get_data_extents(1, 1, &byte_offset, &byte_length);
ASSERT_EQ(0U, byte_offset);
- ASSERT_EQ(CEPH_PAGE_SIZE, byte_length);
+ ASSERT_EQ(bit_vector.BLOCK_SIZE, byte_length);
}
TYPED_TEST(BitVectorTest, get_header_length) {
Extents extents = boost::assign::list_of(
std::make_pair(0, 1))(
- std::make_pair((CEPH_PAGE_SIZE * elements_per_byte) - 2, 4))(
- std::make_pair((CEPH_PAGE_SIZE * elements_per_byte) + 2, 2))(
- std::make_pair((2 * CEPH_PAGE_SIZE * elements_per_byte) - 2, 4))(
- std::make_pair((2 * CEPH_PAGE_SIZE * elements_per_byte) + 2, 2))(
- std::make_pair(2, 2 * CEPH_PAGE_SIZE));
+ std::make_pair((bit_vector.BLOCK_SIZE * elements_per_byte) - 2, 4))(
+ std::make_pair((bit_vector.BLOCK_SIZE * elements_per_byte) + 2, 2))(
+ std::make_pair((2 * bit_vector.BLOCK_SIZE * elements_per_byte) - 2, 4))(
+ std::make_pair((2 * bit_vector.BLOCK_SIZE * elements_per_byte) + 2, 2))(
+ std::make_pair(2, 2 * bit_vector.BLOCK_SIZE));
for (Extents::iterator it = extents.begin(); it != extents.end(); ++it) {
uint64_t element_offset = it->first;
uint64_t element_length = it->second;
typename TestFixture::bit_vector_t bit_vector2;
uint64_t elements_per_byte = 8 / bit_vector1.BIT_COUNT;
- bit_vector1.resize((CEPH_PAGE_SIZE + 1) * elements_per_byte);
- bit_vector2.resize((CEPH_PAGE_SIZE + 1) * elements_per_byte);
+ bit_vector1.resize((bit_vector1.BLOCK_SIZE + 1) * elements_per_byte);
+ bit_vector2.resize((bit_vector2.BLOCK_SIZE + 1) * elements_per_byte);
uint64_t byte_offset;
uint64_t byte_length;
bit_vector1.encode_data(data, byte_offset, byte_length);
bufferlist::iterator data_it = data.begin();
- bit_vector1.decode_data(data_it, byte_offset);
+ bit_vector1.decode_data(data_it, byte_offset);
bit_vector2[bit_vector2.size() - 1] = 1;
#include "include/types.h"
#include "auth/Crypto.h"
+#include "common/Clock.h"
#include "common/ceph_crypto.h"
+#include "common/ceph_context.h"
+#include "global/global_context.h"
#include "test/unit.h"
bufferlist cipher;
std::string error;
- h->encrypt(secret, plaintext, cipher, error);
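+ // encryption now goes through a per-key CryptoKeyHandler obtained from the CryptoHandler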
+ CryptoKeyHandler *kh = h->get_key_handler(secret, error);
+ int r = kh->encrypt(plaintext, cipher, &error);
+ ASSERT_EQ(r, 0);
ASSERT_EQ(error, "");
unsigned char want_cipher[] = {
std::string error;
bufferlist plaintext;
- h->decrypt(secret, cipher, plaintext, error);
+ CryptoKeyHandler *kh = h->get_key_handler(secret, error);
+ int r = kh->decrypt(cipher, plaintext, &error);
+ ASSERT_EQ(r, 0);
ASSERT_EQ(error, "");
ASSERT_EQ(sizeof(plaintext_s), plaintext.length());
CryptoHandler *h = g_ceph_context->get_crypto_handler(CEPH_CRYPTO_AES);
std::string error;
- h->encrypt(secret, plaintext, cipher, error);
+ CryptoKeyHandler *kh = h->get_key_handler(secret, error);
+ int r = kh->encrypt(plaintext, cipher, &error);
+ ASSERT_EQ(r, 0);
ASSERT_EQ(error, "");
}
plaintext.clear();
{
CryptoHandler *h = g_ceph_context->get_crypto_handler(CEPH_CRYPTO_AES);
std::string error;
- h->decrypt(secret, cipher, plaintext, error);
+ CryptoKeyHandler *ckh = h->get_key_handler(secret, error);
+ int r = ckh->decrypt(cipher, plaintext, &error);
+ ASSERT_EQ(r, 0);
ASSERT_EQ(error, "");
}
}
err = memcmp(plaintext_s, orig_plaintext_s, sizeof(orig_plaintext_s));
ASSERT_EQ(0, err);
}
+
+TEST(AES, LoopKey) {
+ bufferptr k(16);
+ get_random_bytes(k.c_str(), k.length());
+ CryptoKey key(CEPH_CRYPTO_AES, ceph_clock_now(NULL), k);
+
+ bufferlist data;
+ bufferptr r(128);
+ get_random_bytes(r.c_str(), r.length());
+ data.append(r);
+
+ utime_t start = ceph_clock_now(NULL);
+ int n = 100000;
+
+ for (int i=0; i<n; ++i) {
+ bufferlist encoded;
+ string error;
+ int r = key.encrypt(g_ceph_context, data, encoded, &error);
+ ASSERT_EQ(r, 0);
+ }
+
+ utime_t end = ceph_clock_now(NULL);
+ utime_t dur = end - start;
+ cout << n << " encoded in " << dur << std::endl;
+}
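
For reference, the hunks above replace the old one-shot CryptoHandler::encrypt/decrypt calls with a two-step pattern: fetch a CryptoKeyHandler once, then reuse it for many operations, which is what the LoopKey benchmark exercises. A minimal sketch of that pattern, assuming the handles and buffers declared in the surrounding tests:

    CryptoHandler *h = g_ceph_context->get_crypto_handler(CEPH_CRYPTO_AES);
    std::string error;
    CryptoKeyHandler *kh = h->get_key_handler(secret, error); // key schedule built once
    bufferlist cipher, decoded;
    int r = kh->encrypt(plaintext, cipher, &error);           // returns 0 on success
    if (r == 0)
      r = kh->decrypt(cipher, decoded, &error);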
}
TEST(DaemonConfig, InvalidIntegers) {
- {
- int ret = g_ceph_context->_conf->set_val("num_client", "-1");
- ASSERT_EQ(ret, -EINVAL);
- }
- {
- int ret = g_ceph_context->_conf->set_val("num_client", "-1K");
- ASSERT_EQ(ret, -EINVAL);
- }
{
long long bad_value = (long long)std::numeric_limits<int>::max() + 1;
string str = boost::lexical_cast<string>(bad_value);
rados_buffer_free(buf);
rados_buffer_free(st);
+ cmd[0] = (char *)"";
+ ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "{}", 2, &buf, &buflen, &st, &stlen));
+ rados_buffer_free(buf);
+ rados_buffer_free(st);
+
+ cmd[0] = (char *)"{}";
+ ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+ rados_buffer_free(buf);
+ rados_buffer_free(st);
+
+ cmd[0] = (char *)"{\"abc\":\"something\"}";
+ ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+ rados_buffer_free(buf);
+ rados_buffer_free(st);
+
+ cmd[0] = (char *)"{\"prefix\":\"\"}";
+ ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+ rados_buffer_free(buf);
+ rados_buffer_free(st);
+
+ cmd[0] = (char *)"{\"prefix\":\" \"}";
+ ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+ rados_buffer_free(buf);
+ rados_buffer_free(st);
+
+ cmd[0] = (char *)"{\"prefix\":\";;;,,,;;,,\"}";
+ ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+ rados_buffer_free(buf);
+ rados_buffer_free(st);
+
+ cmd[0] = (char *)"{\"prefix\":\"extra command\"}";
+ ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+ rados_buffer_free(buf);
+ rados_buffer_free(st);
+
cmd[0] = (char *)"{\"prefix\":\"mon_status\"}";
ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
ASSERT_LT(0u, buflen);
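
The new negative cases above all expect -EINVAL for empty or malformed command prefixes. For contrast, a hedged sketch of a well-formed call, assuming an established rados_t cluster handle:

    const char *cmd[] = { "{\"prefix\":\"mon_status\"}", NULL };
    char *outbuf = NULL, *outs = NULL;
    size_t outbuf_len = 0, outs_len = 0;
    int r = rados_mon_command(cluster, cmd, 1, "", 0,
                              &outbuf, &outbuf_len, &outs, &outs_len);
    // r is 0 here; the malformed variants exercised above return -EINVAL
    rados_buffer_free(outbuf);
    rados_buffer_free(outs);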
return "";
}
-int destroy_ec_profile(rados_t *cluster)
+int destroy_ec_profile(rados_t *cluster, std::ostream &oss)
{
- char *cmd[2];
- cmd[0] = (char *)"{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}";
- cmd[1] = NULL;
- return rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
+ char *cmd[2];
+ cmd[0] = (char *)"{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}";
+ cmd[1] = NULL;
+ int ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
+ if (ret)
+ oss << "rados_mon_command: erasure-code-profile rm testprofile failed with error " << ret;
+ return ret;
+}
+
+int destroy_ruleset(rados_t *cluster,
+ std::string ruleset,
+ std::ostream &oss)
+{
+ char *cmd[2];
+ std::string tmp = ("{\"prefix\": \"osd crush rule rm\", \"name\":\"" +
+ ruleset + "\"}");
+ cmd[0] = (char*)tmp.c_str();
+ cmd[1] = NULL;
+ int ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
+ if (ret)
+ oss << "rados_mon_command: osd crush rule rm " + ruleset + " failed with error " << ret;
+ return ret;
+}
+
+int destroy_ec_profile_and_ruleset(rados_t *cluster,
+ std::string ruleset,
+ std::ostream &oss)
+{
+ int ret;
+ ret = destroy_ec_profile(cluster, oss);
+ if (ret)
+ return ret;
+ return destroy_ruleset(cluster, ruleset, oss);
}
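
A hedged usage sketch for the combined helper above; the pool name is hypothetical and the caller is assumed to hold a connected rados_t:

    std::ostringstream oss;
    int ret = destroy_ec_profile_and_ruleset(&cluster, "testpool", oss);
    if (ret)
      std::cerr << oss.str() << std::endl;  // failure text collected by the helpers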
std::string create_one_ec_pool(const std::string &pool_name, rados_t *cluster)
if (err.length())
return err;
- int ret = destroy_ec_profile(cluster);
+ std::ostringstream oss;
+ int ret = destroy_ec_profile_and_ruleset(cluster, pool_name, oss);
if (ret) {
rados_shutdown(*cluster);
- std::ostringstream oss;
- oss << "rados_mon_command erasure-code-profile rm testprofile failed with error " << ret;
return oss.str();
}
cmd[0] = (char *)profile_create.c_str();
ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
if (ret) {
- std::ostringstream oss;
-
rados_shutdown(*cluster);
oss << "rados_mon_command erasure-code-profile set name:testprofile failed with error " << ret;
return oss.str();
cmd[0] = (char *)cmdstr.c_str();
ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
if (ret) {
- std::ostringstream oss;
-
- int ret2 = destroy_ec_profile(cluster);
- if (ret2)
- oss << "rados_mon_command osd erasure-code-profile rm name:testprofile failed with error " << ret2 << std::endl;
-
+ destroy_ec_profile(cluster, oss);
rados_shutdown(*cluster);
oss << "rados_mon_command osd pool create failed with error " << ret;
return oss.str();
return "";
}
-int destroy_ec_profile_pp(Rados &cluster)
+int destroy_ruleset_pp(Rados &cluster,
+ std::string ruleset,
+ std::ostream &oss)
+{
+ bufferlist inbl;
+ int ret = cluster.mon_command("{\"prefix\": \"osd crush rule rm\", \"name\":\"" +
+ ruleset + "\"}", inbl, NULL, NULL);
+ if (ret)
+ oss << "mon_command: osd crush rule rm " + ruleset + " failed with error " << ret << std::endl;
+ return ret;
+}
+
+int destroy_ec_profile_pp(Rados &cluster, std::ostream &oss)
{
bufferlist inbl;
- return cluster.mon_command("{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}",
- inbl, NULL, NULL);
+ int ret = cluster.mon_command("{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}",
+ inbl, NULL, NULL);
+ if (ret)
+ oss << "mon_command: osd erasure-code-profile rm testprofile failed with error " << ret << std::endl;
+ return ret;
+}
+
+int destroy_ec_profile_and_ruleset_pp(Rados &cluster,
+ std::string ruleset,
+ std::ostream &oss)
+{
+ int ret;
+ ret = destroy_ec_profile_pp(cluster, oss);
+ if (ret)
+ return ret;
+ return destroy_ruleset_pp(cluster, ruleset, oss);
}
std::string create_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
if (err.length())
return err;
- int ret = destroy_ec_profile_pp(cluster);
+ std::ostringstream oss;
+ int ret = destroy_ec_profile_and_ruleset_pp(cluster, pool_name, oss);
if (ret) {
cluster.shutdown();
- std::ostringstream oss;
- oss << "rados_mon_command erasure-code-profile rm testprofile failed with error " << ret;
return oss.str();
}
inbl, NULL, NULL);
if (ret) {
cluster.shutdown();
- std::ostringstream oss;
oss << "mon_command erasure-code-profile set name:testprofile failed with error " << ret;
return oss.str();
}
"{\"prefix\": \"osd pool create\", \"pool\": \"" + pool_name + "\", \"pool_type\":\"erasure\", \"pg_num\":8, \"pgp_num\":8, \"erasure_code_profile\":\"testprofile\"}",
inbl, NULL, NULL);
if (ret) {
- std::ostringstream oss;
bufferlist inbl;
- int ret2 = destroy_ec_profile_pp(cluster);
- if (ret2)
- oss << "mon_command osd erasure-code-profile rm name:testprofile failed with error " << ret2 << std::endl;
-
+ destroy_ec_profile_pp(cluster, oss);
cluster.shutdown();
oss << "mon_command osd pool create pool:" << pool_name << " pool_type:erasure failed with error " << ret;
return oss.str();
int destroy_one_ec_pool(const std::string &pool_name, rados_t *cluster)
{
int ret = rados_pool_delete(*cluster, pool_name.c_str());
- if (ret == 0) {
- int ret2 = destroy_ec_profile(cluster);
- if (ret2) {
- rados_shutdown(*cluster);
- return ret2;
- }
- rados_wait_for_latest_osdmap(*cluster);
+ if (ret) {
+ rados_shutdown(*cluster);
+ return ret;
+ }
+
+ std::ostringstream oss;
+ ret = destroy_ec_profile_and_ruleset(cluster, pool_name, oss);
+ if (ret) {
+ rados_shutdown(*cluster);
+ return ret;
}
+
+ rados_wait_for_latest_osdmap(*cluster);
rados_shutdown(*cluster);
return ret;
}
int destroy_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
{
int ret = cluster.pool_delete(pool_name.c_str());
- bufferlist inbl;
- if (ret == 0) {
- int ret2 = destroy_ec_profile_pp(cluster);
- if (ret2) {
- cluster.shutdown();
- return ret2;
- }
- cluster.wait_for_latest_osdmap();
+ if (ret) {
+ cluster.shutdown();
+ return ret;
}
+
+ std::ostringstream oss;
+ ret = destroy_ec_profile_and_ruleset_pp(cluster, pool_name, oss);
+ if (ret) {
+ cluster.shutdown();
+ return ret;
+ }
+
+ cluster.wait_for_latest_osdmap();
cluster.shutdown();
return ret;
}
{
public:
TestAlarm() {
- alarm(360);
+ alarm(1200);
}
~TestAlarm() {
alarm(0);
rados_ioctx_destroy(ioctx);
}
+TEST_F(TestLibRBD, Flatten)
+{
+ REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+ librados::IoCtx ioctx;
+ ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+ librbd::RBD rbd;
+ std::string parent_name = get_temp_image_name();
+ uint64_t size = 2 << 20;
+ int order = 0;
+ ASSERT_EQ(0, create_image_pp(rbd, ioctx, parent_name.c_str(), size, &order));
+
+ librbd::Image parent_image;
+ ASSERT_EQ(0, rbd.open(ioctx, parent_image, parent_name.c_str(), NULL));
+
+ bufferlist bl;
+ bl.append(std::string(4096, '1'));
+ ASSERT_EQ(bl.length(), parent_image.write(0, bl.length(), bl));
+
+ ASSERT_EQ(0, parent_image.snap_create("snap1"));
+ ASSERT_EQ(0, parent_image.snap_protect("snap1"));
+
+ uint64_t features;
+ ASSERT_EQ(0, parent_image.features(&features));
+
+ std::string clone_name = get_temp_image_name();
+ EXPECT_EQ(0, rbd.clone(ioctx, parent_name.c_str(), "snap1", ioctx,
+ clone_name.c_str(), features, &order));
+
+ librbd::Image clone_image;
+ ASSERT_EQ(0, rbd.open(ioctx, clone_image, clone_name.c_str(), NULL));
+ ASSERT_EQ(0, clone_image.flatten());
+
+ librbd::RBD::AioCompletion *read_comp =
+ new librbd::RBD::AioCompletion(NULL, NULL);
+ bufferlist read_bl;
+ clone_image.aio_read(0, bl.length(), read_bl, read_comp);
+ ASSERT_EQ(0, read_comp->wait_for_complete());
+ ASSERT_EQ(bl.length(), read_comp->get_return_value());
+ read_comp->release();
+ ASSERT_TRUE(bl.contents_equal(read_bl));
+
+ ASSERT_PASSED(validate_object_map, clone_image);
+}
+
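One consequence the Flatten test does not exercise: after flatten() the clone owns a full copy of the data, so the parent snapshot should be removable. A hedged follow-up sketch using the same handles as the test above:

    // assumes parent_image is still open, as in the test above
    ASSERT_EQ(0, parent_image.snap_unprotect("snap1"));
    ASSERT_EQ(0, parent_image.snap_remove("snap1"));
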
TEST_F(TestLibRBD, SnapCreateViaLockOwner)
{
REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_EXCLUSIVE_LOCK);
ASSERT_EQ(0, read_comp->wait_for_complete());
read_comp->release();
}
+
+TEST_F(TestLibRBD, FlushCacheWithCopyupOnExternalSnapshot) {
+ REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+ librados::IoCtx ioctx;
+ ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+ librbd::RBD rbd;
+ librbd::Image image;
+ std::string name = get_temp_image_name();
+
+ uint64_t size = 1 << 18;
+ int order = 0;
+
+ ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+ ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+ bufferlist bl;
+ bl.append(std::string(size, '1'));
+ ASSERT_EQ((int)size, image.write(0, size, bl));
+ ASSERT_EQ(0, image.snap_create("one"));
+ ASSERT_EQ(0, image.snap_protect("one"));
+
+ std::string clone_name = this->get_temp_image_name();
+ ASSERT_EQ(0, rbd.clone(ioctx, name.c_str(), "one", ioctx, clone_name.c_str(),
+ RBD_FEATURE_LAYERING, &order));
+ ASSERT_EQ(0, rbd.open(ioctx, image, clone_name.c_str(), NULL));
+
+ librbd::Image image2;
+ ASSERT_EQ(0, rbd.open(ioctx, image2, clone_name.c_str(), NULL));
+
+ // prepare CoW writeback that will be flushed on next op
+ bl.clear();
+ bl.append(std::string(1, '1'));
+ ASSERT_EQ(0, image.flush());
+ ASSERT_EQ(1, image.write(0, 1, bl));
+ ASSERT_EQ(0, image2.snap_create("snap1"));
+
+ librbd::RBD::AioCompletion *read_comp =
+ new librbd::RBD::AioCompletion(NULL, NULL);
+ bufferlist read_bl;
+ image.aio_read(0, 1024, read_bl, read_comp);
+ ASSERT_EQ(0, read_comp->wait_for_complete());
+ read_comp->release();
+}
function run() {
local dir=$1
+ shift
export CEPH_MON="127.0.0.1:7102"
export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
CEPH_ARGS+="--mon-host=$CEPH_MON "
- setup $dir || return 1
- run_mon $dir a --public-addr $CEPH_MON
- FUNCTIONS=${FUNCTIONS:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for TEST_function in $FUNCTIONS ; do
- if ! $TEST_function $dir ; then
- cat $dir/a/log
- return 1
- fi
+ local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+ for func in $funcs ; do
+ $func $dir || return 1
done
- teardown $dir || return 1
}
TEST_POOL=rbd
function TEST_osd_pool_get_set() {
- local dir=$1 flag
+ local dir=$1
+
+ setup $dir || return 1
+ run_mon $dir a || return 1
+
+ local flag
for flag in hashpspool nodelete nopgchange nosizechange; do
if [ $flag = hashpspool ]; then
./ceph osd dump | grep 'pool 0' | grep $flag || return 1
! ./ceph osd pool set $ecpool min_size $(expr $k - 1) || return 1
! ./ceph osd pool set $ecpool min_size $(expr $size + 1) || return 1
+ teardown $dir || return 1
+}
+
+function TEST_no_segfault_for_bad_keyring() {
+ local dir=$1
+ setup $dir || return 1
+ # create a client.admin key and add it to ceph.mon.keyring
+ ceph-authtool --create-keyring $dir/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *'
+ ceph-authtool --create-keyring $dir/ceph.client.admin.keyring --gen-key -n client.admin --cap mon 'allow *'
+ ceph-authtool $dir/ceph.mon.keyring --import-keyring $dir/ceph.client.admin.keyring
+ CEPH_ARGS_TMP="--fsid=$(uuidgen) --mon-host=127.0.0.1:7102 --auth-supported=cephx "
+ CEPH_ARGS_orig=$CEPH_ARGS
+ CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/ceph.mon.keyring "
+ run_mon $dir a
+ # create a bad keyring and make sure no segfault occurs when using the bad keyring
+ echo -e "[client.admin]\nkey = BQAUlgtWoFePIxAAQ9YLzJSVgJX5V1lh5gyctg==" > $dir/bad.keyring
+ CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/bad.keyring"
+ ceph osd dump 2> /dev/null
+  # exit status 139 (128 + signal 11, SIGSEGV) means the command segfaulted and dumped core
+ [ $? -eq 139 ] && return 1
+ CEPH_ARGS=$CEPH_ARGS_orig
+ teardown $dir || return 1
}
-main misc
+main misc "$@"
# Local Variables:
# compile-command: "cd ../.. ; make -j4 && test/mon/misc.sh"
global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
common_init_finish(g_ceph_context);
g_ceph_context->_conf->apply_changes(NULL);
+ Cycles::init();
cerr << "args: " << args << std::endl;
if (args.size() < 1) {
}
}
+ghobject_t generate_long_name(unsigned i)
+{
+ stringstream name;
+ name << "object id " << i << " ";
+ for (unsigned j = 0; j < 500; ++j) name << 'a';
+ ghobject_t hoid(hobject_t(sobject_t(name.str(), CEPH_NOSNAP)));
+ hoid.hobj.set_hash(i % 2);
+ return hoid;
+}
+
+TEST_P(StoreTest, LongnameSplitTest) {
+ ObjectStore::Sequencer osr("test");
+ int r;
+ coll_t cid;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid);
+ cerr << "Creating collection " << cid << std::endl;
+ r = store->apply_transaction(&osr, t);
+ ASSERT_EQ(r, 0);
+ }
+ for (unsigned i = 0; i < 320; ++i) {
+ ObjectStore::Transaction t;
+ ghobject_t hoid = generate_long_name(i);
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+ }
+
+ ghobject_t test_obj = generate_long_name(319);
+ ghobject_t test_obj_2 = test_obj;
+ test_obj_2.generation = 0;
+ test_obj_2.shard_id = shard_id_t(0);
+ {
+ ObjectStore::Transaction t;
+ // should cause a split
+ t.collection_move_rename(
+ cid, test_obj,
+ cid, test_obj_2);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+ }
+
+ for (unsigned i = 0; i < 319; ++i) {
+ ObjectStore::Transaction t;
+ ghobject_t hoid = generate_long_name(i);
+ t.remove(cid, hoid);
+ cerr << "Removing object " << hoid << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, test_obj_2);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = store->apply_transaction(&osr, t);
+ ASSERT_EQ(r, 0);
+ }
+}
+
TEST_P(StoreTest, ManyObjectTest) {
int NUM_OBJS = 2000;
int r = 0;
name = "DIR_" + name;
}
+ if (seq % 2) {
+ for (unsigned i = 0; i < 300; ++i) {
+ name.push_back('a');
+ }
+ }
+
// hash
//boost::binomial_distribution<uint32_t> bin(0xFFFFFF, 0.5);
++seq;
}
};
+ class C_SyntheticOnStash : public Context {
+ public:
+ SyntheticWorkloadState *state;
+ ObjectStore::Transaction *t;
+ ghobject_t oid, noid;
+
+ C_SyntheticOnStash(SyntheticWorkloadState *state,
+ ObjectStore::Transaction *t, ghobject_t oid,
+ ghobject_t noid)
+ : state(state), t(t), oid(oid), noid(noid) {}
+
+ void finish(int r) {
+ Mutex::Locker locker(state->lock);
+ ASSERT_TRUE(state->in_flight_objects.count(oid));
+ ASSERT_EQ(r, 0);
+ state->in_flight_objects.erase(oid);
+ if (state->contents.count(noid))
+ state->available_objects.insert(noid);
+ --(state->in_flight);
+ bufferlist r2;
+ r = state->store->read(
+ state->cid, noid, 0,
+ state->contents[noid].data.length(), r2);
+ if (!state->contents[noid].data.contents_equal(r2)) {
+ assert(0 == " mismatch after clone");
+ ASSERT_TRUE(state->contents[noid].data.contents_equal(r2));
+ }
+ state->cond.Signal();
+ delete t;
+ }
+ };
+
class C_SyntheticOnClone : public Context {
public:
SyntheticWorkloadState *state;
return store->queue_transaction(osr, t, new C_SyntheticOnReadable(this, t, new_obj));
}
+ int stash() {
+ Mutex::Locker locker(lock);
+ if (!can_unlink())
+ return -ENOENT;
+ if (!can_create())
+ return -ENOSPC;
+ wait_for_ready();
+
+ ghobject_t old_obj;
+ int max = 20;
+ do {
+ old_obj = get_uniform_random_object();
+ } while (--max && !contents[old_obj].data.length());
+ available_objects.erase(old_obj);
+ ghobject_t new_obj = old_obj;
+ new_obj.generation++;
+ new_obj.shard_id = shard_id_t(0);
+ available_objects.erase(new_obj);
+
+ ObjectStore::Transaction *t = new ObjectStore::Transaction;
+ t->collection_move_rename(cid, old_obj, cid, new_obj);
+ ++in_flight;
+ in_flight_objects.insert(old_obj);
+
+ // *copy* the data buffer, since we may modify it later.
+ contents[new_obj].attrs = contents[old_obj].attrs;
+ contents[new_obj].data.clear();
+ contents[new_obj].data.append(contents[old_obj].data.c_str(),
+ contents[old_obj].data.length());
+ contents.erase(old_obj);
+ int status = store->queue_transaction(
+ osr, t,
+ new C_SyntheticOnStash(this, t, old_obj, new_obj));
+ return status;
+ }
+
int clone() {
Mutex::Locker locker(lock);
if (!can_unlink())
test_obj.write();
} else if (val > 50) {
test_obj.clone();
+ } else if (val > 30) {
+ test_obj.stash();
} else if (val > 10) {
test_obj.read();
} else {
test_obj.setattrs();
} else if (val > 45) {
test_obj.clone();
+ } else if (val > 37) {
+ test_obj.stash();
} else if (val > 30) {
test_obj.getattrs();
} else {
EXPECT_TRUE(dirty_info);
EXPECT_TRUE(dirty_big_info);
}
+
+ // Test for 13965
+ {
+ clear();
+
+ ObjectStore::Transaction t;
+ list<hobject_t> remove_snap;
+ pg_info_t info;
+ info.log_tail = log.tail = eversion_t(1, 5);
+ info.last_update = eversion_t(1, 6);
+ bool dirty_info = false;
+ bool dirty_big_info = false;
+
+ {
+ pg_log_entry_t e;
+ e.mod_desc.mark_unrollbackable();
+ e.version = eversion_t(1, 5);
+ e.soid.set_hash(0x9);
+ add(e);
+ }
+ {
+ pg_log_entry_t e;
+ e.mod_desc.mark_unrollbackable();
+ e.version = eversion_t(1, 6);
+ e.soid.set_hash(0x10);
+ add(e);
+ }
+ TestHandler h(remove_snap);
+ trim_rollback_info(eversion_t(1, 6), &h);
+ rewind_divergent_log(t, eversion_t(1, 5), info, &h,
+ dirty_info, dirty_big_info);
+ pg_log_t log;
+ claim_log_and_clear_rollback_info(log, &h);
+ }
}
TEST_F(PGLogTest, merge_old_entry) {
if (m_op <= m_objects) {
stringstream oid;
oid << m_op;
+ if (m_op % 2) {
+ // make it a long name
+ oid << " ";
+ for (unsigned i = 0; i < 300; ++i) {
+ oid << i;
+ }
+ }
cout << m_op << ": write initial oid " << oid.str() << std::endl;
context.oid_not_flushing.insert(oid.str());
if (m_ec_pool) {
teardown $dir || return 1
}
+function TEST_unfound_erasure_coded() {
+ local dir=$1
+ local poolname=ecpool
+ local payload=ABCDEF
+
+ setup $dir || return 1
+ run_mon $dir a || return 1
+ run_osd $dir 0 || return 1
+ run_osd $dir 1 || return 1
+ run_osd $dir 2 || return 1
+ run_osd $dir 3 || return 1
+ wait_for_clean || return 1
+
+ ceph osd erasure-code-profile set myprofile \
+ k=2 m=2 ruleset-failure-domain=osd || return 1
+ ceph osd pool create $poolname 1 1 erasure myprofile \
+ || return 1
+
+ add_something $dir $poolname
+
+ local primary=$(get_primary $poolname SOMETHING)
+ local -a osds=($(get_osds $poolname SOMETHING | sed -e "s/$primary//"))
+ local not_primary_first=${osds[0]}
+ local not_primary_second=${osds[1]}
+ local not_primary_third=${osds[2]}
+
+ #
+ # 1) remove the corresponding file from the OSDs
+ #
+ objectstore_tool $dir $not_primary_first SOMETHING remove || return 1
+ objectstore_tool $dir $not_primary_second SOMETHING remove || return 1
+ objectstore_tool $dir $not_primary_third SOMETHING remove || return 1
+ #
+ # 2) repair the PG
+ #
+ local pg=$(get_pg $poolname SOMETHING)
+ repair $pg
+ #
+ # 3) check pg state
+ #
+ ceph -s|grep "4 osds: 4 up, 4 in" || return 1
+ ceph -s|grep "1/1 unfound" || return 1
+
+ teardown $dir || return 1
+}
+
function corrupt_and_repair_two() {
local dir=$1
local poolname=$2
--- /dev/null
+#! /bin/bash
+#
+# Copyright (C) 2015 Red Hat <contact@redhat.com>
+#
+# Author: David Zafman <dzafman@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source test/ceph-helpers.sh
+
+function run() {
+ local dir=$1
+ shift
+
+ export CEPH_MON="127.0.0.1:7121" # git grep '\<7121\>' : there must be only one
+ export CEPH_ARGS
+ CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+ CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+ local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+ for func in $funcs ; do
+ $func $dir || return 1
+ done
+}
+
+function TEST_scrub_snaps() {
+ local dir=$1
+ local poolname=test
+
+ TESTDATA="testdata.$$"
+
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=1 || return 1
+ run_osd $dir 0 || return 1
+
+ wait_for_clean || return 1
+
+ # Create a pool with a single pg
+ ceph osd pool create $poolname 1 1
+ poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
+
+ dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
+ for i in `seq 1 14`
+ do
+ rados -p $poolname put obj${i} $TESTDATA
+ done
+
+ SNAP=1
+ rados -p $poolname mksnap snap${SNAP}
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+ rados -p $poolname put obj1 $TESTDATA
+ rados -p $poolname put obj5 $TESTDATA
+ rados -p $poolname put obj3 $TESTDATA
+ for i in `seq 6 14`
+ do rados -p $poolname put obj${i} $TESTDATA
+ done
+
+ SNAP=2
+ rados -p $poolname mksnap snap${SNAP}
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+ rados -p $poolname put obj5 $TESTDATA
+
+ SNAP=3
+ rados -p $poolname mksnap snap${SNAP}
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+ rados -p $poolname put obj3 $TESTDATA
+
+ SNAP=4
+ rados -p $poolname mksnap snap${SNAP}
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+ rados -p $poolname put obj5 $TESTDATA
+ rados -p $poolname put obj2 $TESTDATA
+
+ SNAP=5
+ rados -p $poolname mksnap snap${SNAP}
+ SNAP=6
+ rados -p $poolname mksnap snap${SNAP}
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+ rados -p $poolname put obj5 $TESTDATA
+
+ SNAP=7
+ rados -p $poolname mksnap snap${SNAP}
+
+ rados -p $poolname rm obj4
+ rados -p $poolname rm obj2
+
+ kill_daemons $dir KILL osd || return 1
+ sleep 5
+
+  # No need for the ceph_objectstore_tool helper function because the osd is already stopped
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj1 | grep \"snapid\":-2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":1)"
+ OBJ5SAVE="$JSON"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":4)"
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=18
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj3 | grep \"snapid\":-2)"
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=15
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj4 | grep \"snapid\":7)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj2 | grep \"snapid\":-1)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" rm-attr snapset
+
+ # Create a clone which isn't in snapset and doesn't have object info
+ JSON="$(echo "$OBJ5SAVE" | sed s/snapid\":1/snapid\":7/)"
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=7
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+
+ rm -f $TESTDATA
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj6 | grep \"snapid\":-2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj7 | grep \"snapid\":-2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset corrupt
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj8 | grep \"snapid\":-2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset seq
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj9 | grep \"snapid\":-2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_size
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj10 | grep \"snapid\":-2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_overlap
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj11 | grep \"snapid\":-2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clones
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj12 | grep \"snapid\":-2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset head
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj13 | grep \"snapid\":-2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset snaps
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj14 | grep \"snapid\":-2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset size
+
+ run_osd $dir 0 || return 1
+ wait_for_clean || return 1
+
+ sleep 5
+ ceph pg scrub ${poolid}.0
+ timeout 30 ceph -w
+
+ for i in `seq 1 7`
+ do
+ rados -p $poolname rmsnap snap$i
+ done
+
+ sleep 10
+
+ ERRORS=0
+
+ pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
+ pid=$(cat $pidfile)
+ if ! kill -0 $pid
+ then
+ echo "OSD crash occurred"
+ tail -100 $dir/osd.0.log
+ ERRORS=$(expr $ERRORS + 1)
+ fi
+
+ kill_daemons $dir || return 1
+
+ declare -a err_strings
+ err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/2acecc8b/obj10/1 is missing in clone_overlap"
+ err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/7 no '_' attr"
+ err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/7 is an unexpected clone"
+ err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/4 on disk size [(]4608[)] does not match object info size [(]512[)] adjusted for ondisk to [(]512[)]"
+ err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/head expected clone [0-9]*/666934a3/obj5/2"
+ err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/head expected clone [0-9]*/666934a3/obj5/1"
+ err_strings[6]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/head 1 missing clone[(]s[)]"
+ err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/d3a9faf5/obj12/head snapset.head_exists=false, but head exists"
+ err_strings[8]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/8df7eaa5/obj8/head snaps.seq not set"
+ err_strings[9]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/5c889059/obj7/head snapset.head_exists=false, but head exists"
+ err_strings[10]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/5c889059/obj7/1 is an unexpected clone"
+ err_strings[11]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/61f68bb1/obj3/head on disk size [(]3840[)] does not match object info size [(]768[)] adjusted for ondisk to [(]768[)]"
+ err_strings[12]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/83425cc4/obj6/1 is an unexpected clone"
+ err_strings[13]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/3f1ee208/obj2/snapdir no 'snapset' attr"
+ err_strings[14]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/3f1ee208/obj2/7 clone ignored due to missing snapset"
+ err_strings[15]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/3f1ee208/obj2/4 clone ignored due to missing snapset"
+ err_strings[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/a8759770/obj4/snapdir expected clone [0-9]*/a8759770/obj4/7"
+ err_strings[17]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/a8759770/obj4/snapdir 1 missing clone[(]s[)]"
+ err_strings[18]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/6cf8deff/obj1/1 is an unexpected clone"
+ err_strings[19]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/e478ac7f/obj9/1 is missing in clone_size"
+ err_strings[20]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/29547577/obj11/1 is an unexpected clone"
+ err_strings[21]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/94122507/obj14/1 size 1032 != clone_size 1033"
+ err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 19 errors"
+
+  for i in `seq 0 $(expr ${#err_strings[@]} - 1)`
+ do
+ if ! grep "${err_strings[$i]}" $dir/osd.0.log > /dev/null;
+ then
+ echo "Missing log message '${err_strings[$i]}'"
+ ERRORS=$(expr $ERRORS + 1)
+ fi
+ done
+
+ teardown $dir || return 1
+
+ if [ $ERRORS != "0" ];
+ then
+ echo "TEST FAILED WITH $ERRORS ERRORS"
+ return 1
+ fi
+
+ echo "TEST PASSED"
+ return 0
+}
+
+main osd-scrub-snaps "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && \
+# test/osd/osd-scrub-snaps.sh"
def test_reweight_by_utilization(self):
self.assert_valid_command(['osd', 'reweight-by-utilization'])
self.assert_valid_command(['osd', 'reweight-by-utilization', '100'])
- assert_equal({}, validate_command(sigdict, ['osd',
- 'reweight-by-utilization',
- '50']))
+ self.assert_valid_command(['osd', 'reweight-by-utilization', '100', '.1'])
+ self.assert_valid_command(['osd', 'reweight-by-utilization', '--no-increasing'])
assert_equal({}, validate_command(sigdict, ['osd',
'reweight-by-utilization',
'100',
Cond sync_cond;
char path[200];
uuid_d fsid;
-bool directio = false;
-bool aio = false;
+struct test_info {
+ bool directio, aio, faio;
+ const char *description;
+} subtests[3] = {
+ { false, false, false, "DIRECTIO OFF AIO OFF" },
+ { true, false, false, "DIRECTIO ON AIO OFF" },
+ { true, true, true, "DIRECTIO ON AIO ON"}
+};
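
Every FileJournal test below is rewritten from three sequential RUN_ALL_TESTS passes to a single pass that loops over subtests[]; the shared scaffolding is sketched here (names as in the hunks that follow):

    for (unsigned i = 0; i < 3; ++i) {
      SCOPED_TRACE(subtests[i].description);  // labels gtest failures per I/O mode
      fsid.generate_random();
      FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
                    subtests[i].aio, subtests[i].faio);
      ASSERT_EQ(0, j.create());
      // ... per-test body ...
      j.close();
    }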
// ----
Cond cond;
finisher->start();
- cout << "DIRECTIO OFF AIO OFF" << std::endl;
- directio = false;
- aio = false;
int r = RUN_ALL_TESTS();
- if (r >= 0) {
- cout << "DIRECTIO ON AIO OFF" << std::endl;
- directio = true;
- r = RUN_ALL_TESTS();
-
- if (r >= 0) {
- cout << "DIRECTIO ON AIO ON" << std::endl;
- aio = true;
- r = RUN_ALL_TESTS();
- }
- }
finisher->stop();
}
TEST(TestFileJournal, Create) {
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
- ASSERT_EQ(0, j.create());
+ g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+ g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+ g_ceph_context->_conf->apply_changes(NULL);
+
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ }
}
TEST(TestFileJournal, WriteSmall) {
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
- ASSERT_EQ(0, j.create());
- j.make_writeable();
+ g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+ g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+ g_ceph_context->_conf->apply_changes(NULL);
- bufferlist bl;
- bl.append("small");
- j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
- wait();
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
- j.close();
+ bufferlist bl;
+ bl.append("small");
+ j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+ wait();
+
+ j.close();
+ }
}
TEST(TestFileJournal, WriteBig) {
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
- ASSERT_EQ(0, j.create());
- j.make_writeable();
+ g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+ g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+ g_ceph_context->_conf->apply_changes(NULL);
- bufferlist bl;
- while (bl.length() < size_mb*1000/2) {
- char foo[1024*1024];
- memset(foo, 1, sizeof(foo));
- bl.append(foo, sizeof(foo));
- }
- j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
- wait();
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
+
+ bufferlist bl;
+ while (bl.length() < size_mb*1000/2) {
+ char foo[1024*1024];
+ memset(foo, 1, sizeof(foo));
+ bl.append(foo, sizeof(foo));
+ }
+ j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+ wait();
- j.close();
+ j.close();
+ }
}
TEST(TestFileJournal, WriteMany) {
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
- ASSERT_EQ(0, j.create());
- j.make_writeable();
+ g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+ g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+ g_ceph_context->_conf->apply_changes(NULL);
- C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
-
- bufferlist bl;
- bl.append("small");
- uint64_t seq = 1;
- for (int i=0; i<100; i++) {
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
+
+ C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+ bufferlist bl;
bl.append("small");
- j.submit_entry(seq++, bl, 0, gb.new_sub());
- }
+ uint64_t seq = 1;
+ for (int i=0; i<100; i++) {
+ bl.append("small");
+ j.submit_entry(seq++, bl, 0, gb.new_sub());
+ }
- gb.activate();
+ gb.activate();
- wait();
+ wait();
- j.close();
+ j.close();
+ }
}
TEST(TestFileJournal, WriteManyVecs) {
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
- ASSERT_EQ(0, j.create());
- j.make_writeable();
-
- C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
-
- bufferlist first;
- first.append("small");
- j.submit_entry(1, first, 0, gb.new_sub());
-
- bufferlist bl;
- for (int i=0; i<IOV_MAX * 2; i++) {
- bufferptr bp = buffer::create_page_aligned(4096);
- memset(bp.c_str(), (char)i, 4096);
- bl.append(bp);
- }
- bufferlist origbl = bl;
- j.submit_entry(2, bl, 0, gb.new_sub());
- gb.activate();
- wait();
-
- j.close();
-
- j.open(1);
- bufferlist inbl;
- string v;
- uint64_t seq = 0;
- ASSERT_EQ(true, j.read_entry(inbl, seq));
- ASSERT_EQ(seq, 2ull);
- ASSERT_TRUE(inbl.contents_equal(origbl));
- j.make_writeable();
- j.close();
+ g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+ g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+ g_ceph_context->_conf->apply_changes(NULL);
+
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
+
+ C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+ bufferlist first;
+ first.append("small");
+ j.submit_entry(1, first, 0, gb.new_sub());
+
+ bufferlist bl;
+ for (int i=0; i<IOV_MAX * 2; i++) {
+ bufferptr bp = buffer::create_page_aligned(4096);
+ memset(bp.c_str(), (char)i, 4096);
+ bl.append(bp);
+ }
+ bufferlist origbl = bl;
+ j.submit_entry(2, bl, 0, gb.new_sub());
+ gb.activate();
+ wait();
+
+ j.close();
+
+ j.open(1);
+ bufferlist inbl;
+ string v;
+ uint64_t seq = 0;
+ ASSERT_EQ(true, j.read_entry(inbl, seq));
+ ASSERT_EQ(seq, 2ull);
+ ASSERT_TRUE(inbl.contents_equal(origbl));
+ j.make_writeable();
+ j.close();
+  }
}
TEST(TestFileJournal, ReplaySmall) {
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
- ASSERT_EQ(0, j.create());
- j.make_writeable();
-
- C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
-
- bufferlist bl;
- bl.append("small");
- j.submit_entry(1, bl, 0, gb.new_sub());
- bl.append("small");
- j.submit_entry(2, bl, 0, gb.new_sub());
- bl.append("small");
- j.submit_entry(3, bl, 0, gb.new_sub());
- gb.activate();
- wait();
-
- j.close();
-
- j.open(1);
-
- bufferlist inbl;
- string v;
- uint64_t seq = 0;
- ASSERT_EQ(true, j.read_entry(inbl, seq));
- ASSERT_EQ(seq, 2ull);
- inbl.copy(0, inbl.length(), v);
- ASSERT_EQ("small", v);
- inbl.clear();
- v.clear();
-
- ASSERT_EQ(true, j.read_entry(inbl, seq));
- ASSERT_EQ(seq, 3ull);
- inbl.copy(0, inbl.length(), v);
- ASSERT_EQ("small", v);
- inbl.clear();
- v.clear();
-
- ASSERT_TRUE(!j.read_entry(inbl, seq));
-
- j.make_writeable();
- j.close();
+ g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+ g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+ g_ceph_context->_conf->apply_changes(NULL);
+
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
+
+ C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+ bufferlist bl;
+ bl.append("small");
+ j.submit_entry(1, bl, 0, gb.new_sub());
+ bl.append("small");
+ j.submit_entry(2, bl, 0, gb.new_sub());
+ bl.append("small");
+ j.submit_entry(3, bl, 0, gb.new_sub());
+ gb.activate();
+ wait();
+
+ j.close();
+
+ j.open(1);
+
+ bufferlist inbl;
+ string v;
+ uint64_t seq = 0;
+ ASSERT_EQ(true, j.read_entry(inbl, seq));
+ ASSERT_EQ(seq, 2ull);
+ inbl.copy(0, inbl.length(), v);
+ ASSERT_EQ("small", v);
+ inbl.clear();
+ v.clear();
+
+ ASSERT_EQ(true, j.read_entry(inbl, seq));
+ ASSERT_EQ(seq, 3ull);
+ inbl.copy(0, inbl.length(), v);
+ ASSERT_EQ("small", v);
+ inbl.clear();
+ v.clear();
+
+ ASSERT_TRUE(!j.read_entry(inbl, seq));
+
+ j.make_writeable();
+ j.close();
+ }
}
TEST(TestFileJournal, ReplayCorrupt) {
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
- ASSERT_EQ(0, j.create());
- j.make_writeable();
-
- C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
-
- const char *needle = "i am a needle";
- const char *newneedle = "in a haystack";
- bufferlist bl;
- bl.append(needle);
- j.submit_entry(1, bl, 0, gb.new_sub());
- bl.append(needle);
- j.submit_entry(2, bl, 0, gb.new_sub());
- bl.append(needle);
- j.submit_entry(3, bl, 0, gb.new_sub());
- bl.append(needle);
- j.submit_entry(4, bl, 0, gb.new_sub());
- gb.activate();
- wait();
-
- j.close();
-
- cout << "corrupting journal" << std::endl;
- char buf[1024*128];
- int fd = open(path, O_RDONLY);
- ASSERT_GE(fd, 0);
- int r = safe_read_exact(fd, buf, sizeof(buf));
- ASSERT_EQ(0, r);
- int n = 0;
- for (unsigned o=0; o < sizeof(buf) - strlen(needle); o++) {
- if (memcmp(buf+o, needle, strlen(needle)) == 0) {
- if (n >= 2) {
+ g_ceph_context->_conf->set_val("journal_ignore_corruption", "true");
+ g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+ g_ceph_context->_conf->apply_changes(NULL);
+
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
+
+ C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+ const char *needle = "i am a needle";
+ const char *newneedle = "in a haystack";
+ bufferlist bl;
+ bl.append(needle);
+ j.submit_entry(1, bl, 0, gb.new_sub());
+ bl.append(needle);
+ j.submit_entry(2, bl, 0, gb.new_sub());
+ bl.append(needle);
+ j.submit_entry(3, bl, 0, gb.new_sub());
+ bl.append(needle);
+ j.submit_entry(4, bl, 0, gb.new_sub());
+ gb.activate();
+ wait();
+
+ j.close();
+
+ cout << "corrupting journal" << std::endl;
+ char buf[1024*128];
+ int fd = open(path, O_RDONLY);
+ ASSERT_GE(fd, 0);
+ int r = safe_read_exact(fd, buf, sizeof(buf));
+ ASSERT_EQ(0, r);
+ int n = 0;
+ for (unsigned o=0; o < sizeof(buf) - strlen(needle); o++) {
+ if (memcmp(buf+o, needle, strlen(needle)) == 0) {
+ if (n >= 2) {
cout << "replacing at offset " << o << std::endl;
memcpy(buf+o, newneedle, strlen(newneedle));
- } else {
+ } else {
cout << "leaving at offset " << o << std::endl;
+ }
+ n++;
}
- n++;
}
+ ASSERT_EQ(n, 4);
+ close(fd);
+ fd = open(path, O_WRONLY);
+ ASSERT_GE(fd, 0);
+ r = safe_write(fd, buf, sizeof(buf));
+ ASSERT_EQ(r, 0);
+ close(fd);
+
+ j.open(1);
+
+ bufferlist inbl;
+ string v;
+ uint64_t seq = 0;
+ ASSERT_EQ(true, j.read_entry(inbl, seq));
+ ASSERT_EQ(seq, 2ull);
+ inbl.copy(0, inbl.length(), v);
+ ASSERT_EQ(needle, v);
+ inbl.clear();
+ v.clear();
+ bool corrupt;
+ ASSERT_FALSE(j.read_entry(inbl, seq, &corrupt));
+ ASSERT_TRUE(corrupt);
+
+ j.make_writeable();
+ j.close();
}
- ASSERT_EQ(n, 4);
- close(fd);
- fd = open(path, O_WRONLY);
- ASSERT_GE(fd, 0);
- r = safe_write(fd, buf, sizeof(buf));
- ASSERT_EQ(r, 0);
- close(fd);
-
- j.open(1);
-
- bufferlist inbl;
- string v;
- uint64_t seq = 0;
- ASSERT_EQ(true, j.read_entry(inbl, seq));
- ASSERT_EQ(seq, 2ull);
- inbl.copy(0, inbl.length(), v);
- ASSERT_EQ(needle, v);
- inbl.clear();
- v.clear();
- ASSERT_TRUE(!j.read_entry(inbl, seq));
-
- j.make_writeable();
- j.close();
}
TEST(TestFileJournal, WriteTrim) {
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
- ASSERT_EQ(0, j.create());
- j.make_writeable();
+ g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+ g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+ g_ceph_context->_conf->apply_changes(NULL);
- list<C_Sync*> ls;
-
- bufferlist bl;
- char foo[1024*1024];
- memset(foo, 1, sizeof(foo));
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
- uint64_t seq = 1, committed = 0;
+ list<C_Sync*> ls;
- for (unsigned i=0; i<size_mb*2; i++) {
- bl.clear();
- bl.push_back(buffer::copy(foo, sizeof(foo)));
- bl.zero();
- ls.push_back(new C_Sync);
- j.submit_entry(seq++, bl, 0, ls.back()->c);
+ bufferlist bl;
+ char foo[1024*1024];
+ memset(foo, 1, sizeof(foo));
+
+ uint64_t seq = 1, committed = 0;
+
+ for (unsigned i=0; i<size_mb*2; i++) {
+ bl.clear();
+ bl.push_back(buffer::copy(foo, sizeof(foo)));
+ bl.zero();
+ ls.push_back(new C_Sync);
+ j.submit_entry(seq++, bl, 0, ls.back()->c);
- while (ls.size() > size_mb/2) {
+ while (ls.size() > size_mb/2) {
+ delete ls.front();
+ ls.pop_front();
+ committed++;
+ j.committed_thru(committed);
+ }
+ }
+
+ while (ls.size()) {
delete ls.front();
ls.pop_front();
- committed++;
- j.committed_thru(committed);
+ j.committed_thru(++committed);
}
- }
- while (ls.size()) {
- delete ls.front();
- ls.pop_front();
- j.committed_thru(committed);
- }
+ ASSERT_TRUE(j.journalq_empty());
- j.close();
+ j.close();
+ }
}
TEST(TestFileJournal, WriteTrimSmall) {
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio);
- ASSERT_EQ(0, j.create());
- j.make_writeable();
+ g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+ g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+ g_ceph_context->_conf->apply_changes(NULL);
- list<C_Sync*> ls;
-
- bufferlist bl;
- char foo[1024*1024];
- memset(foo, 1, sizeof(foo));
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
- uint64_t seq = 1, committed = 0;
+ list<C_Sync*> ls;
- for (unsigned i=0; i<size_mb*2; i++) {
- bl.clear();
- for (int k=0; k<128; k++)
- bl.push_back(buffer::copy(foo, sizeof(foo) / 128));
- bl.zero();
- ls.push_back(new C_Sync);
- j.submit_entry(seq++, bl, 0, ls.back()->c);
+ bufferlist bl;
+ char foo[1024*1024];
+ memset(foo, 1, sizeof(foo));
- while (ls.size() > size_mb/2) {
+ uint64_t seq = 1, committed = 0;
+
+ for (unsigned i=0; i<size_mb*2; i++) {
+ bl.clear();
+ for (int k=0; k<128; k++)
+ bl.push_back(buffer::copy(foo, sizeof(foo) / 128));
+ bl.zero();
+ ls.push_back(new C_Sync);
+ j.submit_entry(seq++, bl, 0, ls.back()->c);
+
+ while (ls.size() > size_mb/2) {
+ delete ls.front();
+ ls.pop_front();
+ committed++;
+ j.committed_thru(committed);
+ }
+ }
+
+ while (ls.size()) {
delete ls.front();
ls.pop_front();
- committed++;
j.committed_thru(committed);
}
- }
- while (ls.size()) {
- delete ls.front();
- ls.pop_front();
- j.committed_thru(committed);
+ j.close();
}
-
- j.close();
}
TEST(TestFileJournal, ReplayDetectCorruptFooterMagic) {
g_ceph_context->_conf->set_val("journal_write_header_frequency", "1");
g_ceph_context->_conf->apply_changes(NULL);
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
- ASSERT_EQ(0, j.create());
- j.make_writeable();
-
- C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
+
+ C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+ const char *needle = "i am a needle";
+ for (unsigned i = 1; i <= 4; ++i) {
+ bufferlist bl;
+ bl.append(needle);
+ j.submit_entry(i, bl, 0, gb.new_sub());
+ }
+ gb.activate();
+ wait();
- const char *needle = "i am a needle";
- for (unsigned i = 1; i <= 4; ++i) {
bufferlist bl;
- bl.append(needle);
- j.submit_entry(i, bl, 0, gb.new_sub());
+ bl.append("needle");
+ j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+ wait();
+
+ j.close();
+ int fd = open(path, O_WRONLY);
+
+ cout << "corrupting journal" << std::endl;
+ j.open(0);
+ j.corrupt_footer_magic(fd, 2);
+
+ uint64_t seq = 0;
+ bl.clear();
+ bool corrupt = false;
+ bool result = j.read_entry(bl, seq, &corrupt);
+ ASSERT_TRUE(result);
+ ASSERT_EQ(seq, 1UL);
+ ASSERT_FALSE(corrupt);
+
+ result = j.read_entry(bl, seq, &corrupt);
+ ASSERT_FALSE(result);
+ ASSERT_TRUE(corrupt);
+
+ j.make_writeable();
+ j.close();
+ ::close(fd);
}
- gb.activate();
- wait();
-
- bufferlist bl;
- bl.append("needle");
- j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
- wait();
-
- j.close();
- int fd = open(path, O_WRONLY);
-
- cout << "corrupting journal" << std::endl;
- j.open(0);
- j.corrupt_footer_magic(fd, 2);
-
- uint64_t seq = 0;
- bl.clear();
- bool corrupt = false;
- bool result = j.read_entry(bl, seq, &corrupt);
- ASSERT_TRUE(result);
- ASSERT_EQ(seq, 1UL);
- ASSERT_FALSE(corrupt);
-
- result = j.read_entry(bl, seq, &corrupt);
- ASSERT_FALSE(result);
- ASSERT_TRUE(corrupt);
-
- j.make_writeable();
- j.close();
- ::close(fd);
}
TEST(TestFileJournal, ReplayDetectCorruptPayload) {
g_ceph_context->_conf->set_val("journal_write_header_frequency", "1");
g_ceph_context->_conf->apply_changes(NULL);
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
- ASSERT_EQ(0, j.create());
- j.make_writeable();
-
- C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
+
+ C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+ const char *needle = "i am a needle";
+ for (unsigned i = 1; i <= 4; ++i) {
+ bufferlist bl;
+ bl.append(needle);
+ j.submit_entry(i, bl, 0, gb.new_sub());
+ }
+ gb.activate();
+ wait();
- const char *needle = "i am a needle";
- for (unsigned i = 1; i <= 4; ++i) {
bufferlist bl;
- bl.append(needle);
- j.submit_entry(i, bl, 0, gb.new_sub());
+ bl.append("needle");
+ j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+ wait();
+
+ j.close();
+ int fd = open(path, O_WRONLY);
+
+ cout << "corrupting journal" << std::endl;
+ j.open(0);
+ j.corrupt_payload(fd, 2);
+
+ uint64_t seq = 0;
+ bl.clear();
+ bool corrupt = false;
+ bool result = j.read_entry(bl, seq, &corrupt);
+ ASSERT_TRUE(result);
+ ASSERT_EQ(seq, 1UL);
+ ASSERT_FALSE(corrupt);
+
+ result = j.read_entry(bl, seq, &corrupt);
+ ASSERT_FALSE(result);
+ ASSERT_TRUE(corrupt);
+
+ j.make_writeable();
+ j.close();
+ ::close(fd);
}
- gb.activate();
- wait();
-
- bufferlist bl;
- bl.append("needle");
- j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
- wait();
-
- j.close();
- int fd = open(path, O_WRONLY);
-
- cout << "corrupting journal" << std::endl;
- j.open(0);
- j.corrupt_payload(fd, 2);
-
- uint64_t seq = 0;
- bl.clear();
- bool corrupt = false;
- bool result = j.read_entry(bl, seq, &corrupt);
- ASSERT_TRUE(result);
- ASSERT_EQ(seq, 1UL);
- ASSERT_FALSE(corrupt);
-
- result = j.read_entry(bl, seq, &corrupt);
- ASSERT_FALSE(result);
- ASSERT_TRUE(corrupt);
-
- j.make_writeable();
- j.close();
- ::close(fd);
}
TEST(TestFileJournal, ReplayDetectCorruptHeader) {
g_ceph_context->_conf->set_val("journal_write_header_frequency", "1");
g_ceph_context->_conf->apply_changes(NULL);
- fsid.generate_random();
- FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
- ASSERT_EQ(0, j.create());
- j.make_writeable();
-
- C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+ for (unsigned i = 0 ; i < 3; ++i) {
+ SCOPED_TRACE(subtests[i].description);
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+ subtests[i].aio, subtests[i].faio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
+
+ C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+ const char *needle = "i am a needle";
+ for (unsigned i = 1; i <= 4; ++i) {
+ bufferlist bl;
+ bl.append(needle);
+ j.submit_entry(i, bl, 0, gb.new_sub());
+ }
+ gb.activate();
+ wait();
- const char *needle = "i am a needle";
- for (unsigned i = 1; i <= 4; ++i) {
bufferlist bl;
- bl.append(needle);
- j.submit_entry(i, bl, 0, gb.new_sub());
+ bl.append("needle");
+ j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+ wait();
+
+ j.close();
+ int fd = open(path, O_WRONLY);
+
+ cout << "corrupting journal" << std::endl;
+ j.open(0);
+ j.corrupt_header_magic(fd, 2);
+
+ uint64_t seq = 0;
+ bl.clear();
+ bool corrupt = false;
+ bool result = j.read_entry(bl, seq, &corrupt);
+ ASSERT_TRUE(result);
+ ASSERT_EQ(seq, 1UL);
+ ASSERT_FALSE(corrupt);
+
+ result = j.read_entry(bl, seq, &corrupt);
+ ASSERT_FALSE(result);
+ ASSERT_TRUE(corrupt);
+
+ j.make_writeable();
+ j.close();
+ ::close(fd);
}
- gb.activate();
- wait();
-
- bufferlist bl;
- bl.append("needle");
- j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
- wait();
-
- j.close();
- int fd = open(path, O_WRONLY);
-
- cout << "corrupting journal" << std::endl;
- j.open(0);
- j.corrupt_header_magic(fd, 2);
-
- uint64_t seq = 0;
- bl.clear();
- bool corrupt = false;
- bool result = j.read_entry(bl, seq, &corrupt);
- ASSERT_TRUE(result);
- ASSERT_EQ(seq, 1UL);
- ASSERT_FALSE(corrupt);
-
- result = j.read_entry(bl, seq, &corrupt);
- ASSERT_FALSE(result);
- ASSERT_TRUE(corrupt);
-
- j.make_writeable();
- j.close();
- ::close(fd);
}
bufferlist enc_out;
std::string error;
- key.encrypt(g_ceph_context, enc_in, enc_out, error);
- if (!error.empty()) {
+ if (key.encrypt(g_ceph_context, enc_in, enc_out, &error) < 0) {
+ assert(!error.empty());
dout(0) << "couldn't encode! error " << error << dendl;
exit(1);
}
dec_in = enc_out;
- key.decrypt(g_ceph_context, dec_in, dec_out, error);
- if (!error.empty()) {
+ if (key.decrypt(g_ceph_context, dec_in, dec_out, &error) < 0) {
+ assert(!error.empty());
dout(0) << "couldn't decode! error " << error << dendl;
exit(1);
}
// Release Inode references
ceph_ll_forget(client, ino, 1);
for (std::vector<Dentry*>::reverse_iterator p = path.rbegin(); p != path.rend(); ++p) {
- ceph_ll_forget(client, (*p)->inode, 1);
+ ceph_ll_forget(client, (*p)->inode.get(), 1);
}
ino = NULL;
path.clear();
#include "os/ObjectStore.h"
#include "os/FileStore.h"
+#include "os/FileJournal.h"
#include "osd/PGLog.h"
#include "osd/OSD.h"
const mymagic_t endmagic = (0xecff << 16) | shortmagic;
const int fd_none = INT_MIN;
bool outistty;
+bool dry_run = false;
//The first FIXED_LENGTH bytes are a fixed
//portion of the export output. This includes the overall
map<epoch_t,pg_interval_t> past_intervals;
OSDMap osdmap;
bufferlist osdmap_bl; // Used in lieu of encoding osdmap due to crc checking
+ map<eversion_t, hobject_t> divergent_priors;
metadata_section(__u8 struct_ver, epoch_t map_epoch, const pg_info_t &info,
- const pg_log_t &log, map<epoch_t,pg_interval_t> &past_intervals)
+ const pg_log_t &log, map<epoch_t,pg_interval_t> &past_intervals,
+ map<eversion_t, hobject_t> &divergent_priors)
: struct_ver(struct_ver),
map_epoch(map_epoch),
info(info),
log(log),
- past_intervals(past_intervals) { }
+ past_intervals(past_intervals),
+ divergent_priors(divergent_priors) { }
metadata_section()
: struct_ver(0),
map_epoch(0) { }
void encode(bufferlist& bl) const {
- ENCODE_START(3, 1, bl);
+ ENCODE_START(4, 1, bl);
::encode(struct_ver, bl);
::encode(map_epoch, bl);
::encode(info, bl);
// Equivalent to osdmap.encode(bl, features); but
// preserving exact layout for CRC checking.
bl.append(osdmap_bl);
+ ::encode(divergent_priors, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START(3, bl);
+ DECODE_START(4, bl);
::decode(struct_ver, bl);
::decode(map_epoch, bl);
::decode(info, bl);
} else {
cout << "WARNING: Older export without OSDMap information" << std::endl;
}
+ if (struct_v > 3) {
+ ::decode(divergent_priors, bl);
+ }
DECODE_FINISH(bl);
}
};
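
The metadata_section change follows Ceph's versioned-encoding convention: the encode version is bumped (3 to 4) when divergent_priors is appended at the end, and decode reads it only when struct_v > 3, so older exports still parse. A minimal sketch of that convention with a hypothetical struct, assuming the ENCODE/DECODE macros from include/encoding.h:

    #include "include/encoding.h"

    struct example_t {
      uint32_t old_field;
      uint32_t new_field;   // field appended in version 2
      example_t() : old_field(0), new_field(0) {}
      void encode(bufferlist& bl) const {
        ENCODE_START(2, 1, bl);     // version 2, compat 1
        ::encode(old_field, bl);
        ::encode(new_field, bl);    // new fields always go last
        ENCODE_FINISH(bl);
      }
      void decode(bufferlist::iterator& bl) {
        DECODE_START(2, bl);
        ::decode(old_field, bl);
        if (struct_v > 1)           // tolerate encodings older than v2
          ::decode(new_field, bl);
        DECODE_FINISH(bl);
      }
    };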
++obj) {
if (obj->is_pgmeta())
continue;
- bufferlist attr;
- r = store->getattr(coll, *obj, OI_ATTR, attr);
- if (r < 0) {
- cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
- << cpp_strerror(r) << std::endl;
- return r;
- }
object_info_t oi;
- bufferlist::iterator bp = attr.begin();
- try {
- ::decode(oi, bp);
- } catch (...) {
- r = -EINVAL;
- cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
- << cpp_strerror(r) << std::endl;
- return r;
+ if (coll != META_COLL) {
+ bufferlist attr;
+ r = store->getattr(coll, *obj, OI_ATTR, attr);
+ if (r < 0) {
+ cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
+ << cpp_strerror(r) << std::endl;
+ continue;
+ }
+ bufferlist::iterator bp = attr.begin();
+ try {
+ ::decode(oi, bp);
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
+ << cpp_strerror(r) << std::endl;
+ continue;
+ }
}
r = action.call(store, coll, *obj, oi);
if (r < 0)
return 0;
}
-int action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug)
+int action_on_all_objects_in_pg(ObjectStore *store, string pgidstr, action_on_object_t &action, bool debug)
+{
+ spg_t pgid;
+  // Scan collections in case this is an EC pool and no shard was specified
+ unsigned scanned = 0;
+ int r = 0;
+ vector<coll_t> colls_to_check;
+ vector<coll_t> candidates;
+ r = store->list_collections(candidates);
+ if (r < 0) {
+ cerr << "Error listing collections: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
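+  // pgidstr is assumed to have been validated by the caller, so a parse failure is not expected here.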
+ pgid.parse(pgidstr.c_str());
+ for (vector<coll_t>::iterator i = candidates.begin();
+ i != candidates.end();
+ ++i) {
+ spg_t cand_pgid;
+ snapid_t snap;
+ if (!i->is_pg(cand_pgid, snap))
+ continue;
+ if (snap != CEPH_NOSNAP)
+ continue;
+
+    // Accept an exact match, or treat a pgid with no shard as matching any shard
+ if (cand_pgid == pgid ||
+ (pgid.is_no_shard() && pgid.pgid == cand_pgid.pgid)) {
+ colls_to_check.push_back(*i);
+ }
+ }
+
+ if (debug)
+ cerr << colls_to_check.size() << " pgs to scan" << std::endl;
+ for (vector<coll_t>::iterator i = colls_to_check.begin();
+ i != colls_to_check.end();
+ ++i, ++scanned) {
+ if (debug)
+ cerr << "Scanning " << *i << ", " << scanned << "/"
+ << colls_to_check.size() << " completed" << std::endl;
+ r = _action_on_all_objects_in_pg(store, *i, action, debug);
+ if (r < 0)
+ break;
+ }
+ store->sync_and_flush();
+ return r;
+}
+
+int action_on_all_objects_in_exact_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug)
{
int r = _action_on_all_objects_in_pg(store, coll, action, debug);
store->sync_and_flush();
for (list<pair<coll_t, ghobject_t> >::const_iterator i = _objects.begin();
i != _objects.end();
++i) {
- if (i != _objects.begin() && human_readable) {
- f->flush(cout);
- cout << std::endl;
- }
f->open_array_section("pgid_object");
- string pgid = i->first.c_str();
- std::size_t pos = pgid.find("_");
- if (pos == string::npos)
- f->dump_string("pgid", pgid);
- else
- f->dump_string("pgid", pgid.substr(0, pos));
+ snapid_t snap;
+ spg_t pgid;
+ bool is_pg = i->first.is_pg(pgid, snap);
+ if (is_pg)
+ f->dump_string("pgid", stringify(pgid));
+ if (!is_pg || !human_readable)
+ f->dump_string("coll", i->first.to_str());
f->open_object_section("ghobject");
i->second.dump(f);
f->close_section();
f->close_section();
+ if (human_readable) {
+ f->flush(cout);
+ cout << std::endl;
+ }
}
- if (!human_readable)
+ if (!human_readable) {
f->close_section();
+ f->flush(cout);
+ cout << std::endl;
+ }
}
};
struct lookup_ghobject : public action_on_object_t {
pgid_object_list _objects;
const string _name;
+ bool _need_snapset;
- lookup_ghobject(const string& name) : _name(name) { }
+ lookup_ghobject(const string& name, bool need_snapset = false) : _name(name),
+ _need_snapset(need_snapset) { }
virtual int call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) {
+ if (_need_snapset && !ghobj.hobj.has_snapset())
+ return 0;
if (_name.length() == 0 || ghobj.hobj.oid.name == _name)
_objects.insert(coll, ghobj);
return 0;
template <typename T>
int write_section(sectiontype_t type, const T& obj, int fd) {
+ if (dry_run)
+ return 0;
bufferlist blhdr, bl, blftr;
obj.encode(bl);
header hdr(type, bl.length());
int write_simple(sectiontype_t type, int fd)
{
+ if (dry_run)
+ return 0;
bufferlist hbl;
header hdr(type, 0);
do {
ssize_t bytes = bl.read_fd(fd, max_read);
if (bytes < 0) {
- cerr << "read_fd error " << cpp_strerror(-bytes) << std::endl;
- return 1;
+ cerr << "read_fd error " << cpp_strerror(bytes) << std::endl;
+ return bytes;
}
if (bytes == 0)
return 0;
}
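+// Exit after releasing the global CephContext reference so shutdown (e.g. logging) is orderly.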
+void myexit(int ret)
+{
+ if (g_ceph_context)
+ g_ceph_context->put();
+ exit(ret);
+}
+
static void invalid_filestore_path(string &path)
{
cerr << "Invalid filestore path specified: " << path << "\n";
- exit(1);
+ myexit(1);
}
int get_log(ObjectStore *fs, __u8 struct_ver,
coll_t coll, spg_t pgid, const pg_info_t &info,
- PGLog::IndexedLog &log, pg_missing_t &missing)
+ PGLog::IndexedLog &log, pg_missing_t &missing,
+ map<eversion_t, hobject_t> &divergent_priors)
{
- map<eversion_t, hobject_t> divergent_priors;
try {
ostringstream oss;
assert(struct_ver > 0);
}
catch (const buffer::error &e) {
cerr << "read_log threw exception error " << e.what() << std::endl;
- return 1;
+ return -EFAULT;
}
return 0;
}
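+// Debug helper: dump a pg log, missing set and divergent priors via the given Formatter.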
+void dump_log(Formatter *formatter, ostream &out, pg_log_t &log,
+ pg_missing_t &missing, map<eversion_t, hobject_t> &divergent_priors)
+{
+ formatter->open_object_section("op_log");
+ formatter->open_object_section("pg_log_t");
+ log.dump(formatter);
+ formatter->close_section();
+ formatter->flush(out);
+ formatter->open_object_section("pg_missing_t");
+ missing.dump(formatter);
+ formatter->close_section();
+ formatter->flush(out);
+ formatter->open_object_section("map");
+ formatter->open_array_section("divergent_priors");
+ for (map<eversion_t, hobject_t>::iterator it = divergent_priors.begin();
+       it != divergent_priors.end(); ++it) {
+ formatter->open_object_section("item");
+ formatter->dump_stream("eversion") << it->first;
+ formatter->dump_stream("hobject") << it->second;
+ formatter->close_section();
+ }
+ formatter->close_section();
+ formatter->close_section();
+ formatter->close_section();
+ formatter->flush(out);
+}
+
//Based on RemoveWQ::_process()
void remove_coll(ObjectStore *store, const coll_t &coll)
{
vector<coll_t> ls;
int r = store->list_collections(ls);
if (r < 0) {
- cerr << "finish_remove_pgs: failed to list pgs: " << cpp_strerror(-r)
+ cerr << "finish_remove_pgs: failed to list pgs: " << cpp_strerror(r)
<< std::endl;
return r;
}
__u8 struct_v;
r = PG::read_info(fs, pgid, coll, bl, info, past_intervals, struct_v);
if (r < 0) {
- cerr << __func__ << " error on read_info " << cpp_strerror(-r) << std::endl;
+ cerr << __func__ << " error on read_info " << cpp_strerror(r) << std::endl;
return r;
}
if (struct_v < 8) {
int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid)
{
+ if (!dry_run)
+ finish_remove_pgs(store);
if (!store->collection_exists(coll_t(r_pgid)))
return -ENOENT;
cout << " marking collection for removal" << std::endl;
+ if (dry_run)
+ return 0;
ObjectStore::Transaction *rmt = new ObjectStore::Transaction;
int r = mark_pg_for_removal(store, r_pgid, rmt);
if (r < 0) {
return r;
}
store->apply_transaction(*rmt);
+ finish_remove_pgs(store);
return r;
}
bytes = ebl.read_fd(file_fd, sh.header_size);
if ((size_t)bytes != sh.header_size) {
cerr << "Unexpected EOF" << std::endl;
- return EFAULT;
+ return -EFAULT;
}
decode(ebliter);
bytes = ebl.read_fd(file_fd, sh.footer_size);
if ((size_t)bytes != sh.footer_size) {
cerr << "Unexpected EOF" << std::endl;
- return EFAULT;
+ return -EFAULT;
}
decode(ebliter);
if (magic != endmagic) {
cerr << "Bad footer magic" << std::endl;
- return EFAULT;
+ return -EFAULT;
}
return 0;
past_intervals,
pgmeta_oid,
true);
- if (ret < 0) ret = -ret;
if (ret) cerr << "Failed to write info" << std::endl;
return ret;
}
int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
- pg_log_t &log, map<epoch_t,pg_interval_t> &past_intervals)
+ pg_log_t &log, map<epoch_t,pg_interval_t> &past_intervals,
+ map<eversion_t, hobject_t> &divergent_priors)
{
int ret = write_info(t, epoch, info, past_intervals);
if (ret)
return ret;
- map<eversion_t, hobject_t> divergent_priors;
coll_t coll(info.pgid);
PGLog::write_log(t, log, coll, info.pgid.make_pgmeta_oid(), divergent_priors);
return 0;
bufferlist hdrbuf;
ret = store->omap_get_header(cid, obj, &hdrbuf, true);
if (ret < 0) {
- cerr << "omap_get_header: " << cpp_strerror(-ret) << std::endl;
+ cerr << "omap_get_header: " << cpp_strerror(ret) << std::endl;
return ret;
}
ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(cid, obj);
if (!iter) {
ret = -ENOENT;
- cerr << "omap_get_iterator: " << cpp_strerror(-ret) << std::endl;
+ cerr << "omap_get_iterator: " << cpp_strerror(ret) << std::endl;
return ret;
}
iter->seek_to_first();
for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
- if (i->is_pgmeta()) {
+ if (i->is_pgmeta() || i->hobj.is_temp()) {
continue;
}
r = export_file(store, coll, *i);
return 0;
}
+int set_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) {
+ OSDMap::Incremental inc;
+ bufferlist::iterator it = bl.begin();
+ inc.decode(it);
+ if (e == 0) {
+ e = inc.epoch;
+ } else if (e != inc.epoch) {
+ cerr << "incremental.epoch mismatch: "
+ << inc.epoch << " != " << e << std::endl;
+ if (force) {
+ cerr << "But will continue anyway." << std::endl;
+ } else {
+ return -EINVAL;
+ }
+ }
+ const ghobject_t inc_oid = OSD::get_inc_osdmap_pobject_name(e);
+ if (!store->exists(META_COLL, inc_oid)) {
+ cerr << "inc-osdmap (" << inc_oid << ") does not exist." << std::endl;
+ if (!force) {
+ return -ENOENT;
+ }
+ cout << "Creating a new epoch." << std::endl;
+ }
+ if (dry_run)
+ return 0;
+ ObjectStore::Transaction t;
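+  // Overwrite the stored incremental map, truncating in case an older, larger one was present.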
+ t.write(META_COLL, inc_oid, 0, bl.length(), bl);
+ t.truncate(META_COLL, inc_oid, bl.length());
+ int ret = store->apply_transaction(t);
+ if (ret) {
+ cerr << "Failed to set inc-osdmap (" << inc_oid << "): " << ret << std::endl;
+ } else {
+ cout << "Wrote inc-osdmap." << inc.epoch << std::endl;
+ }
+ return ret;
+}
+
+int get_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl)
+{
+ if (store->read(META_COLL,
+ OSD::get_inc_osdmap_pobject_name(e),
+ 0, 0, bl) < 0) {
+ return -ENOENT;
+ }
+ return 0;
+}
+
+int set_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) {
+ OSDMap osdmap;
+ osdmap.decode(bl);
+ if (e == 0) {
+ e = osdmap.get_epoch();
+ } else if (e != osdmap.get_epoch()) {
+ cerr << "osdmap.epoch mismatch: "
+ << e << " != " << osdmap.get_epoch() << std::endl;
+ if (force) {
+ cerr << "But will continue anyway." << std::endl;
+ } else {
+ return -EINVAL;
+ }
+ }
+ const ghobject_t full_oid = OSD::get_osdmap_pobject_name(e);
+ if (!store->exists(META_COLL, full_oid)) {
+ cerr << "osdmap (" << full_oid << ") does not exist." << std::endl;
+ if (!force) {
+ return -ENOENT;
+ }
+ cout << "Creating a new epoch." << std::endl;
+ }
+ if (dry_run)
+ return 0;
+ ObjectStore::Transaction t;
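+  // Overwrite the stored full map, truncating in case an older, larger one was present.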
+ t.write(META_COLL, full_oid, 0, bl.length(), bl);
+ t.truncate(META_COLL, full_oid, bl.length());
+ int ret = store->apply_transaction(t);
+ if (ret) {
+ cerr << "Failed to set osdmap (" << full_oid << "): " << ret << std::endl;
+ } else {
+ cout << "Wrote osdmap." << osdmap.get_epoch() << std::endl;
+ }
+ return ret;
+}
+
int get_osdmap(ObjectStore *store, epoch_t e, OSDMap &osdmap, bufferlist& bl)
{
bool found = store->read(
META_COLL, OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0;
if (!found) {
cerr << "Can't find OSDMap for pg epoch " << e << std::endl;
- return ENOENT;
+ return -ENOENT;
}
osdmap.decode(bl);
if (debug)
//Write super_header with its fixed 16 byte length
void write_super()
{
+ if (dry_run)
+ return;
bufferlist superbl;
super_header sh;
footer ft;
{
PGLog::IndexedLog log;
pg_missing_t missing;
+ map<eversion_t, hobject_t> divergent_priors;
cerr << "Exporting " << pgid << std::endl;
- int ret = get_log(fs, struct_ver, coll, pgid, info, log, missing);
+ int ret = get_log(fs, struct_ver, coll, pgid, info, log, missing,
+ divergent_priors);
if (ret > 0)
return ret;
+ if (debug) {
+ Formatter *formatter = Formatter::create("json-pretty");
+ assert(formatter);
+ dump_log(formatter, cerr, log, missing, divergent_priors);
+ delete formatter;
+ }
write_super();
pg_begin pgb(pgid, superblock);
// The metadata_section is now before files, so import can detect
// errors and abort without wasting time.
- metadata_section ms(struct_ver, map_epoch, info, log, past_intervals);
+ metadata_section ms(struct_ver, map_epoch, info, log, past_intervals, divergent_priors);
ret = add_osdmap(fs, ms);
if (ret)
return ret;
bytes = ebl.read_fd(file_fd, super_header::FIXED_LENGTH);
if ((size_t)bytes != super_header::FIXED_LENGTH) {
cerr << "Unexpected EOF" << std::endl;
- return EFAULT;
+ return -EFAULT;
}
decode(ebliter);
bytes = bl->read_fd(fd, hdr.size);
if (bytes != hdr.size) {
cerr << "Unexpected EOF" << std::endl;
- return EFAULT;
+ return -EFAULT;
}
if (hdr.size > 0) {
done = true;
break;
default:
- return EFAULT;
+ return -EFAULT;
}
}
return 0;
}
-int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl)
+int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl, bool no_overwrite)
{
bufferlist::iterator ebliter = bl.begin();
object_begin ob;
ob.decode(ebliter);
map<string,bufferptr>::iterator i;
bufferlist abl;
+ bool skipping;
data_section ds;
attr_section as;
ioctx.set_namespace(ob.hoid.hobj.get_namespace());
string msg("Write");
- int ret = ioctx.create(ob.hoid.hobj.oid.name, true);
- if (ret && ret != -EEXIST) {
- cerr << "create failed: " << cpp_strerror(ret) << std::endl;
- return ret;
- }
- if (ret == -EEXIST) {
- msg = "***Overwrite***";
- ret = ioctx.remove(ob.hoid.hobj.oid.name);
- if (ret < 0) {
- cerr << "remove failed: " << cpp_strerror(ret) << std::endl;
- return ret;
+ skipping = false;
+ if (dry_run) {
+ uint64_t psize;
+ time_t pmtime;
+ int ret = ioctx.stat(ob.hoid.hobj.oid.name, &psize, &pmtime);
+ if (ret == 0) {
+ if (no_overwrite)
+ // Could set skipping, but dry-run doesn't change anything either
+ msg = "Skipping existing";
+ else
+ msg = "***Overwrite***";
}
- ret = ioctx.create(ob.hoid.hobj.oid.name, true);
- if (ret < 0) {
+ } else {
+ int ret = ioctx.create(ob.hoid.hobj.oid.name, true);
+ if (ret && ret != -EEXIST) {
cerr << "create failed: " << cpp_strerror(ret) << std::endl;
return ret;
}
+ if (ret == -EEXIST) {
+ if (no_overwrite) {
+ msg = "Skipping existing";
+ skipping = true;
+ } else {
+ msg = "***Overwrite***";
+ ret = ioctx.remove(ob.hoid.hobj.oid.name);
+ if (ret < 0) {
+ cerr << "remove failed: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ ret = ioctx.create(ob.hoid.hobj.oid.name, true);
+ // If object re-appeared after removal, let's just skip it
+ if (ret == -EEXIST) {
+ skipping = true;
+ msg = "Skipping in-use object";
+ ret = 0;
+ }
+ if (ret < 0) {
+ cerr << "create failed: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ }
+ }
}
cout << msg << " " << ob.hoid << std::endl;
if (need_align) {
if (ds.offset != in_offset) {
cerr << "Discontiguous object data in export" << std::endl;
- return EFAULT;
+ return -EFAULT;
}
assert(ds.databl.length() == ds.len);
databl.claim_append(ds.databl);
if (databl.length() >= alignment) {
uint64_t rndlen = uint64_t(databl.length() / alignment) * alignment;
if (debug) cerr << "write offset=" << out_offset << " len=" << rndlen << std::endl;
- ret = ioctx.write(ob.hoid.hobj.oid.name, databl, rndlen, out_offset);
- if (ret) {
- cerr << "write failed: " << cpp_strerror(ret) << std::endl;
- return ret;
+ if (!dry_run && !skipping) {
+ ret = ioctx.write(ob.hoid.hobj.oid.name, databl, rndlen, out_offset);
+ if (ret) {
+ cerr << "write failed: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
}
out_offset += rndlen;
bufferlist n;
}
break;
}
- ret = ioctx.write(ob.hoid.hobj.oid.name, ds.databl, ds.len, ds.offset);
- if (ret) {
- cerr << "write failed: " << cpp_strerror(ret) << std::endl;
- return ret;
+ if (!dry_run && !skipping) {
+ ret = ioctx.write(ob.hoid.hobj.oid.name, ds.databl, ds.len, ds.offset);
+ if (ret) {
+ cerr << "write failed: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
}
break;
case TYPE_ATTRS:
if (debug)
cerr << "\tattrs: len " << as.data.size() << std::endl;
+ if (dry_run || skipping)
+ break;
for (i = as.data.begin(); i != as.data.end(); ++i) {
- if (i->first == "_" || i->first == "snapset")
+      // The user xattrs we want all begin with '_' and are longer than one character.
+      // Drop the bare "_" key and any attribute that does not start with '_'.
+ if (i->first == "_" || i->first[0] != '_')
continue;
abl.clear();
abl.push_front(i->second);
if (debug)
cerr << "\tomap header: " << string(oh.hdr.c_str(), oh.hdr.length())
<< std::endl;
+ if (dry_run || skipping)
+ break;
ret = ioctx.omap_set_header(ob.hoid.hobj.oid.name, oh.hdr);
if (ret) {
cerr << "omap_set_header failed: " << cpp_strerror(ret) << std::endl;
if (debug)
cerr << "\tomap: size " << os.omap.size() << std::endl;
+ if (dry_run || skipping)
+ break;
ret = ioctx.omap_set(ob.hoid.hobj.oid.name, os.omap);
if (ret) {
cerr << "omap_set failed: " << cpp_strerror(ret) << std::endl;
}
break;
case TYPE_OBJECT_END:
+ done = true;
if (need_align && databl.length() > 0) {
assert(databl.length() < alignment);
if (debug) cerr << "END write offset=" << out_offset << " len=" << databl.length() << std::endl;
+ if (dry_run || skipping)
+ break;
ret = ioctx.write(ob.hoid.hobj.oid.name, databl, databl.length(), out_offset);
if (ret) {
cerr << "write failed: " << cpp_strerror(ret) << std::endl;
return ret;
}
}
- done = true;
break;
default:
- return EFAULT;
+ return -EFAULT;
}
}
return 0;
}
-int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap)
+int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap,
+ bool *skipped_objects)
{
ObjectStore::Transaction tran;
ObjectStore::Transaction *t = &tran;
coll.is_pg_prefix(pg);
SnapMapper mapper(&driver, 0, 0, 0, pg.shard);
+ if (ob.hoid.hobj.is_temp()) {
+ cerr << "ERROR: Export contains temporary object '" << ob.hoid << "'" << std::endl;
+ return -EFAULT;
+ }
assert(g_ceph_context);
if (ob.hoid.hobj.nspace != g_ceph_context->_conf->osd_hit_set_namespace) {
object_t oid = ob.hoid.hobj.oid;
object_locator_t loc(ob.hoid.hobj);
pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
-
+
spg_t coll_pgid;
snapid_t coll_snap;
if (coll.is_pg(coll_pgid, coll_snap) == false) {
cerr << "INTERNAL ERROR: Bad collection during import" << std::endl;
- return 1;
+ return -EFAULT;
}
if (coll_pgid.shard != ob.hoid.shard_id) {
- cerr << "INTERNAL ERROR: Importing shard " << coll_pgid.shard
+ cerr << "INTERNAL ERROR: Importing shard " << coll_pgid.shard
<< " but object shard is " << ob.hoid.shard_id << std::endl;
- return 1;
+ return -EFAULT;
}
-
+
if (coll_pgid.pgid != pgid) {
- cerr << "Skipping object '" << ob.hoid << "' which no longer belongs in exported pg" << std::endl;
+ cerr << "Skipping object '" << ob.hoid << "' which belongs in pg " << pgid << std::endl;
+ *skipped_objects = true;
skip_object(bl);
return 0;
}
}
- t->touch(coll, ob.hoid);
+ if (!dry_run)
+ t->touch(coll, ob.hoid);
cout << "Write " << ob.hoid << std::endl;
}
switch(type) {
case TYPE_DATA:
+ if (dry_run) break;
ret = get_data(store, coll, ob.hoid, t, ebl);
if (ret) return ret;
break;
case TYPE_ATTRS:
+ if (dry_run) break;
ret = get_attrs(store, coll, ob.hoid, t, ebl, driver, mapper);
if (ret) return ret;
break;
case TYPE_OMAP_HDR:
+ if (dry_run) break;
ret = get_omap_hdr(store, coll, ob.hoid, t, ebl);
if (ret) return ret;
break;
case TYPE_OMAP:
+ if (dry_run) break;
ret = get_omap(store, coll, ob.hoid, t, ebl);
if (ret) return ret;
break;
done = true;
break;
default:
- return EFAULT;
+ cerr << "Unknown section type " << type << std::endl;
+ return -EFAULT;
}
}
- store->apply_transaction(*t);
+ if (!dry_run)
+ store->apply_transaction(*t);
return 0;
}
int get_pg_metadata(ObjectStore *store, bufferlist &bl, metadata_section &ms,
- const OSDSuperblock& sb, OSDMap& curmap)
+ const OSDSuperblock& sb, OSDMap& curmap, spg_t pgid)
{
bufferlist::iterator ebliter = bl.begin();
ms.decode(ebliter);
+ spg_t old_pgid = ms.info.pgid;
+ ms.info.pgid = pgid;
#if DIAGNOSTIC
Formatter *formatter = new JSONFormatter(true);
+ cout << "export pgid " << old_pgid << std::endl;
cout << "struct_v " << (int)ms.struct_ver << std::endl;
cout << "map epoch " << ms.map_epoch << std::endl;
formatter->close_section();
formatter->flush(cout);
cout << std::endl;
+
+ formatter->open_array_section("divergent_priors");
+ for (map<eversion_t, hobject_t>::iterator it = ms.divergent_priors.begin();
+       it != ms.divergent_priors.end(); ++it) {
+ formatter->open_object_section("item");
+ formatter->dump_stream("eversion") << it->first;
+ formatter->dump_stream("hobject") << it->second;
+ formatter->close_section();
+ }
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
#endif
+ if (ms.osdmap.get_epoch() != 0 && ms.map_epoch != ms.osdmap.get_epoch()) {
+ cerr << "FATAL: Invalid OSDMap epoch in export data" << std::endl;
+ return -EFAULT;
+ }
+
if (ms.map_epoch > sb.current_epoch) {
- cerr << "ERROR: Export map_epoch " << ms.map_epoch << " > osd epoch " << sb.current_epoch << std::endl;
- return 1;
+ cerr << "ERROR: Export PG's map_epoch " << ms.map_epoch << " > OSD's epoch " << sb.current_epoch << std::endl;
+ cerr << "The OSD you are using is older than the exported PG" << std::endl;
+ cerr << "Either use another OSD or join selected OSD to cluster to update it first" << std::endl;
+ return -EINVAL;
}
- // If the osdmap was present in the metadata we can check for splits.
// Pool verified to exist for call to get_pg_num().
- if (ms.map_epoch < sb.current_epoch) {
- bool found_map = false;
+ unsigned new_pg_num = curmap.get_pg_num(pgid.pgid.pool());
+
+ if (pgid.pgid.ps() >= new_pg_num) {
+ cerr << "Illegal pgid, the seed is larger than current pg_num" << std::endl;
+ return -EINVAL;
+ }
+
+  // Old exports didn't include an OSDMap; see if we have a copy locally
+ if (ms.osdmap.get_epoch() == 0) {
OSDMap findmap;
bufferlist findmap_bl;
int ret = get_osdmap(store, ms.map_epoch, findmap, findmap_bl);
- if (ret == 0)
- found_map = true;
-
- // Old export didn't include OSDMap
- if (ms.osdmap.get_epoch() == 0) {
- // If we found the map locally and an older export didn't have it,
- // then we'll use the local one.
- if (found_map) {
- ms.osdmap = findmap;
- } else {
- cerr << "WARNING: No OSDMap in old export,"
- " some objects may be ignored due to a split" << std::endl;
- }
+ if (ret == 0) {
+ ms.osdmap = findmap;
+ } else {
+ cerr << "WARNING: No OSDMap in old export,"
+ " some objects may be ignored due to a split" << std::endl;
}
+ }
+
+  // old_pg_num stays 0 in the unusual case that the OSDMap is neither in the
+  // export nor available as a local copy.
+ unsigned old_pg_num = 0;
+ if (ms.osdmap.get_epoch() != 0)
+ old_pg_num = ms.osdmap.get_pg_num(pgid.pgid.pool());
+
+ if (debug) {
+ cerr << "old_pg_num " << old_pg_num << std::endl;
+ cerr << "new_pg_num " << new_pg_num << std::endl;
+ cerr << ms.osdmap << std::endl;
+ cerr << curmap << std::endl;
+ }
- // If OSDMap is available check for splits
- if (ms.osdmap.get_epoch()) {
- spg_t parent(ms.info.pgid);
- if (parent.is_split(ms.osdmap.get_pg_num(ms.info.pgid.pgid.m_pool),
- curmap.get_pg_num(ms.info.pgid.pgid.m_pool), NULL)) {
- cerr << "WARNING: Split occurred, some objects may be ignored" << std::endl;
+ // If we have managed to have a good OSDMap we can do these checks
+ if (old_pg_num) {
+ if (old_pgid.pgid.ps() >= old_pg_num) {
+ cerr << "FATAL: pgid invalid for original map epoch" << std::endl;
+ return -EFAULT;
+ }
+ if (pgid.pgid.ps() >= old_pg_num) {
+ cout << "NOTICE: Post split pgid specified" << std::endl;
+ } else {
+ spg_t parent(pgid);
+ if (parent.is_split(old_pg_num, new_pg_num, NULL)) {
+ cerr << "WARNING: Split occurred, some objects may be ignored" << std::endl;
}
}
}
+ if (debug) {
+ cerr << "Import pgid " << ms.info.pgid << std::endl;
+ cerr << "Clearing past_intervals " << ms.past_intervals << std::endl;
+ cerr << "Zero same_interval_since " << ms.info.history.same_interval_since << std::endl;
+ }
+
+ // Let osd recompute past_intervals and same_interval_since
ms.past_intervals.clear();
- ms.info.history.same_interval_since = ms.map_epoch = sb.current_epoch;
+ ms.info.history.same_interval_since = 0;
+
+ if (debug)
+ cerr << "Changing pg epoch " << ms.map_epoch << " to " << sb.current_epoch << std::endl;
+
+ ms.map_epoch = sb.current_epoch;
return 0;
}
-int do_import_rados(string pool)
+int do_import_rados(string pool, bool no_overwrite)
{
bufferlist ebl;
pg_info_t info;
if (sh.magic != super_header::super_magic) {
cerr << "Invalid magic number" << std::endl;
- return EFAULT;
+ return -EFAULT;
}
if (sh.version > super_header::super_ver) {
cerr << "Can't handle export format version=" << sh.version << std::endl;
- return EINVAL;
+ return -EINVAL;
}
//First section must be TYPE_PG_BEGIN
if (ret)
return ret;
if (type != TYPE_PG_BEGIN) {
- return EFAULT;
+ cerr << "Invalid first section type " << type << std::endl;
+ return -EFAULT;
}
bufferlist::iterator ebliter = ebl.begin();
if (!pgid.is_no_shard()) {
cerr << "Importing Erasure Coded shard is not supported" << std::endl;
- exit(1);
+ myexit(1);
}
if (debug) {
if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) {
cerr << "Export has incompatible features set "
<< pgb.superblock.compat_features << std::endl;
- return 1;
+ return -EINVAL;
}
#endif
}
switch(type) {
case TYPE_OBJECT_BEGIN:
- ret = get_object_rados(ioctx, ebl);
+ ret = get_object_rados(ioctx, ebl, no_overwrite);
if (ret) return ret;
break;
case TYPE_PG_METADATA:
done = true;
break;
default:
- return EFAULT;
+ return -EFAULT;
}
}
return 0;
}
-int do_import(ObjectStore *store, OSDSuperblock& sb)
+
+typedef map<eversion_t, hobject_t> divergent_priors_t;
+
+// out: divergent priors from "in" that still apply to import_pgid under curmap
+// reject: divergent priors from "in" that were filtered out
+void filter_divergent_priors(spg_t import_pgid, const OSDMap &curmap,
+ const string &hit_set_namespace, const divergent_priors_t &in,
+ divergent_priors_t &out, divergent_priors_t &reject)
+{
+ out.clear();
+ reject.clear();
+
+ for (divergent_priors_t::const_iterator i = in.begin();
+ i != in.end(); ++i) {
+
+ // Reject divergent priors for temporary objects
+ if (i->second.is_temp()) {
+ reject.insert(*i);
+ continue;
+ }
+
+ if (i->second.nspace != hit_set_namespace) {
+ object_t oid = i->second.oid;
+ object_locator_t loc(i->second);
+ pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
+ pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
+
+ if (import_pgid.pgid == pgid) {
+ out.insert(*i);
+ } else {
+ reject.insert(*i);
+ }
+ } else {
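+      // Objects in the hit-set namespace are internal; keep them without remapping checks.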
+ out.insert(*i);
+ }
+ }
+}
+
+int do_import(ObjectStore *store, OSDSuperblock& sb, bool force, string pgidstr)
{
bufferlist ebl;
pg_info_t info;
PGLog::IndexedLog log;
+ bool skipped_objects = false;
- finish_remove_pgs(store);
+ if (!dry_run)
+ finish_remove_pgs(store);
int ret = sh.read_super();
if (ret)
if (sh.magic != super_header::super_magic) {
cerr << "Invalid magic number" << std::endl;
- return EFAULT;
+ return -EFAULT;
}
if (sh.version > super_header::super_ver) {
cerr << "Can't handle export format version=" << sh.version << std::endl;
- return EINVAL;
+ return -EINVAL;
}
//First section must be TYPE_PG_BEGIN
if (ret)
return ret;
if (type != TYPE_PG_BEGIN) {
- return EFAULT;
+ cerr << "Invalid first section type " << type << std::endl;
+ return -EFAULT;
}
bufferlist::iterator ebliter = ebl.begin();
pg_begin pgb;
pgb.decode(ebliter);
spg_t pgid = pgb.pgid;
+ spg_t orig_pgid = pgid;
+
+ if (pgidstr.length()) {
+ spg_t user_pgid;
+
+ bool ok = user_pgid.parse(pgidstr.c_str());
+ // This succeeded in main() already
+ assert(ok);
+ if (pgid != user_pgid) {
+ if (pgid.pool() != user_pgid.pool()) {
+ cerr << "Can't specify a different pgid pool, must be " << pgid.pool() << std::endl;
+ return -EINVAL;
+ }
+ if (pgid.is_no_shard() && !user_pgid.is_no_shard()) {
+ cerr << "Can't specify a sharded pgid with a non-sharded export" << std::endl;
+ return -EINVAL;
+ }
+ // Get shard from export information if not specified
+ if (!pgid.is_no_shard() && user_pgid.is_no_shard()) {
+ user_pgid.shard = pgid.shard;
+ }
+ if (pgid.shard != user_pgid.shard) {
+ cerr << "Can't specify a different shard, must be " << pgid.shard << std::endl;
+ return -EINVAL;
+ }
+ pgid = user_pgid;
+ }
+ }
if (!pgb.superblock.cluster_fsid.is_zero()
&& pgb.superblock.cluster_fsid != sb.cluster_fsid) {
cerr << "Export came from different cluster with fsid "
<< pgb.superblock.cluster_fsid << std::endl;
- return 1;
+ return -EINVAL;
}
if (debug) {
cerr << "OSD requires sharding to be enabled" << std::endl;
cerr << std::endl;
cerr << "If you wish to import, first do 'ceph-objectstore-tool...--op set-allow-sharded-objects'" << std::endl;
+ return -EINVAL;
}
- return 11; // Assume no +EAGAIN gets to end of main() until we clean up error code handling
+ // Let them import if they specify the --force option
+ if (!force)
+ return 11; // Positive return means exit status
}
// Don't import if pool no longer exists
if (!curmap.have_pg_pool(pgid.pgid.m_pool)) {
cerr << "Pool " << pgid.pgid.m_pool << " no longer exists" << std::endl;
// Special exit code for this error, used by test code
- return 10; // Assume no +ECHILD gets to end of main() until we clean up error code handling
+ return 10; // Positive return means exit status
}
ghobject_t pgmeta_oid = pgid.make_pgmeta_oid();
coll_t coll(pgid);
if (store->collection_exists(coll)) {
cerr << "pgid " << pgid << " already exists" << std::endl;
- return 1;
+ return -EEXIST;
}
- ObjectStore::Transaction *t = new ObjectStore::Transaction;
- PG::_create(*t, pgid);
- PG::_init(*t, pgid, NULL);
+ if (!dry_run) {
+ ObjectStore::Transaction *t = new ObjectStore::Transaction;
+ PG::_create(*t, pgid);
+ PG::_init(*t, pgid, NULL);
- // mark this coll for removal until we're done
- map<string,bufferlist> values;
- ::encode((char)1, values["_remove"]);
- t->omap_setkeys(coll, pgid.make_pgmeta_oid(), values);
+ // mark this coll for removal until we're done
+ map<string,bufferlist> values;
+ ::encode((char)1, values["_remove"]);
+ t->omap_setkeys(coll, pgid.make_pgmeta_oid(), values);
- store->apply_transaction(*t);
- delete t;
+ store->apply_transaction(*t);
+ delete t;
+ }
- cout << "Importing pgid " << pgid << std::endl;
+ cout << "Importing pgid " << pgid;
+ if (orig_pgid != pgid) {
+ cout << " exported as " << orig_pgid;
+ }
+ cout << std::endl;
bool done = false;
bool found_metadata = false;
}
switch(type) {
case TYPE_OBJECT_BEGIN:
- ret = get_object(store, coll, ebl, curmap);
+ ret = get_object(store, coll, ebl, curmap, &skipped_objects);
if (ret) return ret;
break;
case TYPE_PG_METADATA:
- ret = get_pg_metadata(store, ebl, ms, sb, curmap);
+ ret = get_pg_metadata(store, ebl, ms, sb, curmap, pgid);
if (ret) return ret;
found_metadata = true;
break;
done = true;
break;
default:
- return EFAULT;
+ cerr << "Unknown section type " << type << std::endl;
+ return -EFAULT;
}
}
if (!found_metadata) {
cerr << "Missing metadata section" << std::endl;
- return EFAULT;
- }
+ return -EFAULT;
+ }
+
+ ObjectStore::Transaction t;
+ if (!dry_run) {
+ pg_log_t newlog, reject;
+ pg_log_t::filter_log(pgid, curmap, g_ceph_context->_conf->osd_hit_set_namespace,
+ ms.log, newlog, reject);
+ if (debug) {
+ for (list<pg_log_entry_t>::iterator i = newlog.log.begin();
+ i != newlog.log.end(); ++i)
+ cerr << "Keeping log entry " << *i << std::endl;
+ for (list<pg_log_entry_t>::iterator i = reject.log.begin();
+ i != reject.log.end(); ++i)
+ cerr << "Skipping log entry " << *i << std::endl;
+ }
- pg_log_t newlog, reject;
- pg_log_t::filter_log(pgid, curmap, g_ceph_context->_conf->osd_hit_set_namespace,
- ms.log, newlog, reject);
- if (debug) {
- for (list<pg_log_entry_t>::iterator i = newlog.log.begin();
- i != newlog.log.end(); ++i)
- cerr << "Keeping log entry " << *i << std::endl;
- for (list<pg_log_entry_t>::iterator i = reject.log.begin();
- i != reject.log.end(); ++i)
- cerr << "Skipping log entry " << *i << std::endl;
- }
+ divergent_priors_t newdp, rejectdp;
+ filter_divergent_priors(pgid, curmap, g_ceph_context->_conf->osd_hit_set_namespace,
+ ms.divergent_priors, newdp, rejectdp);
+ ms.divergent_priors = newdp;
+ if (debug) {
+ for (divergent_priors_t::iterator i = newdp.begin();
+ i != newdp.end(); ++i)
+ cerr << "Keeping divergent_prior " << *i << std::endl;
+ for (divergent_priors_t::iterator i = rejectdp.begin();
+ i != rejectdp.end(); ++i)
+ cerr << "Skipping divergent_prior " << *i << std::endl;
+ }
- t = new ObjectStore::Transaction;
- ret = write_pg(*t, ms.map_epoch, ms.info, newlog, ms.past_intervals);
- if (ret) return ret;
+ if (debug) {
+ pg_missing_t missing;
+ Formatter *formatter = Formatter::create("json-pretty");
+ dump_log(formatter, cerr, newlog, missing, ms.divergent_priors);
+ delete formatter;
+ }
+
+    // As with a split, invalidate the stats since the object count has changed
+ if (skipped_objects)
+ ms.info.stats.stats_invalid = true;
+
+ ret = write_pg(t, ms.map_epoch, ms.info, newlog, ms.past_intervals, ms.divergent_priors);
+ if (ret) return ret;
+ }
// done, clear removal flag
if (debug)
cerr << "done, clearing removal flag" << std::endl;
- set<string> remove;
- remove.insert("_remove");
- t->omap_rmkeys(coll, pgid.make_pgmeta_oid(), remove);
- store->apply_transaction(*t);
- delete t;
+
+ if (!dry_run) {
+ set<string> remove;
+ remove.insert("_remove");
+ t.omap_rmkeys(coll, pgid.make_pgmeta_oid(), remove);
+ store->apply_transaction(t);
+ }
return 0;
}
-int do_list(ObjectStore *store, string pgidstr, string object, Formatter *formatter, bool debug, bool human_readable)
+int do_list(ObjectStore *store, string pgidstr, string object,
+ Formatter *formatter, bool debug, bool human_readable, bool head)
{
int r;
- lookup_ghobject lookup(object);
+ lookup_ghobject lookup(object, head);
if (pgidstr.length() > 0) {
- spg_t pgid;
- pgid.parse(pgidstr.c_str());
- r = action_on_all_objects_in_pg(store, coll_t(pgid), lookup, debug);
+ r = action_on_all_objects_in_pg(store, pgidstr, lookup, debug);
} else {
r = action_on_all_objects(store, lookup, debug);
}
return r;
lookup.dump(formatter, human_readable);
formatter->flush(cout);
- cout << std::endl;
+ return 0;
+}
+
+int do_meta(ObjectStore *store, string object, Formatter *formatter, bool debug, bool human_readable)
+{
+ int r;
+ lookup_ghobject lookup(object);
+ r = action_on_all_objects_in_exact_pg(store, META_COLL, lookup, debug);
+ if (r)
+ return r;
+ lookup.dump(formatter, human_readable);
+ formatter->flush(cout);
return 0;
}
int r = store->stat(coll, ghobj, &st);
if (r < 0) {
- cerr << "remove: " << cpp_strerror(-r) << std::endl;
+ cerr << "remove: " << cpp_strerror(r) << std::endl;
return r;
}
+ cout << "remove " << ghobj << std::endl;
+ if (dry_run)
+ return 0;
ObjectStore::Transaction *t = new ObjectStore::Transaction;
OSDriver::OSTransaction _t(driver.get_transaction(t));
- cout << "remove " << ghobj << std::endl;
r = mapper.remove_oid(ghobj.hobj, &_t);
- if (r != 0 && r != -ENOENT) {
- cerr << "remove_oid returned " << cpp_strerror(-r) << std::endl;
+ if (r < 0 && r != -ENOENT) {
+ cerr << "remove_oid returned " << cpp_strerror(r) << std::endl;
return r;
}
map<string,bufferptr> aset;
int r = store->getattrs(coll, ghobj, aset);
if (r < 0) {
- cerr << "getattrs: " << cpp_strerror(-r) << std::endl;
+ cerr << "getattrs: " << cpp_strerror(r) << std::endl;
return r;
}
int ret = store->stat(coll, ghobj, &st);
if (ret < 0) {
- cerr << "get-bytes: " << cpp_strerror(-ret) << std::endl;
- return 1;
+ cerr << "get-bytes: " << cpp_strerror(ret) << std::endl;
+ return ret;
}
total = st.st_size;
ret = write(fd, rawdatabl.c_str(), ret);
if (ret == -1) {
perror("write");
- return 1;
+ return -errno;
}
}
if (debug)
cerr << "Write " << ghobj << std::endl;
- t->touch(coll, ghobj);
- t->truncate(coll, ghobj, 0);
+ if (!dry_run) {
+ t->touch(coll, ghobj);
+ t->truncate(coll, ghobj, 0);
+ }
uint64_t offset = 0;
bufferlist rawdatabl;
rawdatabl.clear();
ssize_t bytes = rawdatabl.read_fd(fd, max_read);
if (bytes < 0) {
- cerr << "read_fd error " << cpp_strerror(-bytes) << std::endl;
- return 1;
+ cerr << "read_fd error " << cpp_strerror(bytes) << std::endl;
+ return bytes;
}
if (bytes == 0)
if (debug)
cerr << "\tdata: offset " << offset << " bytes " << bytes << std::endl;
- t->write(coll, ghobj, offset, bytes, rawdatabl);
+ if (!dry_run)
+ t->write(coll, ghobj, offset, bytes, rawdatabl);
offset += bytes;
// XXX: Should we apply_transaction() every once in a while for very large files
} while(true);
- store->apply_transaction(*t);
+ if (!dry_run)
+ store->apply_transaction(*t);
return 0;
}
int r = store->getattr(coll, ghobj, key.c_str(), bp);
if (r < 0) {
- cerr << "getattr: " << cpp_strerror(-r) << std::endl;
+ cerr << "getattr: " << cpp_strerror(r) << std::endl;
return r;
}
if (debug)
cerr << "Setattr " << ghobj << std::endl;
- if (get_fd_data(fd, bl))
- return 1;
+ int ret = get_fd_data(fd, bl);
+ if (ret < 0)
+ return ret;
+
+ if (dry_run)
+ return 0;
t->touch(coll, ghobj);
if (debug)
cerr << "Rmattr " << ghobj << std::endl;
+ if (dry_run)
+ return 0;
+
t->rmattr(coll, ghobj, key);
store->apply_transaction(*t);
int r = store->omap_get_values(coll, ghobj, keys, &out);
if (r < 0) {
- cerr << "omap_get_values: " << cpp_strerror(-r) << std::endl;
+ cerr << "omap_get_values: " << cpp_strerror(r) << std::endl;
return r;
}
if (debug)
cerr << "Set_omap " << ghobj << std::endl;
- if (get_fd_data(fd, valbl))
- return 1;
+ int ret = get_fd_data(fd, valbl);
+ if (ret < 0)
+ return ret;
attrset.insert(pair<string, bufferlist>(key, valbl));
+ if (dry_run)
+ return 0;
+
t->touch(coll, ghobj);
t->omap_setkeys(coll, ghobj, attrset);
if (debug)
cerr << "Rm_omap " << ghobj << std::endl;
+ if (dry_run)
+ return 0;
+
t->omap_rmkeys(coll, ghobj, keys);
store->apply_transaction(*t);
int r = store->omap_get_header(coll, ghobj, &hdrbl, true);
if (r < 0) {
- cerr << "omap_get_header: " << cpp_strerror(-r) << std::endl;
+ cerr << "omap_get_header: " << cpp_strerror(r) << std::endl;
return r;
}
if (debug)
cerr << "Omap_setheader " << ghobj << std::endl;
- if (get_fd_data(fd, hdrbl))
- return 1;
+ int ret = get_fd_data(fd, hdrbl);
+ if (ret)
+ return ret;
+
+ if (dry_run)
+ return 0;
t->touch(coll, ghobj);
return 0;
}
-struct do_list_lost : public action_on_object_t {
- virtual int call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) {
- if (oi.is_lost())
- cout << coll << "/" << ghobj << " is lost" << std::endl;
- return 0;
- }
-};
-
struct do_fix_lost : public action_on_object_t {
virtual int call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) {
if (oi.is_lost()) {
- cout << coll << "/" << ghobj << " is lost, fixing" << std::endl;
+ cout << coll << "/" << ghobj << " is lost";
+ if (!dry_run)
+ cout << ", fixing";
+ cout << std::endl;
+ if (dry_run)
+ return 0;
oi.clear_flag(object_info_t::FLAG_LOST);
bufferlist bl;
::encode(oi, bl);
}
};
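+// Read and decode an object's SnapSet from its SS_ATTR xattr.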
+int get_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, SnapSet &ss, bool silent = false)
+{
+ bufferlist attr;
+ int r = store->getattr(coll, ghobj, SS_ATTR, attr);
+ if (r < 0) {
+ if (!silent)
+ cerr << "Error getting snapset on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ bufferlist::iterator bp = attr.begin();
+ try {
+ ::decode(ss, bp);
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error decoding snapset on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
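+// Dump an object's id, object_info_t, stat() results and SnapSet (when present)
+// via the given Formatter.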
+int print_obj_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter)
+{
+ int r = 0;
+ formatter->open_object_section("obj");
+ formatter->open_object_section("id");
+ ghobj.dump(formatter);
+ formatter->close_section();
+
+ bufferlist attr;
+ int gr = store->getattr(coll, ghobj, OI_ATTR, attr);
+ if (gr < 0) {
+ r = gr;
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ } else {
+ object_info_t oi;
+ bufferlist::iterator bp = attr.begin();
+ try {
+ ::decode(oi, bp);
+ formatter->open_object_section("info");
+ oi.dump(formatter);
+ formatter->close_section();
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error decoding attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ }
+ }
+ struct stat st;
+ int sr = store->stat(coll, ghobj, &st, true);
+ if (sr < 0) {
+ r = sr;
+ cerr << "Error stat on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ } else {
+ formatter->open_object_section("stat");
+ formatter->dump_int("size", st.st_size);
+ formatter->dump_int("blksize", st.st_blksize);
+ formatter->dump_int("blocks", st.st_blocks);
+ formatter->dump_int("nlink", st.st_nlink);
+ formatter->close_section();
+ }
+
+ if (ghobj.hobj.has_snapset()) {
+ SnapSet ss;
+ int snr = get_snapset(store, coll, ghobj, ss);
+ if (snr < 0) {
+ r = snr;
+ } else {
+ formatter->open_object_section("SnapSet");
+ ss.dump(formatter);
+ formatter->close_section();
+ }
+ }
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ return r;
+}
+
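+// Set an object's size consistently: the object_info_t, the on-disk data (truncate)
+// and, for a clone, the clone_size recorded in the head/snapdir SnapSet.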
+int set_size(ObjectStore *store, coll_t coll, ghobject_t &ghobj, uint64_t setsize, Formatter* formatter)
+{
+ if (ghobj.hobj.is_snapdir()) {
+ cerr << "Can't set the size of a snapdir" << std::endl;
+ return -EINVAL;
+ }
+ bufferlist attr;
+ int r = store->getattr(coll, ghobj, OI_ATTR, attr);
+ if (r < 0) {
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ object_info_t oi;
+ bufferlist::iterator bp = attr.begin();
+ try {
+ ::decode(oi, bp);
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ struct stat st;
+ r = store->stat(coll, ghobj, &st, true);
+ if (r < 0) {
+ cerr << "Error stat on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ }
+ ghobject_t head(ghobj);
+ SnapSet ss;
+ bool found_head = true;
+ map<snapid_t, uint64_t>::iterator csi;
+ bool is_snap = ghobj.hobj.is_snap();
+ if (is_snap) {
+ head.hobj = head.hobj.get_head();
+ r = get_snapset(store, coll, head, ss, true);
+ if (r < 0 && r != -ENOENT) {
+      // get_snapset() was asked to be silent, so report any error other than -ENOENT here
+ cerr << "Error getting snapset on : " << make_pair(coll, head) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ if (r == -ENOENT) {
+ head.hobj = head.hobj.get_snapdir();
+ r = get_snapset(store, coll, head, ss);
+ if (r < 0)
+ return r;
+ found_head = false;
+ } else {
+ found_head = true;
+ }
+ csi = ss.clone_size.find(ghobj.hobj.snap);
+ if (csi == ss.clone_size.end()) {
+ cerr << "SnapSet is missing clone_size for snap " << ghobj.hobj.snap << std::endl;
+ return -EINVAL;
+ }
+ }
+ if ((uint64_t)st.st_size == setsize && oi.size == setsize
+ && (!is_snap || csi->second == setsize)) {
+ cout << "Size of object is already " << setsize << std::endl;
+ return 0;
+ }
+ cout << "Setting size to " << setsize << ", stat size " << st.st_size
+ << ", obj info size " << oi.size;
+ if (is_snap) {
+ cout << ", " << (found_head ? "head" : "snapdir")
+ << " clone_size " << csi->second;
+ csi->second = setsize;
+ }
+ cout << std::endl;
+ if (!dry_run) {
+ attr.clear();
+ oi.size = setsize;
+ ::encode(oi, attr);
+ ObjectStore::Transaction t;
+ t.setattr(coll, ghobj, OI_ATTR, attr);
+ t.truncate(coll, ghobj, setsize);
+ if (is_snap) {
+ bufferlist snapattr;
+ snapattr.clear();
+ ::encode(ss, snapattr);
+ t.setattr(coll, head, SS_ATTR, snapattr);
+ }
+ r = store->apply_transaction(t);
+ if (r < 0) {
+ cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
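+// Corrupt or clear selected parts of an object's SnapSet for testing; the fields
+// touched depend on arg (see the cases below).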
+int clear_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj,
+ string arg)
+{
+ SnapSet ss;
+ int ret = get_snapset(store, coll, ghobj, ss);
+ if (ret < 0)
+ return ret;
+
+ // Use "head" to set head_exists incorrectly
+ if (arg == "corrupt" || arg == "head")
+ ss.head_exists = !ghobj.hobj.is_head();
+ else if (ss.head_exists != ghobj.hobj.is_head()) {
+ cerr << "Correcting head_exists, set to "
+ << (ghobj.hobj.is_head() ? "true" : "false") << std::endl;
+ ss.head_exists = ghobj.hobj.is_head();
+ }
+ // Use "corrupt" to clear entire SnapSet
+ // Use "seq" to just corrupt SnapSet.seq
+ if (arg == "corrupt" || arg == "seq")
+ ss.seq = 0;
+ // Use "snaps" to just clear SnapSet.snaps
+ if (arg == "corrupt" || arg == "snaps")
+ ss.snaps.clear();
+ // By default just clear clone, clone_overlap and clone_size
+ if (arg == "corrupt")
+ arg = "";
+ if (arg == "" || arg == "clones")
+ ss.clones.clear();
+ if (arg == "" || arg == "clone_overlap")
+ ss.clone_overlap.clear();
+ if (arg == "" || arg == "clone_size")
+ ss.clone_size.clear();
+ // Break all clone sizes by adding 1
+ if (arg == "size") {
+ for (map<snapid_t, uint64_t>::iterator i = ss.clone_size.begin();
+ i != ss.clone_size.end(); ++i)
+ ++(i->second);
+ }
+
+ if (!dry_run) {
+ bufferlist bl;
+ ::encode(ss, bl);
+ ObjectStore::Transaction t;
+ t.setattr(coll, ghobj, SS_ATTR, bl);
+ int r = store->apply_transaction(t);
+ if (r < 0) {
+ cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
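+// find() overloads so the remove_from() template below works on both the clones
+// vector and the clone_overlap/clone_size maps.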
+vector<snapid_t>::iterator find(vector<snapid_t> &v, snapid_t clid)
+{
+ return std::find(v.begin(), v.end(), clid);
+}
+
+map<snapid_t, interval_set<uint64_t> >::iterator
+find(map<snapid_t, interval_set<uint64_t> > &m, snapid_t clid)
+{
+ return m.find(clid);
+}
+
+map<snapid_t, uint64_t>::iterator find(map<snapid_t, uint64_t> &m,
+ snapid_t clid)
+{
+ return m.find(clid);
+}
+
+template<class T>
+int remove_from(T &mv, string name, snapid_t cloneid, bool force)
+{
+ typename T::iterator i = find(mv, cloneid);
+ if (i != mv.end()) {
+ mv.erase(i);
+ } else {
+ cerr << "Clone " << cloneid << " doesn't exist in " << name;
+ if (force) {
+ cerr << " (ignored)" << std::endl;
+ return 0;
+ }
+ cerr << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
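+// Remove a clone's metadata from the SnapSet (derived from PG's trim_object());
+// only the snapset is rewritten, any clone data is left in place.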
+int remove_clone(ObjectStore *store, coll_t coll, ghobject_t &ghobj, snapid_t cloneid, bool force)
+{
+ // XXX: Don't allow this if in a cache tier or former cache tier
+ // bool allow_incomplete_clones() const {
+ // return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
+
+ SnapSet snapset;
+ int ret = get_snapset(store, coll, ghobj, snapset);
+ if (ret < 0)
+ return ret;
+
+ // Derived from trim_object()
+ // ...from snapset
+ vector<snapid_t>::iterator p;
+ for (p = snapset.clones.begin(); p != snapset.clones.end(); ++p)
+ if (*p == cloneid)
+ break;
+ if (p == snapset.clones.end()) {
+ cerr << "Clone " << cloneid << " not present";
+ return -ENOENT;
+ }
+ if (p != snapset.clones.begin()) {
+ // not the oldest... merge overlap into next older clone
+ vector<snapid_t>::iterator n = p - 1;
+ hobject_t prev_coid = ghobj.hobj;
+ prev_coid.snap = *n;
+ //bool adjust_prev_bytes = is_present_clone(prev_coid);
+
+ //if (adjust_prev_bytes)
+ // ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
+
+ snapset.clone_overlap[*n].intersection_of(
+ snapset.clone_overlap[*p]);
+
+ //if (adjust_prev_bytes)
+ // ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
+ }
+
+ ret = remove_from(snapset.clones, "clones", cloneid, force);
+ if (ret) return ret;
+ ret = remove_from(snapset.clone_overlap, "clone_overlap", cloneid, force);
+ if (ret) return ret;
+ ret = remove_from(snapset.clone_size, "clone_size", cloneid, force);
+ if (ret) return ret;
+
+ if (dry_run)
+ return 0;
+
+ bufferlist bl;
+ ::encode(snapset, bl);
+ ObjectStore::Transaction t;
+ t.setattr(coll, ghobj, SS_ATTR, bl);
+ int r = store->apply_transaction(t);
+ if (r < 0) {
+ cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ cout << "Removal of clone " << cloneid << " complete" << std::endl;
+ cout << "Use pg repair after OSD restarted to correct stat information" << std::endl;
+ return 0;
+}
+
void usage(po::options_description &desc)
{
cerr << std::endl;
cerr << "ceph-objectstore-tool ... <object> list-attrs" << std::endl;
cerr << "ceph-objectstore-tool ... <object> list-omap" << std::endl;
cerr << "ceph-objectstore-tool ... <object> remove" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> dump" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> set-size" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> remove-clone-metadata <cloneid>" << std::endl;
cerr << std::endl;
cerr << "ceph-objectstore-tool import-rados <pool> [file]" << std::endl;
cerr << std::endl;
cerr << "by --op list." << std::endl;
cerr << "<object> can be an object name which will be looked up in all" << std::endl;
cerr << "the OSD's PGs." << std::endl;
+ cerr << "<object> can be the empty string ('') which with a provided pgid " << std::endl;
+ cerr << "specifies the pgmeta object" << std::endl;
cerr << std::endl;
cerr << "The optional [file] argument will read stdin or write stdout" << std::endl;
cerr << "if not specified or if '-' specified." << std::endl;
- exit(1);
}
bool ends_with(const string& check, const string& ending)
return check.size() >= ending.size() && check.rfind(ending) == (check.size() - ending.size());
}
+// Based on FileStore::dump_journal(): set up just enough state to dump the journal
+int mydump_journal(Formatter *f, string journalpath, bool m_journal_dio)
+{
+ int r;
+
+ if (!journalpath.length())
+ return -EINVAL;
+
+ FileJournal *journal = new FileJournal(uuid_d(), NULL, NULL, journalpath.c_str(), m_journal_dio);
+ r = journal->_fdump(*f, false);
+ delete journal;
+ return r;
+}
+
int main(int argc, char **argv)
{
string dpath, jpath, pgidstr, op, file, object, objcmd, arg1, arg2, type, format;
spg_t pgid;
+ unsigned epoch = 0;
ghobject_t ghobj;
- bool human_readable;
+ bool human_readable, no_overwrite;
+ bool force;
Formatter *formatter;
+ bool head;
po::options_description desc("Allowed options");
desc.add_options()
("journal-path", po::value<string>(&jpath),
"path to journal, mandatory for filestore type")
("pgid", po::value<string>(&pgidstr),
- "PG id, mandatory except for import, list-lost, fix-lost, list-pgs, set-allow-sharded-objects")
+ "PG id, mandatory for info, log, remove, export, rm-past-intervals, mark-complete")
("op", po::value<string>(&op),
- "Arg is one of [info, log, remove, export, import, list, list-lost, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects]")
+ "Arg is one of [info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, "
+ "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete]")
+ ("epoch", po::value<unsigned>(&epoch),
+ "epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified")
("file", po::value<string>(&file),
- "path of file to export or import")
+ "path of file to export, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap")
("format", po::value<string>(&format)->default_value("json-pretty"),
"Output format which may be json, json-pretty, xml, xml-pretty")
("debug", "Enable diagnostic output to stderr")
+ ("force", "Ignore some types of errors and proceed with operation - USE WITH CAUTION: CORRUPTION POSSIBLE NOW OR IN THE FUTURE")
("skip-journal-replay", "Disable journal replay")
("skip-mount-omap", "Disable mounting of omap")
+ ("head", "Find head/snapdir when searching for objects by name")
+ ("dry-run", "Don't modify the objectstore")
+ ("no-overwrite", "For import-rados don't overwrite existing files")
;
po::options_description positional("Positional options");
positional.add_options()
- ("object", po::value<string>(&object), "object name or ghobject in json")
+ ("object", po::value<string>(&object), "'' for pgmeta_oid, object name or ghobject in json")
("objcmd", po::value<string>(&objcmd), "command [(get|set)-bytes, (get|set|rm)-(attr|omap), (get|set)-omaphdr, list-attrs, list-omap, remove]")
("arg1", po::value<string>(&arg1), "arg1 based on cmd")
("arg2", po::value<string>(&arg2), "arg2 based on cmd")
po::include_positional);
} catch(po::error &e) {
std::cerr << e.what() << std::endl;
- return 1;
+ myexit(1);
}
if (vm.count("help")) {
usage(desc);
+ myexit(1);
}
if (!vm.count("debug")) {
debug = true;
}
+ if (!vm.count("force")) {
+ force = false;
+ } else {
+ force = true;
+ }
+
+ no_overwrite = false;
+ if (vm.count("no-overwrite"))
+ no_overwrite = true;
+ if (vm.count("dry-run"))
+ dry_run = true;
+ osflagbits_t flags = 0;
+ if (dry_run || vm.count("skip-journal-replay"))
+ flags |= SKIP_JOURNAL_REPLAY;
+ if (vm.count("skip-mount-omap"))
+ flags |= SKIP_MOUNT_OMAP;
+
+ head = (vm.count("head") > 0);
+
vector<const char *> ceph_options;
env_to_vec(ceph_options);
ceph_options.reserve(ceph_options.size() + ceph_option_strings.size());
if (object == "import-rados") {
if (vm.count("objcmd") == 0) {
cerr << "ceph-objectstore-tool import-rados <pool> [file]" << std::endl;
- exit(1);
+ myexit(1);
}
string pool = objcmd;
if (arg1 == "-") {
if (isatty(STDIN_FILENO)) {
cerr << "stdin is a tty and no file specified" << std::endl;
- exit(1);
+ myexit(1);
}
file_fd = STDIN_FILENO;
} else {
file_fd = open(arg1.c_str(), O_RDONLY);
if (file_fd < 0) {
perror("open");
- return 1;
+ myexit(1);
}
}
global_init(NULL, ceph_options, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
common_init_finish(g_ceph_context);
- int ret = do_import_rados(pool);
+ int ret = do_import_rados(pool, no_overwrite);
if (ret == 0)
cout << "Import successful" << std::endl;
- return ret != 0;
+ myexit(ret != 0);
}
- if (!vm.count("data-path")) {
- cerr << "Must provide --data-path" << std::endl;
- usage(desc);
- }
if (!vm.count("type")) {
type = "filestore";
}
- if (type == "filestore" && !vm.count("journal-path")) {
- cerr << "Must provide --journal-path" << std::endl;
+ if (!vm.count("data-path") &&
+ !(op == "dump-journal" && type == "filestore")) {
+ cerr << "Must provide --data-path" << std::endl;
usage(desc);
+ myexit(1);
}
- if (op != "list" && vm.count("object") && !vm.count("objcmd")) {
- cerr << "Invalid syntax, missing command" << std::endl;
+ if (type == "filestore" && !vm.count("journal-path")) {
+ cerr << "Must provide --journal-path" << std::endl;
usage(desc);
+ myexit(1);
}
- if (!vm.count("op") && !(vm.count("object") && vm.count("objcmd"))) {
+ if (!vm.count("op") && !vm.count("object")) {
cerr << "Must provide --op or object command..." << std::endl;
usage(desc);
+ myexit(1);
}
if (op != "list" && vm.count("op") && vm.count("object")) {
cerr << "Can't specify both --op and object command syntax" << std::endl;
usage(desc);
+ myexit(1);
+ }
+ if (op != "list" && vm.count("object") && !vm.count("objcmd")) {
+ cerr << "Invalid syntax, missing command" << std::endl;
+ usage(desc);
+ myexit(1);
}
outistty = isatty(STDOUT_FILENO);
file_fd = fd_none;
- if (op == "export") {
+ if ((op == "export" || op == "get-osdmap" || op == "get-inc-osdmap") && !dry_run) {
if (!vm.count("file") || file == "-") {
if (outistty) {
cerr << "stdout is a tty and no --file filename specified" << std::endl;
- exit(1);
+ myexit(1);
}
file_fd = STDOUT_FILENO;
} else {
file_fd = open(file.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666);
}
- } else if (op == "import") {
+ } else if (op == "import" || op == "set-osdmap" || op == "set-inc-osdmap") {
if (!vm.count("file") || file == "-") {
if (isatty(STDIN_FILENO)) {
cerr << "stdin is a tty and no --file filename specified" << std::endl;
- exit(1);
+ myexit(1);
}
file_fd = STDIN_FILENO;
} else {
}
}
- if (vm.count("file") && file_fd == fd_none) {
- cerr << "--file option only applies to import or export" << std::endl;
- return 1;
+ if (vm.count("file") && file_fd == fd_none && !dry_run) {
+ cerr << "--file option only applies to import, export, "
+ << "get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap" << std::endl;
+ myexit(1);
}
if (file_fd != fd_none && file_fd < 0) {
- perror("open");
- return 1;
+ string err = string("file: ") + file;
+ perror(err.c_str());
+ myexit(1);
}
- if (dpath.length() == 0) {
- cerr << "Invalid params" << std::endl;
- return 1;
- }
-
- osflagbits_t flags = 0;
- if (vm.count("skip-journal-replay"))
- flags |= SKIP_JOURNAL_REPLAY;
- if (vm.count("skip-mount-omap"))
- flags |= SKIP_MOUNT_OMAP;
-
global_init(
NULL, ceph_options, CEPH_ENTITY_TYPE_OSD,
CODE_ENVIRONMENT_UTILITY_NODOUT, 0);
}
g_conf->apply_changes(NULL);
+  // Special list handling. Treat a pretty format as human-readable output,
+  // with one object per line and no enclosing array.
+ human_readable = ends_with(format, "-pretty");
+ if ((op == "list" || op == "meta-list") && human_readable) {
+ // Remove -pretty from end of format which we know is there
+ format = format.substr(0, format.size() - strlen("-pretty"));
+ }
+
+ formatter = Formatter::create(format);
+ if (formatter == NULL) {
+ cerr << "unrecognized format: " << format << std::endl;
+ myexit(1);
+ }
+
+ // Special handling for filestore journal, so we can dump it without mounting
+ if (op == "dump-journal" && type == "filestore") {
+ int ret = mydump_journal(formatter, jpath, g_conf->journal_dio);
+ if (ret < 0) {
+ cerr << "journal-path: " << jpath << ": "
+ << cpp_strerror(ret) << std::endl;
+ myexit(1);
+ }
+ formatter->flush(cout);
+ myexit(0);
+ }
+
//Verify that data-path really exists
struct stat st;
if (::stat(dpath.c_str(), &st) == -1) {
- perror("data-path");
- exit(1);
+ string err = string("data-path: ") + dpath;
+ perror(err.c_str());
+ myexit(1);
}
//Verify that data-path really is a filestore
if (type == "filestore") {
}
}
- if (op == "import" && pgidstr.length()) {
- cerr << "--pgid option invalid with import" << std::endl;
- return 1;
+ if (pgidstr.length() && !pgid.parse(pgidstr.c_str())) {
+ cerr << "Invalid pgid '" << pgidstr << "' specified" << std::endl;
+ myexit(1);
}
ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags);
if (fs == NULL) {
cerr << "Must provide --type (filestore, memstore, keyvaluestore)" << std::endl;
- exit(1);
+ if (type == "keyvaluestore") {
+ cerr << "Add \"keyvaluestore\" to "
+ << "enable_experimental_unrecoverable_data_corrupting_features"
+ << std::endl;
+ }
+ myexit(1);
}
- int r = fs->mount();
- if (r < 0) {
- if (r == -EBUSY) {
+ int ret = fs->mount();
+ if (ret < 0) {
+ if (ret == -EBUSY) {
cerr << "OSD has the store locked" << std::endl;
} else {
- cerr << "Mount failed with '" << cpp_strerror(-r) << "'" << std::endl;
+ cerr << "Mount failed with '" << cpp_strerror(ret) << "'" << std::endl;
}
- return 1;
+ myexit(1);
}
bool fs_sharded_objects = fs->get_allow_sharded_objects();
- int ret = 0;
vector<coll_t> ls;
vector<coll_t>::iterator it;
CompatSet supported;
bufferlist bl;
OSDSuperblock superblock;
bufferlist::iterator p;
- r = fs->read(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, 0, bl);
- if (r < 0) {
- cerr << "Failure to read OSD superblock error= " << r << std::endl;
+ ret = fs->read(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, 0, bl);
+ if (ret < 0) {
+ cerr << "Failure to read OSD superblock: " << cpp_strerror(ret) << std::endl;
goto out;
}
CompatSet unsupported = supported.unsupported(superblock.compat_features);
cerr << "On-disk OSD incompatible features set "
<< unsupported << std::endl;
- ret = EINVAL;
+ ret = -EINVAL;
goto out;
}
- if (pgidstr.length() && !pgid.parse(pgidstr.c_str())) {
- cerr << "Invalid pgid '" << pgidstr << "' specified" << std::endl;
- return 1;
- }
-
if (op != "list" && vm.count("object")) {
+ // Special case: Create pgmeta_oid if empty string specified
+ // This can't conflict with any actual object names.
+ if (object == "") {
+ ghobj = pgid.make_pgmeta_oid();
+ } else {
json_spirit::Value v;
try {
if (!json_spirit::read(object, v)) {
- lookup_ghobject lookup(object);
+ // Special case: remove-clone-metadata requires a head/snapdir object,
+ // so set head even if the user didn't specify it
+ if (vm.count("objcmd") && (objcmd == "remove-clone-metadata"))
+ head = true;
+ lookup_ghobject lookup(object, head);
if (action_on_all_objects(fs, lookup, debug)) {
throw std::runtime_error("Internal error");
} else {
if (lookup.size() != 1) {
stringstream ss;
if (lookup.size() == 0)
- ss << objcmd << ": " << cpp_strerror(ENOENT);
+ ss << "No object id '" << object << "' found or invalid JSON specified";
else
- ss << "expected a single object named '" << object
- << "' but got " << lookup.size() << " instead";
+ ss << "Found " << lookup.size() << " objects with id '" << object
+ << "', please use a JSON spec from --op list instead";
throw std::runtime_error(ss.str());
}
pair<coll_t, ghobject_t> found = lookup.pop();
} else {
stringstream ss;
if (pgidstr.length() == 0 && v.type() != json_spirit::array_type) {
- ss << "object '" << object
- << "' must be a JSON array but is of type "
- << v.type() << " instead";
+ ss << "Without --pgid the object '" << object
+ << "' must be a JSON array";
throw std::runtime_error(ss.str());
}
if (v.type() == json_spirit::array_type) {
json_spirit::Array array = v.get_array();
+ if (array.size() != 2) {
+ ss << "Object '" << object
+ << "' must be a JSON array with 2 elements";
+ throw std::runtime_error(ss.str());
+ }
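+ // Illustrative spec as emitted by --op list (values made up):
+ // ["0.7",{"oid":"obj1","key":"","snapid":-2,"hash":1118739155,"max":0,"pool":0,"namespace":""}]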
vector<json_spirit::Value>::iterator i = array.begin();
if (i->type() != json_spirit::str_type) {
- ss << "object '" << object
- << "' must be a JSON array with the first element a string but "
- << "found type " << v.type() << " instead";
+ ss << "Object '" << object
+ << "' must be a JSON array with the first element a string";
throw std::runtime_error(ss.str());
}
string object_pgidstr = i->get_str();
- spg_t object_pgid;
- object_pgid.parse(object_pgidstr.c_str());
- if (pgidstr.length() > 0) {
- if (object_pgid != pgid) {
- ss << "object '" << object
- << "' has a pgid different from the --pgid="
- << pgidstr << " option";
- throw std::runtime_error(ss.str());
+ if (object_pgidstr != "meta") {
+ spg_t object_pgid;
+ object_pgid.parse(object_pgidstr.c_str());
+ if (pgidstr.length() > 0) {
+ if (object_pgid != pgid) {
+ ss << "object '" << object
+ << "' has a pgid different from the --pgid="
+ << pgidstr << " option";
+ throw std::runtime_error(ss.str());
+ }
+ } else {
+ pgidstr = object_pgidstr;
+ pgid = object_pgid;
}
- } else {
- pgidstr = object_pgidstr;
- pgid = object_pgid;
- }
+ } else {
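+ // "meta" is not a real pgid; it selects the OSD metadata collection.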
+ pgidstr = object_pgidstr;
+ }
++i;
v = *i;
}
try {
ghobj.decode(v);
} catch (std::runtime_error& e) {
- ss << "Decode object json error: " << e.what();
+ ss << "Decode object JSON error: " << e.what();
throw std::runtime_error(ss.str());
}
- if ((uint64_t)pgid.pgid.m_pool != (uint64_t)ghobj.hobj.pool) {
+ if (pgidstr != "meta" && (uint64_t)pgid.pgid.m_pool != (uint64_t)ghobj.hobj.pool) {
cerr << "Object pool and pgid pool don't match" << std::endl;
ret = 1;
goto out;
ret = 1;
goto out;
}
+ }
}
- if (op != "list" && op != "import" && op != "list-lost" && op != "fix-lost"
- && op != "list-pgs" && op != "set-allow-sharded-objects" &&
- (pgidstr.length() == 0)) {
+ // The ops which require --pgid option are checked here and
+ // mentioned in the usage for --pgid.
+ if ((op == "info" || op == "log" || op == "remove" || op == "export"
+ || op == "rm-past-intervals" || op == "mark-complete") &&
+ pgidstr.length() == 0) {
cerr << "Must provide pgid" << std::endl;
usage(desc);
+ ret = 1;
+ goto out;
}
if (op == "set-allow-sharded-objects") {
goto out;
}
- superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
- ObjectStore::Transaction t;
- bl.clear();
- ::encode(superblock, bl);
- t.write(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, bl.length(), bl);
- r = fs->apply_transaction(t);
- if (r < 0) {
- cerr << "Error writing OSD superblock: " << cpp_strerror(r) << std::endl;
- ret = 1;
- goto out;
- }
-
- fs->set_allow_sharded_objects();
+ if (!dry_run) {
+ superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+ ObjectStore::Transaction t;
+ bl.clear();
+ ::encode(superblock, bl);
+ t.write(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, bl.length(), bl);
+ ret = fs->apply_transaction(t);
+ if (ret < 0) {
+ cerr << "Error writing OSD superblock: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ fs->set_allow_sharded_objects();
+ }
cout << "Enabled on-disk sharded objects" << std::endl;
ret = 0;
cerr << "Found incomplete transition to sharded objects" << std::endl;
cerr << std::endl;
cerr << "Use --op set-allow-sharded-objects to repair" << std::endl;
- ret = EINVAL;
+ ret = -EINVAL;
goto out;
}
if (op == "import") {
try {
- ret = do_import(fs, superblock);
+ ret = do_import(fs, superblock, force, pgidstr);
}
catch (const buffer::error &e) {
cerr << "do_import threw exception error " << e.what() << std::endl;
- ret = EFAULT;
+ ret = -EFAULT;
}
- if (ret == EFAULT) {
+ if (ret == -EFAULT) {
cerr << "Corrupt input for import" << std::endl;
}
if (ret == 0)
cout << "Import successful" << std::endl;
goto out;
+ } else if (op == "dump-journal-mount") {
+ // Undocumented feature: dump the journal with the filestore mounted.
+ // This doesn't support the format option; it uses
+ // ObjectStore::dump_journal() and mounts the store so replay runs first.
+ ret = fs->dump_journal(cout);
+ if (ret) {
+ if (ret == -EOPNOTSUPP) {
+ cerr << "Object store type \"" << type << "\" doesn't support journal dump" << std::endl;
+ } else {
+ cerr << "Journal dump failed with error " << cpp_strerror(ret) << std::endl;
+ }
+ }
+ goto out;
+ } else if (op == "get-osdmap") {
+ bufferlist bl;
+ OSDMap osdmap;
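+ // With no --epoch given, default to the superblock's current epoch.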
+ if (epoch == 0) {
+ epoch = superblock.current_epoch;
+ }
+ ret = get_osdmap(fs, epoch, osdmap, bl);
+ if (ret) {
+ cerr << "Failed to get osdmap#" << epoch << ": "
+ << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = bl.write_fd(file_fd);
+ if (ret) {
+ cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl;
+ } else {
+ cout << "osdmap#" << epoch << " exported." << std::endl;
+ }
+ goto out;
+ } else if (op == "set-osdmap") {
+ bufferlist bl;
+ ret = get_fd_data(file_fd, bl);
+ if (ret < 0) {
+ cerr << "Failed to read osdmap " << cpp_strerror(ret) << std::endl;
+ } else {
+ ret = set_osdmap(fs, epoch, bl, force);
+ }
+ goto out;
+ } else if (op == "get-inc-osdmap") {
+ bufferlist bl;
+ if (epoch == 0) {
+ epoch = superblock.current_epoch;
+ }
+ ret = get_inc_osdmap(fs, epoch, bl);
+ if (ret < 0) {
+ cerr << "Failed to get incremental osdmap# " << epoch << ": "
+ << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = bl.write_fd(file_fd);
+ if (ret) {
+ cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl;
+ } else {
+ cout << "inc-osdmap#" << epoch << " exported." << std::endl;
+ }
+ goto out;
+ } else if (op == "set-inc-osdmap") {
+ bufferlist bl;
+ ret = get_fd_data(file_fd, bl);
+ if (ret < 0) {
+ cerr << "Failed to read incremental osdmap " << cpp_strerror(ret) << std::endl;
+ goto out;
+ } else {
+ ret = set_inc_osdmap(fs, epoch, bl, force);
+ }
+ goto out;
}
log_oid = OSD::make_pg_log_oid(pgid);
biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
if (op == "remove") {
- finish_remove_pgs(fs);
- int r = initiate_new_remove_pg(fs, pgid);
- if (r) {
+ ret = initiate_new_remove_pg(fs, pgid);
+ if (ret < 0) {
cerr << "PG '" << pgid << "' not found" << std::endl;
- ret = 1;
goto out;
}
- finish_remove_pgs(fs);
cout << "Remove successful" << std::endl;
goto out;
}
- if (op == "list-lost" || op == "fix-lost") {
+ if (op == "fix-lost") {
boost::scoped_ptr<action_on_object_t> action;
- if (op == "list-lost")
- action.reset(new do_list_lost());
- if (op == "fix-lost")
- action.reset(new do_fix_lost());
+ action.reset(new do_fix_lost());
if (pgidstr.length())
- ret = action_on_all_objects_in_pg(fs, coll_t(pgid), *action, debug);
+ ret = action_on_all_objects_in_exact_pg(fs, coll_t(pgid), *action, debug);
else
ret = action_on_all_objects(fs, *action, debug);
goto out;
}
- // Special list handling. Treating pretty_format as human readable,
- // with one object per line and not an enclosing array.
- human_readable = ends_with(format, "-pretty");
- if (op == "list" && human_readable) {
- // Remove -pretty from end of format which we know is there
- format = format.substr(0, format.size() - strlen("-pretty"));
+ if (op == "list") {
+ ret = do_list(fs, pgidstr, object, formatter, debug, human_readable, head);
+ if (ret < 0) {
+ cerr << "do_list failed: " << cpp_strerror(ret) << std::endl;
+ }
+ goto out;
}
- formatter = Formatter::create(format);
- if (formatter == NULL) {
- cerr << "unrecognized format: " << format << std::endl;
- ret = 1;
+ if (op == "dump-super") {
+ formatter->open_object_section("superblock");
+ superblock.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
goto out;
}
- if (op == "list") {
- r = do_list(fs, pgidstr, object, formatter, debug, human_readable);
- if (r) {
- cerr << "do_list failed with " << r << std::endl;
- ret = 1;
+ if (op == "meta-list") {
+ ret = do_meta(fs, object, formatter, debug, human_readable);
+ if (ret < 0) {
+ cerr << "do_meta failed: " << cpp_strerror(ret) << std::endl;
}
goto out;
}
- r = fs->list_collections(ls);
- if (r < 0) {
- cerr << "failed to list pgs: " << cpp_strerror(-r) << std::endl;
- ret = 1;
+ ret = fs->list_collections(ls);
+ if (ret < 0) {
+ cerr << "failed to list pgs: " << cpp_strerror(ret) << std::endl;
goto out;
}
snapid_t snap;
spg_t tmppgid;
+ if (pgidstr == "meta") {
+ if (it->to_str() == "meta")
+ break;
+ else
+ continue;
+ }
+
if (!it->is_pg(tmppgid, snap)) {
continue;
}
if (op != "list-pgs" && tmppgid != pgid) {
continue;
}
- if (snap != CEPH_NOSNAP && debug) {
- cout << "skipping snapped dir " << *it
+ if (snap != CEPH_NOSNAP) {
+ if (debug)
+ cerr << "skipping snapped dir " << *it
<< " (pg " << pgid << " snap " << snap << ")" << std::endl;
continue;
}
goto out;
}
+ // If this is neither an object command nor one of the ops handled below,
+ // print the usage before complaining about a bad pgid
+ if (!vm.count("objcmd") && op != "export" && op != "info" && op != "log" && op != "rm-past-intervals" && op != "mark-complete") {
+ cerr << "Must provide --op (info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, "
+ "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)"
+ << std::endl;
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
epoch_t map_epoch;
// The following code for export, info, log require omap or !skip-mount-omap
if (it != ls.end()) {
if (vm.count("objcmd")) {
ret = 0;
if (objcmd == "remove") {
- int r = do_remove_object(fs, coll, ghobj);
- if (r) {
- ret = 1;
- }
+ ret = do_remove_object(fs, coll, ghobj);
goto out;
} else if (objcmd == "list-attrs") {
- int r = do_list_attrs(fs, coll, ghobj);
- if (r) {
- ret = 1;
- }
+ ret = do_list_attrs(fs, coll, ghobj);
goto out;
} else if (objcmd == "list-omap") {
- int r = do_list_omap(fs, coll, ghobj);
- if (r) {
- ret = 1;
- }
+ ret = do_list_omap(fs, coll, ghobj);
goto out;
} else if (objcmd == "get-bytes" || objcmd == "set-bytes") {
- int r;
if (objcmd == "get-bytes") {
int fd;
if (vm.count("arg1") == 0 || arg1 == "-") {
goto out;
}
}
- r = do_get_bytes(fs, coll, ghobj, fd);
+ ret = do_get_bytes(fs, coll, ghobj, fd);
if (fd != STDOUT_FILENO)
close(fd);
} else {
goto out;
}
}
- r = do_set_bytes(fs, coll, ghobj, fd);
+ ret = do_set_bytes(fs, coll, ghobj, fd);
if (fd != STDIN_FILENO)
close(fd);
}
- if (r)
- ret = 1;
goto out;
} else if (objcmd == "get-attr") {
- if (vm.count("arg1") == 0)
+ if (vm.count("arg1") == 0) {
usage(desc);
- r = do_get_attr(fs, coll, ghobj, arg1);
- if (r)
- ret = 1;
+ ret = 1;
+ goto out;
+ }
+ ret = do_get_attr(fs, coll, ghobj, arg1);
goto out;
} else if (objcmd == "set-attr") {
- if (vm.count("arg1") == 0)
+ if (vm.count("arg1") == 0) {
usage(desc);
+ ret = 1;
+ goto out;
+ }
int fd;
if (vm.count("arg2") == 0 || arg2 == "-") {
goto out;
}
}
- r = do_set_attr(fs, coll, ghobj, arg1, fd);
+ ret = do_set_attr(fs, coll, ghobj, arg1, fd);
if (fd != STDIN_FILENO)
close(fd);
- if (r)
- ret = 1;
goto out;
} else if (objcmd == "rm-attr") {
- if (vm.count("arg1") == 0)
+ if (vm.count("arg1") == 0) {
usage(desc);
- r = do_rm_attr(fs, coll, ghobj, arg1);
- if (r)
- ret = 1;
+ ret = 1;
+ goto out;
+ }
+ ret = do_rm_attr(fs, coll, ghobj, arg1);
goto out;
} else if (objcmd == "get-omap") {
- if (vm.count("arg1") == 0)
+ if (vm.count("arg1") == 0) {
usage(desc);
- r = do_get_omap(fs, coll, ghobj, arg1);
- if (r)
- ret = 1;
+ ret = 1;
+ goto out;
+ }
+ ret = do_get_omap(fs, coll, ghobj, arg1);
goto out;
} else if (objcmd == "set-omap") {
- if (vm.count("arg1") == 0)
+ if (vm.count("arg1") == 0) {
usage(desc);
-
+ ret = 1;
+ goto out;
+ }
int fd;
if (vm.count("arg2") == 0 || arg2 == "-") {
// Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
goto out;
}
}
- r = do_set_omap(fs, coll, ghobj, arg1, fd);
+ ret = do_set_omap(fs, coll, ghobj, arg1, fd);
if (fd != STDIN_FILENO)
close(fd);
- if (r)
- ret = 1;
goto out;
} else if (objcmd == "rm-omap") {
- if (vm.count("arg1") == 0)
+ if (vm.count("arg1") == 0) {
usage(desc);
- r = do_rm_omap(fs, coll, ghobj, arg1);
- if (r)
- ret = 1;
+ ret = 1;
+ goto out;
+ }
+ ret = do_rm_omap(fs, coll, ghobj, arg1);
goto out;
} else if (objcmd == "get-omaphdr") {
- if (vm.count("arg1"))
+ if (vm.count("arg1")) {
usage(desc);
- r = do_get_omaphdr(fs, coll, ghobj);
- if (r)
- ret = 1;
+ ret = 1;
+ goto out;
+ }
+ ret = do_get_omaphdr(fs, coll, ghobj);
goto out;
} else if (objcmd == "set-omaphdr") {
// Extra arg
- if (vm.count("arg2"))
+ if (vm.count("arg2")) {
usage(desc);
+ ret = 1;
+ goto out;
+ }
int fd;
if (vm.count("arg1") == 0 || arg1 == "-") {
// Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
goto out;
}
}
- r = do_set_omaphdr(fs, coll, ghobj, fd);
+ ret = do_set_omaphdr(fs, coll, ghobj, fd);
if (fd != STDIN_FILENO)
close(fd);
- if (r)
+ goto out;
+ } else if (objcmd == "dump") {
+ // There should not be any other arguments
+ if (vm.count("arg1") || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = print_obj_info(fs, coll, ghobj, formatter);
+ goto out;
+ } else if (objcmd == "set-size") {
+ // Extra arg
+ if (vm.count("arg1") == 0 || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) {
+ cerr << "Invalid size '" << arg1 << "' specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ uint64_t size = atoll(arg1.c_str());
+ ret = set_size(fs, coll, ghobj, size, formatter);
+ goto out;
+ } else if (objcmd == "clear-snapset") {
+ // UNDOCUMENTED: For testing zap SnapSet
+ // IGNORE extra args since not in usage anyway
+ if (!ghobj.hobj.has_snapset()) {
+ cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl;
ret = 1;
+ goto out;
+ }
+ ret = clear_snapset(fs, coll, ghobj, arg1);
goto out;
+ } else if (objcmd == "remove-clone-metadata") {
+ // Extra arg
+ if (vm.count("arg1") == 0 || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ if (!ghobj.hobj.has_snapset()) {
+ cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) {
+ cerr << "Invalid cloneid '" << arg1 << "' specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ snapid_t cloneid = atoi(arg1.c_str());
+ ret = remove_clone(fs, coll, ghobj, cloneid, force);
+ goto out;
}
cerr << "Unknown object command '" << objcmd << "'" << std::endl;
usage(desc);
+ ret = 1;
+ goto out;
}
bufferlist bl;
map_epoch = 0;
- r = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl);
- if (r < 0)
+ ret = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl);
+ if (ret < 0)
cerr << "peek_map_epoch returns an error" << std::endl;
if (debug)
pg_info_t info(pgid);
map<epoch_t,pg_interval_t> past_intervals;
__u8 struct_ver;
- r = PG::read_info(fs, pgid, coll, bl, info, past_intervals,
+ ret = PG::read_info(fs, pgid, coll, bl, info, past_intervals,
struct_ver);
- if (r < 0) {
- cerr << "read_info error " << cpp_strerror(-r) << std::endl;
- ret = 1;
+ if (ret < 0) {
+ cerr << "read_info error " << cpp_strerror(ret) << std::endl;
goto out;
}
if (struct_ver < PG::compat_struct_v) {
cerr << "PG is too old to upgrade, use older Ceph version" << std::endl;
- ret = 1;
+ ret = -EFAULT;
goto out;
}
if (debug)
} else if (op == "log") {
PGLog::IndexedLog log;
pg_missing_t missing;
- ret = get_log(fs, struct_ver, coll, pgid, info, log, missing);
- if (ret > 0)
+ map<eversion_t, hobject_t> divergent_priors;
+ ret = get_log(fs, struct_ver, coll, pgid, info, log, missing,
+ divergent_priors);
+ if (ret < 0)
goto out;
- formatter->open_object_section("log");
- log.dump(formatter);
- formatter->close_section();
- formatter->flush(cout);
- cout << std::endl;
- formatter->open_object_section("missing");
- missing.dump(formatter);
- formatter->close_section();
- formatter->flush(cout);
- cout << std::endl;
+ dump_log(formatter, cout, log, missing, divergent_priors);
} else if (op == "rm-past-intervals") {
ObjectStore::Transaction tran;
ObjectStore::Transaction *t = &tran;
cerr << "Can't remove past-intervals, version mismatch " << (int)struct_ver
<< " (pg) != " << (int)PG::cur_struct_v << " (tool)"
<< std::endl;
- ret = 1;
+ ret = -EFAULT;
goto out;
}
cout << "Remove past-intervals " << past_intervals << std::endl;
past_intervals.clear();
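+ // With --dry-run, stop before anything is written back.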
+ if (dry_run) {
+ ret = 0;
+ goto out;
+ }
ret = write_info(*t, map_epoch, info, past_intervals);
if (ret == 0) {
fs->apply_transaction(*t);
cout << "Removal succeeded" << std::endl;
}
+ } else if (op == "mark-complete") {
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+
+ if (struct_ver != PG::cur_struct_v) {
+ cerr << "Can't mark-complete, version mismatch " << (int)struct_ver
+ << " (pg) != " << (int)PG::cur_struct_v << " (tool)"
+ << std::endl;
+ ret = 1;
+ goto out;
+ }
+
+ cout << "Marking complete " << std::endl;
+
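+ // Forge PG metadata that makes the PG look fully clean as of the
+ // current epoch: bump last_update, mark backfill complete, and drop
+ // past intervals.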
+ info.last_update = eversion_t(superblock.current_epoch, info.last_update.version + 1);
+ info.last_backfill = hobject_t::get_max();
+ info.last_epoch_started = superblock.current_epoch;
+ info.history.last_epoch_started = superblock.current_epoch;
+ info.history.last_epoch_clean = superblock.current_epoch;
+ past_intervals.clear();
+
+ if (!dry_run) {
+ ret = write_info(*t, map_epoch, info, past_intervals);
+ if (ret != 0)
+ goto out;
+ fs->apply_transaction(*t);
+ }
+ cout << "Marking complete succeeded" << std::endl;
} else {
- cerr << "Must provide --op (info, log, remove, export, import, list, list-lost, fix-lost, list-pgs, rm-past-intervals)"
- << std::endl;
- usage(desc);
+ assert(!"Should have already checked for valid --op");
}
} else {
cerr << "PG '" << pgid << "' not found" << std::endl;
- ret = 1;
+ ret = -ENOENT;
}
out:
- if (fs->umount() < 0) {
- cerr << "umount failed" << std::endl;
- return 1;
+ int r = fs->umount();
+ if (r < 0) {
+ cerr << "umount failed: " << cpp_strerror(r) << std::endl;
+ // If no previous error, then use umount() error
+ if (ret == 0)
+ ret = r;
+ }
+
+ if (dry_run) {
+ // Export output can go to stdout, so put this message on stderr
+ if (op == "export")
+ cerr << "dry-run: Nothing changed" << std::endl;
+ else
+ cout << "dry-run: Nothing changed" << std::endl;
}
- // Check for -errno accidentally getting here
if (ret < 0)
ret = 1;
- return ret;
+ myexit(ret);
}
}
}
if (tree) {
- ostringstream oss;
- crush.dump_tree(&oss, NULL);
- dout(1) << "\n" << oss.str() << dendl;
+ crush.dump_tree(&cout, NULL);
}
if (compile) {
" setomapheader <obj-name> <val>\n"
" tmap-to-omap <obj-name> convert tmap keys/values to omap\n"
" watch <obj-name> add watcher on this object\n"
-" notify <obj-name> <message> notify wather of this object with message\n"
+" notify <obj-name> <message> notify watcher of this object with message\n"
" listwatchers <obj-name> list the watchers of this object\n"
" set-alloc-hint <obj-name> <expected-object-size> <expected-write-size>\n"
" set allocation hint for an object\n"
}
if (values.size() && values.begin()->first == key) {
- cout << " (length " << values.begin()->second.length() << ") : ";
if (!outfile.empty()) {
cerr << "Writing to " << outfile << std::endl;
dump_data(outfile, values.begin()->second);
} else {
+ cout << "value (" << values.begin()->second.length() << " bytes) :\n";
values.begin()->second.hexdump(cout);
cout << std::endl;
}
// dump key in hex if it contains nonprintable characters
if (std::count_if(it->first.begin(), it->first.end(),
(int (*)(int))isprint) < (int)it->first.length()) {
- cout << "key: (" << it->first.length() << " bytes):\n";
+ cout << "key (" << it->first.length() << " bytes):\n";
bufferlist keybl;
keybl.append(it->first);
keybl.hexdump(cout);
cout << it->first;
}
cout << std::endl;
- cout << "value: (" << it->second.length() << " bytes) :\n";
+ cout << "value (" << it->second.length() << " bytes) :\n";
it->second.hexdump(cout);
cout << std::endl;
}
-ceph (0.94.6-1~u14.04+mos1) mos9.0; urgency=low
-
- * New upstream bugfix only release
- * Added patche which makes ceph-disk work with udev generated symlinks
- * Added a patch to fix rados bench crash
- * Adjust packaging:
- - disable make check, it fails anyway since some necessary files are
- missing in the upstream tarball
- - skip build dependencies necessary for tests only (valgrind, virtualenv)
- - don't install ceph-deploy manual page to avoid file conflicts with
- ceph-deploy package
-
- -- Alexey Sheplyakov <asheplyakov@mirantis.com> Wed, 24 Feb 2016 16:48:44 +0300
-
-ceph (0.94.5-0u~u14.04+mos1) mos8.0; urgency=medium
-
- * Rebuild for Ubuntu 14.04
-
- -- Alexey Sheplyakov <asheplyakov@mirantis.com> Thu, 12 Nov 2015 12:49:24 +0300
-
-ceph (0.94.5-0ubuntu1) xenial; urgency=medium
-
- * New upstream release (LP: #1512292):
- - d/p/*: Refresh.
- - d/p/ceph-radosgw-init.patch: Dropped, included upstream.
- - d/*.symbols: Refresh.
- * d/p/modules.patch: Add jerasure_neon and shec erasure coding plugins
- to generate unversioned so's for plugin loading (LP: #1507244).
- * d/rules: Ensure that any remaining versioned so's are dropped from
- the packaging - this is all test code (LP: #1507244).
-
- -- James Page <james.page@ubuntu.com> Mon, 02 Nov 2015 14:47:31 +0000
-
-ceph (0.94.3-0ubuntu2) wily; urgency=medium
-
- * d/ceph.install: Drop ceph-deploy manpage from packaging, provided
- by ceph-deploy itself (LP: #1475910).
-
- -- James Page <james.page@ubuntu.com> Mon, 07 Sep 2015 14:42:03 +0100
-
-ceph (0.94.3-0ubuntu1) wily; urgency=medium
-
- [ James Page ]
- * New upstream point release (LP: #1492227):
- - d/p/remove-unused-variable-ceph-bug-11576.patch:
- Dropped, included upstream.
- - d/p*: Refreshed.
-
- [ Liam Young ]
- * d/p/ceph-radosgw-init.patch: Cherry pick patch from upstream VCS to
- ensure that restarts of the radosgw wait an appropriate amount of time
- for the existing daemon to shutdown (LP: #1477225).
-
- -- James Page <james.page@ubuntu.com> Mon, 07 Sep 2015 12:23:50 +0100
-
-ceph (0.94.2-0ubuntu3) wily; urgency=medium
-
- * Fix compile failure with boost 1.58 (LP: #1483403):
- - src/mon/OSDMonitor.cc: remove unused variable (Ceph issue #11576)
-
- -- Tiago Stürmer Daitx <tiago.daitx@canonical.com> Mon, 10 Aug 2015 18:36:48 -0300
-
-ceph (0.94.2-0ubuntu2) wily; urgency=medium
-
- * No change rebuild for boost1.58/libstdc++6.
-
- -- Dimitri John Ledkov <dimitri.j.ledkov@linux.intel.com> Sun, 02 Aug 2015 13:25:26 +0100
-
-ceph (0.94.2-0ubuntu1) wily; urgency=medium
-
- * New upstream point release (LP: #1465553):
- - d/p/*: Refreshed.
-
- -- James Page <james.page@ubuntu.com> Tue, 16 Jun 2015 09:53:23 +0100
-
-ceph (0.94.1-0ubuntu1) vivid; urgency=high
-
- * New upstream stable point release (LP: #1443821):
- - Includes critical fix for communication from pre-0.94 clients
- during cluster upgrades.
-
- -- James Page <james.page@ubuntu.com> Tue, 14 Apr 2015 11:46:12 +0100
-
-ceph (0.94-0ubuntu1) vivid; urgency=low
-
- * New upstream stable release 'Hammer' (LP: #1423601):
- - d/p/*: Refresh.
+ceph (0.94.9-1~u14.04+mos1) mos9.0; urgency=low
+
+ * Package 0.94.9 upstream release, most notable bugfixes:
+ - monitor crashes on a command without a prefix,
+ http://tracker.ceph.com/issues/16297
+ https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2016-5009
+ - pool quota alarm is not in effect,
+ http://tracker.ceph.com/issues/15478
+ - OSD reports ENOTEMPTY and crashes,
+ http://tracker.ceph.com/issues/14766
+ - improved reweight-by-utilization to consider the least used OSDs first,
+ http://tracker.ceph.com/issues/15770
+ - no Last-Modified, Content-Size and X-Object-Manifest headers if no
+ segments in DLO manifest,
+ http://tracker.ceph.com/issues/15812
+ * Backported fix of "Data corruption using RBD with caching enabled",
+ http://tracker.ceph.com/issues/17545
+ * Make a mixed 0.94.{6,9} cluster work properly.
+ BIG RED WARNING:
+ as a result, mixing MOS OSDs with upstream OSDs of version 0.94.7 or
+ newer will break your cluster. You have been warned.
+ * debian/patches:
+ - drop hammer-rbd-snap-rollback-restore-the-link-to-parent.patch,
+ included upstream
+ - drop patches/rgw-handle-errors-properly-during-GET-on-Swift-s-DLO.patch,
+ included upstream
+ - added Remove-HITSET_GMT-related-code-so-0.94.-6-9-OSDs-mon.patch so
+ a mixed cluster consisting of 0.94.6 and 0.94.9 OSDs/monitors can
+ work properly.
+ * Added missing build dependency on libboost-random-dev
+
+ -- Alexey Sheplyakov <asheplyakov@mirantis.com> Thu, 06 Oct 2016 19:50:27 +0300
+
+ceph (0.94.7-1) experimental; urgency=low
+
+ * Package 0.94.7 upstream release, most notable changes:
+ - librbd: possible QEMU deadlock after creating image snapshots,
+ http://tracker.ceph.com/issues/14988
+ - librbd: flattening an rbd image with active IO can lead to hang,
+ http://tracker.ceph.com/issues/14092
+ - osd: fixed corruption when min_read_recency_for_promote > 1,
+ http://tracker.ceph.com/issues/15171
+ - mon: implement reweight-by-utilization feature,
+ https://github.com/ceph/ceph/pull/8026
+ See http://ceph.com/releases/v0-94-7-hammer-released for more details
+ * debian/rules: set consistent hardening flags for compiler and linker
+ * debian/ceph.install: skip ceph-deploy.8 to avoid file conflict with
+ ceph-deploy
+ * debian/patches:
+ - drop ObjBencher-seq_read_bench-fix-locking-errors.patch,
+ included upstream
+ - keep ceph-disk-fix-symlinks-handling.patch, the problem won't be
+ fixed in upstream 0.94.x
+ - add rbd #14512 fix (data loss on clone, snapshot, rollback)
+ - added rgw #15812 fix (`No Last-Modified, Content-Size and
+ X-Object-Manifest headers if no segments in DLO manifest')
+
+ -- Alexey Sheplyakov <asheplyakov@mirantis.com> Wed, 18 May 2016 17:05:28 +0300
+
+ceph (0.94.6-1) experimental; urgency=medium
+
+ * New upstream bugfix release
+ * Refresh "rbd lazy umount before unmap" patch
+ * Added upstream patches which make ceph-disk work with udev
+ generated symlinks
+ * Added a patch to make rados bench usable
+
+ -- Alexey Sheplyakov <asheplyakov@mirantis.com> Mon, 29 Feb 2016 10:22:57 +0300
+
+ceph (0.94.5-1) experimental; urgency=medium
+
+ * [2d330d6] New upstream release:
+ - [1e93090] Drop patch for CVE-2015-5245, included upstream.
+ - [20adc7d] Refresh all other patches.
+ * [9255e5d] Ensure any erasure coding test libraries and dangling symlinks
+ are not included in the ceph package.
+
+ -- James Page <james.page@ubuntu.com> Mon, 09 Nov 2015 12:09:51 +0000
+
+ceph (0.94.3-1) experimental; urgency=medium
+
+ * [580fef] Imported Upstream version 0.94.3 (Closes: #777814, #795178)
+ * [536935] Add upstream patch to fix CVE-2015-5245 (Closes: #798567)
+
+ -- Gaudenz Steinlin <gaudenz@debian.org> Fri, 18 Sep 2015 16:55:23 +0200
+
+ceph (0.94.2-2) experimental; urgency=medium
+
+ * Revert "Drop virtualenv BD, disable unit tests."
+ * Restore patches for test enablement.
+ * Display test-suite log output in the event of failures.
+
+ -- James Page <james.page@ubuntu.com> Mon, 20 Jul 2015 13:37:06 +0100
+
+ceph (0.94.2-1) experimental; urgency=medium
+
+ * Resync with Ubuntu, introducing Ceph Hammer stable release:
- d/*.symbols: Update inline with upstream additions, use regex
for ceph version symbol.
-
- -- James Page <james.page@ubuntu.com> Wed, 08 Apr 2015 18:57:08 +0100
-
-ceph (0.93-0ubuntu6) vivid; urgency=medium
-
- * d/control,rules,*.symbols: Disable lttng support until we can make
- it play a bit nicer with libvirt and apparmor, drop associated
- symbols (LP: #1432644).
-
- -- James Page <james.page@ubuntu.com> Wed, 01 Apr 2015 10:37:03 +0100
-
-ceph (0.93-0ubuntu5) vivid; urgency=medium
-
- * d/lib-systemd/system/ceph-create-keys.service: Automatically create
- admin and bootstrap keys after ceph mon startup (LP: #1435450).
- * d/p/vivid-does-systemd.patch: Ensure that disks prepared on vivid
- or later use systemd for init (LP: #1435464).
- * d/lib-systemd/system/*.service: Align nofile limits and restart config
- with equivalent upstart configurations.
-
- -- James Page <james.page@ubuntu.com> Tue, 24 Mar 2015 12:30:14 +0000
-
-ceph (0.93-0ubuntu4) vivid; urgency=medium
-
- * d/p/fix-cycles-arch.patch: Skip initialization of cycles_per_sec
- if rtdsc (or equivalent) is not supported (LP: #1432786).
-
- -- James Page <james.page@ubuntu.com> Wed, 18 Mar 2015 14:44:39 +0000
-
-ceph (0.93-0ubuntu3) vivid; urgency=medium
-
- * d/ceph{-common}.install,control: Move ceph_argparse.py down into
- ceph-common package to fixup ceph cli usage/autopkgtest failure.
-
- -- James Page <james.page@ubuntu.com> Sat, 14 Mar 2015 21:27:26 +0000
-
-ceph (0.93-0ubuntu2) vivid; urgency=medium
-
- * d/p/fix-cycles-arch.patch: Expand highres cycles support to cover
- PPC architectures, warn and default to return 0 for archs without
- support, fixing FTBFS.
-
- -- James Page <james.page@ubuntu.com> Fri, 13 Mar 2015 19:40:03 +0000
-
-ceph (0.93-0ubuntu1) vivid; urgency=medium
-
- * New upstream release candidate for Hammer stable release (LP: #1423601).
- - d/*.symbols: Refresh inline with upstream, removing common code
- symbols which don't form part of the public API.
- - d/p/*: Refresh and drop patches as required.
- * Resync with upstream packaging changes and enable new features:
+ - d/lib-systemd/system/ceph-create-keys.service: Automatically create
+ admin and bootstrap keys after ceph mon startup.
+ - d/p/vivid-does-systemd.patch: Ensure that disks prepared on vivid
+ or later use systemd for init.
+ - d/lib-systemd/system/*.service: Align nofile limits and restart config
+ with equivalent upstart configurations.
+ - d/p/fix-cycles-arch.patch: Skip initialization of cycles_per_sec
+ if rtdsc (or equivalent) is not supported.
+ - d/ceph{-common}.install,control: Move ceph_argparse.py down into
+ ceph-common package to fixup ceph cli usage/autopkgtest failure.
- d/control,ceph-common.install,librbd1.install: Move rbdnamer and
associated udev rules into ceph-common package.
- d/control,python-*: Split out rbd, rados and cephfs bindings into
- d/control: Move python-flask dependency to ceph package, only required
for REST API.
- d/control: Use google-perftools on arm64.
- - d/rules,control: Enable use of lttng for userspace tracing.
-
- -- James Page <james.page@ubuntu.com> Fri, 13 Mar 2015 07:42:45 +0000
-
-ceph (0.87-0ubuntu5) vivid; urgency=medium
-
- * d/p/fix-argparse-defaults.patch: Workaround behavioural change in
- argparse set_defaults in python 2.7.9 (LP: #1413321).
- * d/rules: Disable build and support for RocksDB over concerns around
- performance > 1TB in size.
-
- -- James Page <james.page@ubuntu.com> Thu, 22 Jan 2015 09:54:19 +0000
-
-ceph (0.87-0ubuntu4) vivid; urgency=medium
-
- * d/p/ceph-osd-prestart-path.patch: Fixup path for ceph-osd upstart
- configuration pre-start script.
-
- -- James Page <james.page@ubuntu.com> Tue, 13 Jan 2015 12:33:49 +0000
-
-ceph (0.87-0ubuntu3) vivid; urgency=medium
-
- * d/control: Re-order Recommends to prefer ntp over chrony for Ubuntu.
+ - d/control: Re-order Recommends to prefer ntp over chrony for Ubuntu.
+ - d/p/ceph-osd-prestart-path.patch: Fixup path for ceph-osd upstart
+ configuration pre-start script.
+ - d/p/fix-argparse-defaults.patch: Workaround behavioural change in
+ argparse set_defaults in python 2.7.9
+ * New upstream point release:
+ - d/p/*: Refresh.
+ * d/p/use_system_jerasure.patch,d/control: Drop use of libjerasure
+ as the patch is intrusive and expensive to maintain; will revisit if
+ adopted upstream.
- -- James Page <james.page@ubuntu.com> Tue, 16 Dec 2014 14:59:31 +0000
+ -- James Page <james.page@ubuntu.com> Tue, 16 Jun 2015 11:31:05 +0100
-ceph (0.87-0ubuntu2) vivid; urgency=medium
+ceph (0.87-2) experimental; urgency=low
- * d/rules: Limit rocksdb support to x86 + armhf, fixing FTBFS on
- unsupported and broken architectures.
+ * Team upload.
- -- James Page <james.page@ubuntu.com> Mon, 08 Dec 2014 12:36:51 +0000
+ [ Gaudenz Steinlin ]
+ * README.Debian: added clarification about setting the hashpspool flag.
+ (Closes: #769596).
-ceph (0.87-0ubuntu1) vivid; urgency=medium
+ [ James Page ]
+ * Added new "modules.patch" to mark new erasure coding libraries as
+ modules, wildcard install.
[ Dmitry Smirnov ]
+ * Recommends: added "ntp" to list of time-daemon alternatives
+ (Closes: #767511).
+ * Introduced native systemd services (except "rbdmap"), (Closes: #769593).
+ * ceph-test: install forgotten files.
+ * Run post-build tests:
+ + updated "virtualenv-never-download.patch" to pass
+ "--system-site-packages" to virtualenv to prevent downloads.
+ + added new patches to disable network-dependent and failing tests.
+ * Patchworks:
+ - bug-9341.patch
+ + bug-10036.patch (to show OSD affinity in "ceph osd tree").
+ Thanks, Mykola Golub.
+ + bug-10059.patch
+ + 0latest-giant.patch (Last-Update: 2014-11-15).
+ + sleep-recover.patch
+ + tests-disable.patch (to disable tests that need cluster).
+ + tests-disable-ceph-disk.patch
+ + use_system_gtest.patch (commented)
+ as first attempt to build with system "libgtest-dev".
+ + use_system_jerasure.patch
+ * Build-Depends:
+ + libjerasure-dev (>= 2.0.0-2~)
+ + virtualenv
+ + valgrind [amd64 armhf i386 powerpc]
+ * rules: pass "--without-lttng" to explicitly disable "lttng" to avoid
+ auto-enable if found.
+ * rules: disabled bundled RocksDB:
+ RocksDB is supposed to improve the performance of keyvaluestore OSDs, but
+ the latter slow down to a nearly unusable state when filled over 1 TiB even
+ with RocksDB. Moreover, the KV backend is experimental and dangerous -- I
+ lost a cluster due to OSD poisoning caused by a KV OSD which was plugged in
+ only for a limited time. LevelDB is good enough; for now I see no reason to
+ use RocksDB, especially considering that it is not packaged separately.
+ * Removed myself from Uploaders.
+
+ -- Dmitry Smirnov <onlyjob@debian.org> Wed, 01 Apr 2015 11:47:38 +1100
+
+ceph (0.87-1) experimental; urgency=medium
+
* New major upstream release [October 2014].
+ new "libradosstriper*" binary packages.
* Patchworks (removed old patches, refreshed remaining ones).
* Build with "--with-babeltrace".
* Build and statically link bundled RocksDB.
- [ James Page ]
- * d/control,rules: Disable test suite execution and drop BD's on
- virtualenv and valgrind for Ubuntu.
- * d/p/modules.patch,d/ceph.install: Mark new erasure coding libraries
- as modules, install via wildcard.
+ -- Dmitry Smirnov <onlyjob@debian.org> Thu, 30 Oct 2014 12:43:49 +1100
+
+ceph (0.80.9-2) unstable; urgency=medium
+
+ * [70fc1d] Add NEWS entry about CRUSH issues fixed in 0.80.9
+ * [f41bb6] Add NEWS entry about rbd backed filesystems and systemd
+
+ -- Gaudenz Steinlin <gaudenz@debian.org> Tue, 05 May 2015 21:29:15 +0200
+
+ceph (0.80.9-1) unstable; urgency=medium
+
+ * [4b4e] Imported Upstream version 0.80.9
+ * [7102] Remove patches firefly-latest and p2139 applied upstream
+ * [5869] Add myself to uploaders
+
+ -- Gaudenz Steinlin <gaudenz@debian.org> Mon, 04 May 2015 08:49:37 +0200
+
+ceph (0.80.7-2) unstable; urgency=medium
+
+ * Team upload.
+ * Build-Depends: +libjerasure-dev (>= 2.0.0-2~)
+ * New patch to use system "jerasure" library instead of its bundled copy.
+ * Removed myself from Uploaders.
- -- James Page <james.page@ubuntu.com> Fri, 05 Dec 2014 13:40:46 +0000
+ -- Dmitry Smirnov <onlyjob@debian.org> Thu, 11 Dec 2014 12:55:38 +1100
ceph (0.80.7-1) unstable; urgency=medium
Source: ceph
Section: admin
Priority: optional
-Maintainer: MOS ceph team <mos-ceph@mirantis.com>
-XSBC-Original-Maintainer: Ceph Maintainer <ceph-maintainers@list.ceph.com>
+Maintainer: Ceph Maintainers <ceph-maintainers@lists.ceph.com>
+Uploaders: Laszlo Boszormenyi (GCS) <gcs@debian.org>,
+ James Page <jamespage@debian.org>
Homepage: http://ceph.com/
+Vcs-Git: git://anonscm.debian.org/pkg-ceph/ceph.git
+Vcs-Browser: http://anonscm.debian.org/gitweb/?p=pkg-ceph/ceph.git
Build-Depends: debhelper (>= 9~),
default-jdk,
dh-autoreconf,
libboost-program-options-dev (>= 1.54),
libboost-system-dev (>= 1.54),
libboost-thread-dev (>= 1.54),
+ libboost-random-dev (>= 1.54),
libbabeltrace-ctf-dev,
libbabeltrace-dev,
libbz2-dev,
upstream-branch = upstream-hammer
pristine-tar = True
+[pq]
+patch-numbers = False
+
[import-orig]
filter = debian/*
--- /dev/null
+#!/bin/sh
+set -e
+# MOS packaging CI insists on keeping the source under the $pkgname directory
+# and the debianization files in debian/. Moving files around manually is a
+# bit error prone (it's easy to forget 'git add something'), hence this script.
+MYDIR="${0%/*}"
+cd ${MYDIR}/..
+
+mkdir -p -m 755 ceph
+git ls-files | grep -vE '^debian[/]' | xargs cp -a --parents --target-directory=ceph
+git add ceph
+git ls-files | grep -vE '^(debian|ceph)[/]' | xargs git rm -f --
+git commit -m 'Shuffle files for MOS CI'
+
+++ /dev/null
-From: Alexey Sheplyakov <asheplyakov@mirantis.com>
-Date: Fri, 26 Feb 2016 15:01:11 +0300
-Subject: ObjBencher::seq_read_bench: fix locking errors
-
-- take a lock before completion_ret
-- remove extraneous comparison: it's clearly misplaced (bad merge?)
- and tries to unlock a Mutex twice in a row
-
-Fixes: #14873
-
-Signed-off-by: Alexey Sheplyakov <asheplyakov@mirantis.com>
----
- src/common/obj_bencher.cc | 11 ++---------
- 1 file changed, 2 insertions(+), 9 deletions(-)
-
-diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc
-index db4fd8f..a196e83 100644
---- a/src/common/obj_bencher.cc
-+++ b/src/common/obj_bencher.cc
-@@ -598,13 +598,13 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
- index[slot] = data.started;
- lock.Unlock();
- completion_wait(slot);
-+ lock.Lock();
- r = completion_ret(slot);
- if (r < 0) {
- cerr << "read got " << r << std::endl;
- lock.Unlock();
- goto ERR;
- }
-- lock.Lock();
- total_latency += data.cur_latency;
- if (data.cur_latency > data.max_latency) data.max_latency = data.cur_latency;
- if (data.cur_latency < data.min_latency) data.min_latency = data.cur_latency;
-@@ -624,14 +624,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
- lock.Lock();
- ++data.started;
- ++data.in_flight;
-- lock.Unlock();
-- if (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0) {
-- cerr << name[slot] << " is not correct!" << std::endl;
-- ++errors;
-- } else {
-- lock.Unlock();
-- }
--
-+ lock.Unlock();
- name[slot] = newName;
- }
-
--- /dev/null
+From: Greg Farnum <gfarnum@redhat.com>
+Date: Mon, 23 May 2016 15:14:21 -0700
+Subject: ObjectCacher: fix bh_read_finish offset logic
+
+If we have an incoming read split across multiple BufferHeads, we want to
+line up the BufferHead's bl with the incoming OSDOp's bl at the right offset. We
+were erroneously using this nonsense calculation (always equal to zero!) when
+a much simpler comparison of the BufferHead's logical object offset to the
+incoming OSDOp's logical offset will do the trick nicely.
+
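+For example (hypothetical numbers): for a read starting at object offset
+start=0 that spans BufferHeads at [0,4096) and [4096,8192), the second
+BufferHead's data begins at bh->start() - start = 4096 within the incoming
+bl, whereas the old oldpos - bh->start() expression evaluated to 0, because
+oldpos had just been set to the previous BufferHead's end, i.e. bh->start().
+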
+Fixes: http://tracker.ceph.com/issues/16002
+
+Signed-off-by: Greg Farnum <gfarnum@redhat.com>
+(cherry picked from commit 9ec6e7f608608088d51e449c9d375844631dcdde)
+---
+ src/osdc/ObjectCacher.cc | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
+index b2c2572..cad168c 100644
+--- a/src/osdc/ObjectCacher.cc
++++ b/src/osdc/ObjectCacher.cc
+@@ -787,7 +787,6 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, ceph_tid_t tid,
+ if (bh->error < 0)
+ err = bh->error;
+
+- loff_t oldpos = opos;
+ opos = bh->end();
+
+ if (r == -ENOENT) {
+@@ -807,7 +806,7 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, ceph_tid_t tid,
+ mark_error(bh);
+ } else {
+ bh->bl.substr_of(bl,
+- oldpos-bh->start(),
++ bh->start() - start,
+ bh->length());
+ mark_clean(bh);
+ }
--- /dev/null
+From: Alexey Sheplyakov <asheplyakov@mirantis.com>
+Date: Thu, 6 Oct 2016 19:13:00 +0300
+Subject: Remove HITSET_GMT related code so 0.94.{6,9} OSDs/mons can coexist
+
+Revert
+ - "osd: do not let OSD_HITSET_GMT reuse the feature bit"
+ - "osd/osd_types: encode pg_pool_t the old way"
+ - "osd: Decode use_gmt_hitset with a unique version"
+ - "mon: disable gmt_hitset if not supported"
+ - "mon: print use_gmt_hitset in "ceph osd pool get"
+ - "mon: add "ceph osd pool set $pool use_gmt_hitset true" cmd"
+ - "osd: use GMT time for the object name of hitsets"
+
+This reverts commits
+ - 7aec079f8a1bbe75625c438a17bb87e45398568e
+ - f8d2abd2e41c5dd04977f85cc1d6e65853c9a1b2
+ - 370434136ef076c350db3db4fca6489f88f70453
+ - 720a090eb67b3955b0cadb7633c5a28a934171a4
+ - 64bca2a43b34b265621bad2ec1fb980217223847
+ - 87df212cfca33efbbee6376f528cb7d4895d1dc0
+ - 039240418060c9a49298dacc0478772334526dce
+
+Required to allow 0.94.6 OSDs and monitors to inter-operate with 0.94.9 ones.
+
+The commit 039240418060c9a49298dacc0478772334526dce which fixes bug #9732
+breaks upgrade from 0.94.6 (which is shipped with MOS 8.x and 9.[01]) to
+newer versions, see http://tracker.ceph.com/issues/17386 for more details.
+Since MOS does not use cache pools, and having a wrong time zone would
+cause multiple problems anyway, revert the above-mentioned commit (along
+with the ones trying to address the breakage it causes) so a mixed cluster
+can work properly.
+---
+ src/common/config_opts.h | 1 -
+ src/include/ceph_features.h | 2 --
+ src/mon/MonCommands.h | 2 +-
+ src/mon/OSDMonitor.cc | 38 --------------------
+ src/osd/ReplicatedPG.cc | 27 ++++++--------
+ src/osd/ReplicatedPG.h | 4 +--
+ src/osd/osd_types.cc | 85 +++------------------------------------------
+ src/osd/osd_types.h | 11 +++---
+ 8 files changed, 20 insertions(+), 150 deletions(-)
+
+diff --git a/src/common/config_opts.h b/src/common/config_opts.h
+index c55694e..e773300 100644
+--- a/src/common/config_opts.h
++++ b/src/common/config_opts.h
+@@ -500,7 +500,6 @@ OPTION(osd_client_message_cap, OPT_U64, 100) // num client messages
+ OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd
+ OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd
+ OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
+-OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
+ OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
+ OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
+ OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
+diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
+index 205e18f..781df1b 100644
+--- a/src/include/ceph_features.h
++++ b/src/include/ceph_features.h
+@@ -64,7 +64,6 @@
+ // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
+ #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
+ #define CEPH_FEATURE_MON_METADATA (1ULL<<50)
+-#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
+ /* ... */
+ #define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
+
+@@ -152,7 +151,6 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
+ CEPH_FEATURE_MDS_QUOTA | \
+ CEPH_FEATURE_CRUSH_V4 | \
+ CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY | \
+- CEPH_FEATURE_OSD_HITSET_GMT | \
+ CEPH_FEATURE_HAMMER_0_94_4 | \
+ 0ULL)
+
+diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
+index 3f0dae8..a66cc55 100644
+--- a/src/mon/MonCommands.h
++++ b/src/mon/MonCommands.h
+@@ -634,7 +634,7 @@ COMMAND("osd pool get " \
+ "get pool parameter <var>", "osd", "r", "cli,rest")
+ COMMAND("osd pool set " \
+ "name=pool,type=CephPoolname " \
+- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \
++ "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \
+ "name=val,type=CephString " \
+ "name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
+ "set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
+diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
+index a006dbd..afaea9f 100644
+--- a/src/mon/OSDMonitor.cc
++++ b/src/mon/OSDMonitor.cc
+@@ -16,7 +16,6 @@
+ *
+ */
+
+-#include <algorithm>
+ #include <sstream>
+
+ #include "OSDMonitor.h"
+@@ -1648,9 +1647,6 @@ void OSDMonitor::take_all_failures(list<MOSDFailure*>& ls)
+ failure_info.clear();
+ }
+
+-static bool uses_gmt_hitset(const std::pair<int64_t, pg_pool_t>& pool) {
+- return pool.second.use_gmt_hitset;
+-}
+
+ // boot --
+
+@@ -1720,19 +1716,6 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
+ }
+ }
+
+- if (std::find_if(osdmap.get_pools().begin(),
+- osdmap.get_pools().end(),
+- uses_gmt_hitset) != osdmap.get_pools().end()) {
+- assert(osdmap.get_num_up_osds() == 0 ||
+- osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
+- if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
+- dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
+- << m->get_orig_source_inst()
+- << " doesn't announce support -- ignore" << dendl;
+- goto ignore;
+- }
+- }
+-
+ // already booted?
+ if (osdmap.is_up(from) &&
+ osdmap.get_inst(from) == m->get_orig_source_inst()) {
+@@ -3174,7 +3157,6 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
+ if (!p->is_tier() &&
+ (var == "hit_set_type" || var == "hit_set_period" ||
+ var == "hit_set_count" || var == "hit_set_fpp" ||
+- var == "use_gmt_hitset" ||
+ var == "target_max_objects" || var == "target_max_bytes" ||
+ var == "cache_target_full_ratio" ||
+ var == "cache_target_dirty_ratio" ||
+@@ -3227,8 +3209,6 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
+ BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+ f->dump_float("hit_set_fpp", bloomp->get_fpp());
+ }
+- } else if (var == "use_gmt_hitset") {
+- f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
+ } else if (var == "target_max_objects") {
+ f->dump_unsigned("target_max_objects", p->target_max_objects);
+ } else if (var == "target_max_bytes") {
+@@ -3286,8 +3266,6 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
+ }
+ BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+ ss << "hit_set_fpp: " << bloomp->get_fpp();
+- } else if (var == "use_gmt_hitset") {
+- ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
+ } else if (var == "target_max_objects") {
+ ss << "target_max_objects: " << p->target_max_objects;
+ } else if (var == "target_max_bytes") {
+@@ -4170,11 +4148,6 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
+ pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
+ if (g_conf->osd_pool_default_flag_nosizechange)
+ pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
+- if (g_conf->osd_pool_use_gmt_hitset &&
+- (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
+- pi->use_gmt_hitset = true;
+- else
+- pi->use_gmt_hitset = false;
+
+ pi->size = size;
+ pi->min_size = min_size;
+@@ -4518,17 +4491,6 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
+ }
+ BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
+ bloomp->set_fpp(f);
+- } else if (var == "use_gmt_hitset") {
+- if (val == "true" || (interr.empty() && n == 1)) {
+- if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
+- ss << "not all OSDs support GMT hit set.";
+- return -EINVAL;
+- }
+- p.use_gmt_hitset = true;
+- } else {
+- ss << "expecting value 'true' or '1'";
+- return -EINVAL;
+- }
+ } else if (var == "debug_fake_ec_pool") {
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.flags |= pg_pool_t::FLAG_DEBUG_FAKE_EC_POOL;
+diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
+index 1676a3e..429f9de 100644
+--- a/src/osd/ReplicatedPG.cc
++++ b/src/osd/ReplicatedPG.cc
+@@ -1135,7 +1135,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
+ p != info.hit_set.history.end();
+ ++p) {
+ if (stamp >= p->begin && stamp <= p->end) {
+- oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
++ oid = get_hit_set_archive_object(p->begin, p->end);
+ break;
+ }
+ }
+@@ -10177,19 +10177,10 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp)
+ return hoid;
+ }
+
+-hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start,
+- utime_t end,
+- bool using_gmt)
++hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
+ {
+ ostringstream ss;
+- ss << "hit_set_" << info.pgid.pgid << "_archive_";
+- if (using_gmt) {
+- start.gmtime(ss) << "_";
+- end.gmtime(ss);
+- } else {
+- start.localtime(ss) << "_";
+- end.localtime(ss);
+- }
++ ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
+ hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
+ info.pgid.ps(), info.pgid.pool(),
+ cct->_conf->osd_hit_set_namespace);
+@@ -10326,7 +10317,7 @@ void ReplicatedPG::hit_set_persist()
+ for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
+ p != info.hit_set.history.end();
+ ++p) {
+- hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
++ hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
+
+ // Once we hit a degraded object just skip further trim
+ if (is_degraded_or_backfilling_object(aoid))
+@@ -10335,8 +10326,10 @@ void ReplicatedPG::hit_set_persist()
+ return;
+ }
+
+- oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset);
++ oid = get_hit_set_archive_object(start, now);
+ // If the current object is degraded we skip this persist request
++ if (is_degraded_or_backfilling_object(oid))
++ return;
+ if (scrubber.write_blocked_by_scrub(oid))
+ return;
+
+@@ -10427,7 +10420,7 @@ void ReplicatedPG::hit_set_persist()
+
+ updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
+ hit_set_create();
+- updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset);
++ updated_hit_set_hist.current_info = pg_hit_set_info_t();
+ updated_hit_set_hist.current_last_stamp = utime_t();
+
+ // fabricate an object_info_t and SnapSet
+@@ -10490,7 +10483,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
+ for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
+ list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
+ assert(p != updated_hit_set_hist.history.end());
+- hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
++ hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+
+ assert(!is_degraded_or_backfilling_object(oid));
+
+@@ -10775,7 +10768,7 @@ void ReplicatedPG::agent_load_hit_sets()
+ continue;
+ }
+
+- hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
++ hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+ if (is_unreadable_object(oid)) {
+ dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
+ break;
+diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
+index 0894be6..c8ed4fc 100644
+--- a/src/osd/ReplicatedPG.h
++++ b/src/osd/ReplicatedPG.h
+@@ -903,9 +903,7 @@ protected:
+ void hit_set_in_memory_trim(); ///< discard old in memory HitSets
+
+ hobject_t get_hit_set_current_object(utime_t stamp);
+- hobject_t get_hit_set_archive_object(utime_t start,
+- utime_t end,
+- bool using_gmt);
++ hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
+
+ // agent
+ boost::scoped_ptr<TierAgentState> agent_state;
+diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
+index b13925c..f0126bc 100644
+--- a/src/osd/osd_types.cc
++++ b/src/osd/osd_types.cc
+@@ -926,7 +926,6 @@ void pg_pool_t::dump(Formatter *f) const
+ f->close_section(); // hit_set_params
+ f->dump_unsigned("hit_set_period", hit_set_period);
+ f->dump_unsigned("hit_set_count", hit_set_count);
+- f->dump_bool("use_gmt_hitset", use_gmt_hitset);
+ f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
+ f->dump_unsigned("stripe_width", get_stripe_width());
+ f->dump_unsigned("expected_num_objects", expected_num_objects);
+@@ -1239,60 +1238,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
+ return;
+ }
+
+- if ((features & CEPH_FEATURE_OSD_HITSET_GMT) == 0) {
+- // CEPH_FEATURE_OSD_HITSET_GMT requires pg_pool_t v21 which has
+- // use_gmt_hitset, and two fields added before v21. it's backward
+- // compatible, but re-encoding the same osdmap with different ceph
+- // versions causes CRC mismatch at the OSD side, the tracker#12410
+- // prevents the monitor from sending the single full map requested
+- // by OSD. so we need a way to encode pg_pool_t the same old way.
+- ENCODE_START(17, 5, bl);
+- ::encode(type, bl);
+- ::encode(size, bl);
+- ::encode(crush_ruleset, bl);
+- ::encode(object_hash, bl);
+- ::encode(pg_num, bl);
+- ::encode(pgp_num, bl);
+- __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
+- ::encode(lpg_num, bl);
+- ::encode(lpgp_num, bl);
+- ::encode(last_change, bl);
+- ::encode(snap_seq, bl);
+- ::encode(snap_epoch, bl);
+- ::encode(snaps, bl, features);
+- ::encode(removed_snaps, bl);
+- ::encode(auid, bl);
+- ::encode(flags, bl);
+- ::encode(crash_replay_interval, bl);
+- ::encode(min_size, bl);
+- ::encode(quota_max_bytes, bl);
+- ::encode(quota_max_objects, bl);
+- ::encode(tiers, bl);
+- ::encode(tier_of, bl);
+- __u8 c = cache_mode;
+- ::encode(c, bl);
+- ::encode(read_tier, bl);
+- ::encode(write_tier, bl);
+- ::encode(properties, bl);
+- ::encode(hit_set_params, bl);
+- ::encode(hit_set_period, bl);
+- ::encode(hit_set_count, bl);
+- ::encode(stripe_width, bl);
+- ::encode(target_max_bytes, bl);
+- ::encode(target_max_objects, bl);
+- ::encode(cache_target_dirty_ratio_micro, bl);
+- ::encode(cache_target_full_ratio_micro, bl);
+- ::encode(cache_min_flush_age, bl);
+- ::encode(cache_min_evict_age, bl);
+- ::encode(erasure_code_profile, bl);
+- ::encode(last_force_op_resend, bl);
+- ::encode(min_read_recency_for_promote, bl);
+- ::encode(expected_num_objects, bl);
+- ENCODE_FINISH(bl);
+- return;
+- }
+-
+- ENCODE_START(21, 5, bl);
++ ENCODE_START(17, 5, bl);
+ ::encode(type, bl);
+ ::encode(size, bl);
+ ::encode(crush_ruleset, bl);
+@@ -1334,15 +1280,12 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
+ ::encode(last_force_op_resend, bl);
+ ::encode(min_read_recency_for_promote, bl);
+ ::encode(expected_num_objects, bl);
+- ::encode(uint32_t(.6 * 1e6), bl);
+- ::encode(uint32_t(1), bl);
+- ::encode(use_gmt_hitset, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void pg_pool_t::decode(bufferlist::iterator& bl)
+ {
+- DECODE_START_LEGACY_COMPAT_LEN(21, 5, 5, bl);
++ DECODE_START_LEGACY_COMPAT_LEN(17, 5, 5, bl);
+ ::decode(type, bl);
+ ::decode(size, bl);
+ ::decode(crush_ruleset, bl);
+@@ -1454,19 +1397,6 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
+ } else {
+ expected_num_objects = 0;
+ }
+- if (struct_v >= 19) {
+- uint32_t dummy;
+- ::decode(dummy, bl);
+- }
+- if (struct_v >= 20) {
+- uint32_t dummy;
+- ::decode(dummy, bl);
+- }
+- if (struct_v >= 21) {
+- ::decode(use_gmt_hitset, bl);
+- } else {
+- use_gmt_hitset = false;
+- }
+ DECODE_FINISH(bl);
+ calc_pg_masks();
+ }
+@@ -3866,25 +3796,19 @@ void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
+
+ void pg_hit_set_info_t::encode(bufferlist& bl) const
+ {
+- ENCODE_START(2, 1, bl);
++ ENCODE_START(1, 1, bl);
+ ::encode(begin, bl);
+ ::encode(end, bl);
+ ::encode(version, bl);
+- ::encode(using_gmt, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void pg_hit_set_info_t::decode(bufferlist::iterator& p)
+ {
+- DECODE_START(2, p);
++ DECODE_START(1, p);
+ ::decode(begin, p);
+ ::decode(end, p);
+ ::decode(version, p);
+- if (struct_v >= 2) {
+- ::decode(using_gmt, p);
+- } else {
+- using_gmt = false;
+- }
+ DECODE_FINISH(p);
+ }
+
+@@ -3893,7 +3817,6 @@ void pg_hit_set_info_t::dump(Formatter *f) const
+ f->dump_stream("begin") << begin;
+ f->dump_stream("end") << end;
+ f->dump_stream("version") << version;
+- f->dump_stream("using_gmt") << using_gmt;
+ }
+
+ void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
+diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
+index 92f6163..7557494 100644
+--- a/src/osd/osd_types.h
++++ b/src/osd/osd_types.h
+@@ -1035,7 +1035,6 @@ public:
+ HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
+ uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
+ uint32_t hit_set_count; ///< number of periods to retain
+- bool use_gmt_hitset; ///< use gmt to name the hitset archive object
+ uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote
+
+ uint32_t stripe_width; ///< erasure coded stripe size in bytes
+@@ -1064,7 +1063,6 @@ public:
+ hit_set_params(),
+ hit_set_period(0),
+ hit_set_count(0),
+- use_gmt_hitset(true),
+ min_read_recency_for_promote(0),
+ stripe_width(0),
+ expected_num_objects(0)
+@@ -1602,11 +1600,10 @@ WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
+ struct pg_hit_set_info_t {
+ utime_t begin, end; ///< time interval
+ eversion_t version; ///< version this HitSet object was written
+- bool using_gmt; ///< use gmt for creating the hit_set archive object name
+- pg_hit_set_info_t(bool using_gmt = true)
+- : using_gmt(using_gmt) {}
+- pg_hit_set_info_t(utime_t b, bool using_gmt)
+- : begin(b), using_gmt(using_gmt) {}
++
++ pg_hit_set_info_t() {}
++ pg_hit_set_info_t(utime_t b)
++ : begin(b) {}
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::iterator &bl);
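
The comment deleted in the pg_pool_t::encode() hunk above states the underlying constraint: peers may re-encode the same osdmap, so every ceph version must produce byte-identical encodings or the CRC check trips (tracker#12410). The compatibility mechanism both sides rely on is versioned encoding: the encoder writes a struct_v header and the decoder gates newer fields on it, substituting defaults for older encodings. A minimal, self-contained C++ sketch of that pattern (a raw byte vector and hand-rolled helpers stand in for Ceph's bufferlist and ENCODE_START/DECODE_START macros; HitSetInfo is hypothetical and only mirrors the pg_hit_set_info_t fields above):

// versioned_encoding_sketch.cpp -- a simplified stand-in for Ceph's
// ENCODE_START/DECODE_START machinery, using a raw byte vector instead
// of bufferlist. HitSetInfo and its helpers are hypothetical; only the
// version-gating pattern mirrors the pg_hit_set_info_t code above.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

struct HitSetInfo {
  uint64_t begin = 0, end = 0;
  bool using_gmt = false;  // the struct_v 2 field this patch removes

  // struct_v 1 layout: exactly what pre-GMT decoders expect.
  void encode_v1(std::vector<uint8_t>& bl) const {
    bl.push_back(1);  // struct_v
    put_u64(bl, begin);
    put_u64(bl, end);
  }

  // struct_v 2 layout: appends the new field after the old ones.
  void encode_v2(std::vector<uint8_t>& bl) const {
    bl.push_back(2);  // struct_v
    put_u64(bl, begin);
    put_u64(bl, end);
    bl.push_back(using_gmt ? 1 : 0);
  }

  // One decoder handles both: newer fields are gated on struct_v and
  // defaulted otherwise, like the `if (struct_v >= 2)` branch deleted above.
  void decode(const std::vector<uint8_t>& bl) {
    size_t off = 0;
    uint8_t struct_v = bl[off++];
    begin = get_u64(bl, off);
    end = get_u64(bl, off);
    using_gmt = (struct_v >= 2) ? (bl[off++] != 0) : false;
  }

 private:
  static void put_u64(std::vector<uint8_t>& bl, uint64_t v) {
    for (int i = 0; i < 8; ++i) bl.push_back(uint8_t(v >> (8 * i)));
  }
  static uint64_t get_u64(const std::vector<uint8_t>& bl, size_t& off) {
    uint64_t v = 0;
    for (int i = 0; i < 8; ++i) v |= uint64_t(bl[off++]) << (8 * i);
    return v;
  }
};

int main() {
  HitSetInfo in;
  in.begin = 10; in.end = 20; in.using_gmt = true;

  std::vector<uint8_t> v1, v2;
  in.encode_v1(v1);  // what this patch makes hammer emit again
  in.encode_v2(v2);

  HitSetInfo out;
  out.decode(v1);
  assert(out.begin == 10 && out.end == 20 && !out.using_gmt);  // defaulted
  out.decode(v2);
  assert(out.using_gmt);  // present in v2 encodings
  return 0;
}

Ceph's real macros additionally record a compat version and a payload length so a decoder can skip unknown trailing fields; the patch nevertheless pins the pg_pool_t encoder back to struct_v 17 because mixed-version re-encoding must stay byte-identical for the osdmap CRC to match.
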
1 file changed, 44 insertions(+), 12 deletions(-)
diff --git a/src/ceph-disk b/src/ceph-disk
-index 7620ff8..49bc978 100755
+index 0525945..0e12e04 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -88,6 +88,7 @@ DMCRYPT_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-5ec00ceff2be'
## MOS
ceph-disk-fix-symlinks-handling.patch
-ObjBencher-seq_read_bench-fix-locking-errors.patch
+ObjectCacher-fix-bh_read_finish-offset-logic.patch
+Remove-HITSET_GMT-related-code-so-0.94.-6-9-OSDs-mon.patch
--- /dev/null
+#!/bin/sh
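+# Build Ceph hammer packages for trusty with git-buildpackage/sbuild and
+# publish the results into the local reprepro repository (the repo and
+# export paths below are site-specific).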
+set -e
+repo=/srv/data/Public/repos/ceph
+dist=trusty
+ceph_release=hammer
+export_dir="../build-pkg-ceph-${ceph_release}-${dist}"
+if [ ! -d "$export_dir" ]; then mkdir -p "$export_dir"; fi
+
+exec gbp buildpackage \
+ --git-ignore-new \
+ --git-pristine-tar \
+ --git-pristine-tar-commit \
+ --git-export-dir="$export_dir" \
+ --git-cleaner='git clean -dfx' \
+ --git-builder="sbuild -v --dist=${dist} --post-build-commands \"reprepro -Vb${repo} --ignore=wrongdistribution --ignore=missingfile include ${ceph_release}-${dist} %SBUILD_CHANGES\"" \
+ "$@"
# Enable hardening
export DEB_BUILD_MAINT_OPTIONS = hardening=+all
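+# export dpkg-buildflags (CFLAGS/CXXFLAGS/LDFLAGS, incl. the hardening
+# flags enabled above) into the build environment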
+DPKG_EXPORT_BUILDFLAGS = 1
+include /usr/share/dpkg/buildflags.mk
export DEB_HOST_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)