From: Alexey Sheplyakov Date: Thu, 1 Sep 2016 14:39:56 +0000 (+0300) Subject: Update to ceph 0.94.9 X-Git-Url: https://review.fuel-infra.org/gitweb?a=commitdiff_plain;h=a78b8333d50dd6ab2a40ee44203d1d1a9ee0ad2f;p=packages%2Ftrusty%2Fceph.git Update to ceph 0.94.9 Fixes a number of bugs including but not limited to * "monitor crashes on a command without a prefix" (CVE-2016-5009), http://tracker.ceph.com/issues/16297 * "pool quota alarm is not in effect", http://tracker.ceph.com/issues/15478 * "OSD reports ENOTEMPTY and crashes", http://tracker.ceph.com/issues/14766 * "reweight-by-utilization should consider the least used OSDs first", http://tracker.ceph.com/issues/15770 * "no Last-Modified, Content-Size and X-Object-Manifest headers if no segments in DLO manifest", http://tracker.ceph.com/issues/15812 The code has been downloaded from http://download.ceph.com/tarballs/ceph-0.94.9.tar.gz Debianization has been borrowed from http://anonscm.debian.org/cgit/pkg-ceph/ceph.git/commit/?h=hammer&id=300878169869e3ca73051f6ca671d4d209d4cef6 Additional patches: - debian/patches/ceph-disk-fix-symlinks-handling.patch makes ceph-disk work with udev generated symlinks, see https://github.com/ceph/ceph/pull/7123 - debian/patches/ObjectCacher-fix-bh_read_finish-offset-logic.patch, fixes RBD cache data corruption, http://tracker.ceph.com/issues/17545 - debian/patches/Remove-HITSET_GMT-related-code-so-0.94.-6-9-OSDs-mon.patch reverts several upstream commits which breaks the osdmap compatibility between 0.94.6 and 0.94.7 (and newer), see http://tracker.ceph.com/issues/17386 and references therein for more details for more details Change-Id: I67b34197896808f3c942d00dd35825bd5d68fb0f --- diff --git a/ceph/.gitignore b/ceph/.gitignore new file mode 100644 index 00000000..b408e6cd --- /dev/null +++ b/ceph/.gitignore @@ -0,0 +1 @@ +/.pc diff --git a/ceph/AUTHORS b/ceph/AUTHORS index 6d200207..5244daf7 100644 --- a/ceph/AUTHORS +++ b/ceph/AUTHORS @@ -5,6 +5,7 @@ Abhishek Lekshmanan Accela 
Zhao Adam C. Emerson Adam Crume +Adam Kupczyk Adam Manzanares Adam Spiers Adam Twardowski @@ -38,6 +39,7 @@ Anols Anton Aksola Anton Blanchard apovzner +Aran85 Ariela Aristoteles Neto Armando Segnini @@ -52,6 +54,7 @@ Billy Olsen BJ Lougee Bjørnar Ness Blaine Gardner +blinke Bo Cai Boris Ranto Brad Hubbard @@ -88,6 +91,7 @@ Colin P. McCabe Dan Chai Daniel Gollub Daniel Gryniewicz +Daniel Gryniewicz Daniel J. Hofmann Dan Mick Dan Mick @@ -110,6 +114,7 @@ Dongmao Zhang Dongsu Park Dong Yuan Douglas Fuller +Dunrong Huang dwj192 Eleanor Cawthon Emily Popper @@ -142,6 +147,7 @@ Guangliang Zhao Guang Yang guce Guilhem Lettron +Gu Zhongyan Haifeng Liu Hannes Reinecke Hannu Valtonen @@ -161,6 +167,7 @@ Ian Holsman Igor Fedotov Ilya Dryomov Ilya Dryomov +Ira Cooper islepnev James Page James Ryan Cresawn @@ -170,6 +177,9 @@ Jason Dillaman Javier M. Mellid Jeff Weber Jenkins Build Slave User +Jenkins Build Slave User +Jenkins Build Slave User +Jenkins Build Slave User Jenkins Jenkins Jens-Christian Fischer @@ -242,6 +252,7 @@ marnberg Martin Ettl Matt Benjamin Matt Benjamin +Matt Benjamin Matthew Roy Matthew Wodrich Matt Richards @@ -261,9 +272,11 @@ Moritz Möller Mouad Benchchaoui Mykola Golub Mykola Golub +Mykola Golub nairolf21 Nathan Cutler Nathan Cutler +Nathan Cutler Neil Horman Neil Levine Nikola Kotur @@ -297,6 +310,7 @@ rca renhwztetecs riccardo80 Riccardo Ferretti +Richard W.M. Jones ritz303 Roald J. 
van Loon RobertJansen1 @@ -306,6 +320,7 @@ Rohan Mars Roman Haritonov Ron Allred Rongze Zhu +root root Ross Turk Ross Turk @@ -328,8 +343,11 @@ Sean Channel Sebastien Han Sebastien Ponce Sharif Olorin +shawn Shawn Edwards shishir gowda +shun-s +shun-s Shu, Xinxin Shylesh Kumar Simone Gotti @@ -376,6 +394,7 @@ Venky Shankar Vicente Cheng Vikhyat Umrao Viktor Suprun +Vitja Makarov Volker Assmann VRan Liu Vu Pham @@ -395,9 +414,11 @@ Xan Peng Xiaowei Chen Xiaoxi Chen xiexingguo <258156334@qq.com> +xie xingguo Xihui He Xing Lin Xinze Chi +Xinze Chi Xinze Chi Xiong Yiliang yangruifeng @@ -413,5 +434,6 @@ Yunchuan Wen Yuri Weinstein Zhe Zhang Zhiqiang Wang +Zhi Zhang Zhi Zhang zqkkqz diff --git a/ceph/ChangeLog b/ceph/ChangeLog index 273c1d62..d5039a15 100644 --- a/ceph/ChangeLog +++ b/ceph/ChangeLog @@ -1,11 +1,290 @@ -e832001 (HEAD, tag: v0.94.6, origin/hammer) 0.94.6 +fe6d859 (HEAD, tag: v0.94.9, origin/hammer) 0.94.9 +27d8055 Revert "moved to use boost uuid implementation, based on commit 4fe89a7b14c97b2ed7f357132901beb2bdcec551" +21f6f1d Revert "uuid: use boost::random:random_device" +a219cf5 doc: release-notes.rst: add missing line to v0.94.8 +a6ba101 doc: add missing changelogs up to 0.94.8 +f3dad33 doc: release-notes: add missing hammer releases +838cd35 (tag: v0.94.8) 0.94.8 +5248929 rocksdb: disable tcmalloc if disabled +fdfcd9b ceph.spec: respect CEPH_EXTRA_CONFIGURE_ARGS +d5274a3 rgw: fix subuser rm command failure +f963774 rgw: add a method to purge all associate keys when removing a subuser +0d4b601 doc: fix by-parttypeuuid in ceph-disk(8) nroff +a3003f6 rgw: reset return code in when iterating over the bucket the objects +64211fa rgw: fix compilation +3e45c6b rgw: add bucket_quota to RGWRegionMap::encode_json +699b7c8 rgw: Have a flavor of bucket deletion to bypass GC and to trigger object deletions async. 
+81aef60 rgw: remove bucket index objects when deleting the bucket +23498a9 mon/OSDMonitor: avoid potential expensive grace calculation +1b6f6f2 mon/OSDMonitor: improve reweight_by_utilization() logic +474abb8 OSDMonitor: drop pg temp from sources other than the current primary +b31ac2d osd: reset session->osdmap if session is not waiting for a map anymore +3a30ffc qa: Add test for #13829 +f70e4ad common: Allow config set with negative value +0498969 log: do not repeat errors to stderr +2633ec3 mds: disallow 'open truncate' non-regular inode +3f0fb20 mds: only open non-regular inode with mode FILE_MODE_PIN +2c18015 rgw: fix multi-delete query param parsing. +8a39e5e configure: Add -D_LARGEFILE64_SOURCE to Linux build. +3bb248b replcatedBackend: delete one useless op->mark_started as there are two in ReplicatedBackend::sub_op_modify_impl delete one mark_start event as there are two same op->mark_started in ReplicatedBackend::sub_op_modify_impl Fixes: http://tracker.ceph.com/issues/16572 +ed4ca7c rgw: Set Access-Control-Allow-Origin to a Asterisk if allowed in a rule +b78a1be mon: Monitor: validate prefix on handle_command() +850881c rgw: fix subuser rm command failure +055427c Pipe: take a ref to existing while we are waiting +24cc4f9 rgw: check for -ERR_NOT_MODIFIED in rgw_rest_s3.cc +7dbace5 erasure-code: s/unreocvery/unfound/ +e726f21 test: add test-case for repair unrecovery-ec pg. +40b1c2b osd: Remove the duplicated func MissingLoc::get_all_missing. +47d5dfc osd: Fix ec pg repair endless when met unrecover object. 
+187d308 uuid: use boost::random:random_device +174de7f moved to use boost uuid implementation, based on commit 4fe89a7b14c97b2ed7f357132901beb2bdcec551 +5cd922c qa/workunits/rbd: respect RBD_CREATE_ARGS environment variable +1ac920b rgw: fix identification of canceled operation +a38f157 rgw: identify racing writes when using copy-if-newer +02f6d8a rgw: translate http error 304 to appropriate status +7319d76 rgw: fix if-modified-since boundary +5e4de5a rgw: add rgwx-copy-if-newer http header +006ea56 Revert "hammer: Scrub error: 0/1 pinned" +c294bd3 ReplicatedPG: adjust num_pinned in _delete_oid +43d1b92 test: Fix grouping of mon_cephdf_commands by moving to MON_TESTS +300c111 rgw: convert plain object to versioned (with null version) when removing +4eea92b rgw: handle stripe transition when flushing final pending_data_bl +f6076dd mds: wrongly treat symlink inode as normal file/dir when symlink inode is stale on kcephfs +ce313cd rgw: handle errors properly during GET on Swift's DLO. +410ff15 osdc/Objecter: upper bound watch_check result +d3eae0a osd: fix omap digest compare when scrub +dd29310 rgw: keep track of written_objs correctly +c2ea6db osd: remove all stale osdmaps in handle_osd_map() +ac0340a osd: populate the trim_thru epoch using MOSDMap.oldest_map +bb5e015 osd: dump full map bl at 20 when crc doesn't match +5057c34 obj_bencher: cosmetic display fixes +6d8ad0e common: Add space between timestamp and "min lat:" in bench output +3184998 [MON] Fixed calculation of %USED. Now it is shows (space used by all replicas)/(raw space available on OSDs). Before it was (size of pool)/(raw space available on OSDs). 
+fed256e mon: add a column 'RAW USED' for ceph df detail +139691c src/test/objectstore/store_test.cc: fix shards for new tests +221efb0 doc: s/by-parttype-uuid/by-parttypeuuid/ in ceph-disk +d56bdf9 (tag: v0.94.7) 0.94.7 +62f4fbe store_test: improve synthetic coverage of the ec stash case +b6bc9cb store_test: improve synthetic test coverage of long object names +ec74c12 TestRados: make half the objects have long names +9d1ee7c LFNIndex::lfn_translate: consider alt attr as well +6b821cc LFNIndex::created: fix return value with failure injection +f500435 store_test: add reproducer for #14766 +cbd5aaf osd/PG: update info.stats.* mappings on split +d1ab71f hammer: rgw: S3: set EncodingType in ListBucketResult +df4eadc rados: Add units to rados bench output +76c33de OSDMonitor: avoid underflow in reweight-by-utilization if max_change=1 +d96086a PGLog::rewind_divergent_log: fix rollback_info_trimmed_to before index() +e79162d TestPGLog: add test for 13965 +fb1b40f osd/Replicated: For CEPH_OSD_OP_WRITE, set data digest. +f024259 osd/ReplicatedPG: For obj has omap, it mean have omap data or omap header or have both. +7b3f1da mon/MonClient: fix shutdown race +ec02d8b PG: set epoch_created and parent_split_bits for child pg +049bc8a ceph-fuse: double decreased the count to trim caps +e20df80 osd/ReplicatedPG: make handle_watch_timeout no-op if !active +3cb72dd mon/OSDMonitor.cc: fix UNINTENDED_INTEGER_DIVISION +aab3a40 hammer: rbd snap rollback: restore the link to parent +3c03eee rgw:bucket link now set the bucket.instance acl +488a787 ECBackend: send subop writes and reads at normal priority +a2e7ca1 common/Cycles: Do not initialize Cycles globally. 
+ca0beef osd/OSD: fix build_past_intervals_parallel +fce7902 osd: When generating past intervals due to an import end at pg epoch +2c97cb3 rgw: fix compiling error +2aa1ea6 rgw: Multipart ListPartsResult ETag quotes +365f21b tests: be more generous with test timeout +c722d00 rgw: user quota may not adjust on bucket removal +77a4ed0 ceph.spec.in: disable lttng and babeltrace explicitly +97f474f cls_rbd: protect against excessively large object maps +ac3569c hammer: monclient: avoid key renew storm on clock skew +20f300e rgw: Do not send a Content-Length header on a 304 response +e53751d rgw: Do not send a Content-Type on a '304 Not Modified' response +19dbc25 rgw: dump_status() uses integer +c79b481 rgw: move status_num initialization into constructor +ceb8e19 rgw: Do not send a Content-Length header on status 204 +3ecdedd mds: fix stray purging in 'stripe_count > 1' case +f28477c rgw: do not abort when user passed bad parameters to set quota +9786394 rgw: do not abort when user passed bad parameters to set metadata +f8d2abd osd/osd_types: encode pg_pool_t the old way +720a090 mon: disable gmt_hitset if not supported +7aec079 osd: do not let OSD_HITSET_GMT reuse the feature bit +3704341 osd: Decode use_gmt_hitset with a unique version +64bca2a mon: print use_gmt_hitset in "ceph osd pool get" +87df212 mon: add "ceph osd pool set $pool use_gmt_hitset true" cmd +0392404 osd: use GMT time for the object name of hitsets +744e9f8 test/bufferlist: do not expect !is_page_aligned() after unaligned rebuild +0830275 osd/PG: fix generate_past_intervals +7eae05e osd/ReplicatedPG: do not proxy read *and* process op locally +be4a9fe osd/OSDMap: fix typo in summarize_mapping_stats +2072a53 qa/workunits: remove 'mds setmap' from workunits +01672b4 mon: Monitor: get rid of weighted clock skew reports +f90b8bc mon: Monitor: adaptative clock skew detection interval +57fd7f8 test/librados/test.cc: clean up EC pools' crush rules too +d4cf190 keyring permissions for mon daemon +1b922e5 
test/pybind/test_ceph_argparse: fix reweight-by-utilization tests +06a2a75 man/8/ceph.rst: remove invalid option for reweight-by-* +241f762 mon: remove range=100 from reweight-by-* commands +55ad2c7 mon: make max_osds an optional arg +f13cdea mon: make reweight max_change default configurable +f4b4ef7 mon/OSDMonitor: fix indentation +76eb3c8 qa/workunits/cephtool/test.sh: test reweight-by-x commands +9a9d147 osd/MonCommand: add/fix up 'osd [test-]reweight-by-{pg,utilization}' +6ec676d mon: add 'osd utilization' command +94134d9 osd/OSDMap: add summarize_mapping_stats +932f75d mon: make reweight-by-* max_change an argument +d8372ce osd: add mon_reweight_max_osds to limit reweight-by-* commands +6a422b2 osd: add mon_reweight_max_change option which limits reweight-by-* +d3635b7 test: add simple test for new reweight-by-* options +e993851 osd: add sure and no-increasing options to reweight-by-* +281d63d librbd: complete cache reads on cache's dedicate thread +621e3ae test: reproducer for writeback CoW deadlock +38b9c0b osdc/Objecter: call notify completion only once +f794ada tests: Add TEST_no_segfault_for_bad_keyring to test/mon/misc.sh +94da46b tests: make sure no segfault occurs when using some bad keyring +a371c0f auth: fix a crash issue due to CryptoHandler::create() failed +af5da4f auth: fix double PK11_DestroyContext() if PK11_DigestFinal() failed +c3f031a ceph-objectstore-tool, osd: Fix import handling +647723e tools, test: Add ceph-objectstore-tool to operate on the meta collection +d875620 common/obj_bencher.cc: make verify error fatal +04fe951 test/test_rados_tool.sh: force rados bench rand and seq +6a6754f hammer: tools: fix race condition in seq/rand bench (part 2) +3a5b102 hammer: tools: fix race condition in seq/rand bench (part 1) +c4ba93a client: use fuse_req_getgroups() to get group list +a84ed87 client: use thread local data to track fuse request +e7f299a client/Client.cc: remove only once used variable +16e3e2f client/Client.cc: fix realloc memory 
leak +b13ddc0 client: added permission check based on getgrouplist +562c0a9 configure.ac: added autoconf check for getgrouplist +e014ea8 init-ceph: check if /lib/lsb/init-functions exists +5726463 packaging: lsb_release build and runtime dependency +c63baeb global: do not start two daemons with a single pid-file (part 2) +9282c1d ceph-objectstore-tool: Add dry-run checking to ops missing it +efc2183 test: Remove redundant test output +3226615 test: osd-scrub-snaps.sh uses ceph-helpers.sh and added to make check +995a004 test: Verify replicated PG beyond just data after vstart +6afb5d3 osd: Use boost::optional instead of snap 0 for "all_clones" +750f817 test: Fix verify() used after import to also check xattr and omap +b8c9507 osd, test: When head missing a snapset, clones not an error +59fee8a test: Add test cases for xattr and omap ceph-objectstore-tool operations +0988b12 osd, test: Keep missing count and log number of missing clones +37be959 rados: Minor output changes for consistency across operations +6c51e48 test: Eliminate check for bogus "obj13/head snaps empty" error +e92505b ceph-objectstore-tool: Add new remove-clone-metadata object op +8f88b44 osd: Fix trim_object() to not crash on corrupt snapset +78b13f5 ceph-objectstore-tool: Improve object spec error handling +7b800b7 ceph-objectstore-tool: Add undocumented clear-snapset command for testing +7f398bd ceph-objectstore-tool: Add set-size command for objects +53dc87f ceph-objectstore-tool: Enhanced dump command replaces dump-info +a46fc66 test: Add some clones to ceph-objectstore-tool test +fd518e7 ceph-objectstore-tool: For corrupt objectstores, don't abort listing on errors +ad7825a ceph-objectstore-tool: Improve some error messages +26cbf14 ceph-objectstore-tool: White space fixes +0f78564 tools/rados: Improve xattr import handling so future internal xattrs ignored +c8e2772 test: Test scrubbing of snapshot problems +113d5c7 osd: Don't crash if OI_ATTR attribute is missing or corrupt +3af8f9e osd: 
Additional _scrub() check for snapset inconsistency +7103e74 osd: Better SnapSet scrub checking (find issues instead of asserting) +18af852 osd: Make the _scrub routine produce good output and detect errors properly +3a1b588 osd: Fix log message name of ceph-objectstore-tool +0fe3dfe ceph-objectstore-tool: add mark-complete operation +1bc8882 test: Fix failure test to find message anywhere in stderr +6ff4217 test: add test for {get,set}-inc-osdmap commands. +de80bbf test: Add debug argument to the ceph-objectstore-tool test +0643797 rados: Fix usage for "notify" command +5ba8649 test: add test for {get,set}-osdmap commands +3276258 tools, test: Some ceph-objectstore-tool error handling fixes +cfabcc1 tools/ceph-objectstore-tool: add get-inc-osdmap command +c7d0fda tools: Check for valid --op earlier so we can get a better error message +be24c50 tools/ceph-objectstore-tool: add set-inc-osdmap command +06dcf74 tools: Fix newlines in output of --op list +e44c042 tools/ceph-objectstore-tool: add "get-osdmap" command +3f9e467 tools: Fix dump-super which doesn't require pgid +c60eee1 tools/ceph-objectstore-tool: add "set-osdmap" command +cfe7d47 tools: Check and specify commands that require the pgid specification +df0e11e hobject_t: modify operator<< +6c8884b test, tools: Improve ceph-objectstore-tool import error handling and add tests +87a7f99 tools: For ec pools list objects in all shards if the pgid doesn't specify +9ca2f35 tools: clean up errors in ceph-objectstore-tool +78a59f8 test/ceph-objectstore-tool: Don't need stderr noise +eab0f24 osd: Show number of divergent_priors in log message +d58793d osd, tools: Always filter temp objects since not being exported +efc402e test/ceph-objectstore-tool: Show command that should have failed +88ac519 test: Add config changes to all tests to avoid order dependency +3d99ecd tools: Don't export temporary objects until we have persistent-temp objects +13360d3 test/ceph_objectstore_tool: Improve dump-journal testing +444ce0a 
ceph-objectstore-tool: Allow --pgid specified on import (for post split) +aed1c49 ceph-objectstore-tool: Invalidate pg stats when objects were skipped during pg import +af3f8ae ceph-objectstore-tool: Add dump-super to show OSDSuperblock in format specified +4dcf15b mds, include: Fix dump() numeric char array to include additional alpha chars +feecacf ceph-objectstore-tool: Add dump-journal as not requiring --pgid in usage +5e8fbb1 test: ceph_test_filejournal: Conform to test infrastructure requirements +c161cbf test: ceph_test_filejournal need to force aio because testing with a file +06d3f51 test: ceph_test_filejournal fix missing argument to FileJournal constructor +2078f63 test: ceph_test_filejournal Add check of journalq in WriteTrim test +ab893d7 test: Fix ceph-objectstore-tool test missing fd.close() +b5f2ccd test: Fix ceph-objectstore-tool test error message +848822d test: ceph-objectstore-tool: Remove duplicate debug messages, keep cmd/log/call together +771dcd9 test: ceph-objectstore-tool import after split testing +4f387b1 test: Use CEPH_DIR where appropriate +b337d67 test: Limit how long ceph-objectstore-tool test will wait for health +09cb8a4 test: Add optional arg to vstart() to provide additional args to vstart +b4ac42b test: Test ceph-objectstore-tool --op dump-journal output +729abf5 test: Pep8 fixes for ceph-objectstore-tool test +33813b6 test: Fix ceph-objectstore-tool test, overwrite OTHERFILE so second check is meaningful +f7ab316 osd: FileJournal: Add _fdump() that takes Formatter instead of ostream +99d3e17 osd: Add simple_dump() to FileJournal for unit testing +80fc57f osd: FileJournal clean-up +b8f4ea1 osd: Dump header in FileJournal::dump() +21c3c18 osd: FileJournal::read_entry() can't use a zero seq to check for corruption +288902f osd: Fix flushing in FileJournal::dump() +a935ce5 osd: Add admin socket feature set_recovery_delay +4ae3f88 ceph-objectstore-tool: For import/export --debug dump the log +cc5fa68 ceph-objectstore-tool: If object 
re-appears after removal, just skip it +d8ae1a9 ceph-objectstore-tool: Add --no-overwrite flag for import-rados +2dbf843 ceph-objectstore-tool: Remove list-lost because now we have --dry-run flag +3599174 ceph-objectstore-tool: Add --dry-run option +05d3b73 ceph-objectstore-tool: Add dump-info command to show object info +2d764c5 ceph-objectstore-tool: Use empty string for to specify pgmeta object +3a533d7 ceph-objectstore-tool: Add a couple of strategically placed prints +7947f4f ceph-objectstore-tool: Clean up error handling +83de86e ceph-objectstore-tool: Create section around log/missing/divergent_priors of --op log +ddfaa70 ceph-objectstore-tool: Add divergent_priors handling +add937c ceph-objectstore-tool: Add --force option which is used for import only +f332748 ceph-objectstore-tool: Fix pgid scan to skip snapdirs +3e68825 ceph-objectstore-tool: Add dump-journal op +aaff4d7 ceph-objectstore-tool: On any exit release CephContext so logging can flush +7445cf5 ceph-objectstore-tool: Check for keyvaluestore experimental feature +9da6c01 ceph-objectstore-tool: Eliminate obscure "Invalid params" error +c5ac7ce ceph-objectstore-tool: Check pgid validity earlier like we did before +18c49b6 Backport the merge commit of branch 'wip-journal-header' of git://github.com/XinzeChi/ceph +cf433ba global/pidfile: do not start two daemons with a single pid-file +b43c5b2 unittest_crypto: benchmark 100,000 CryptoKey::encrypt() calls +e832001 (tag: v0.94.6) 0.94.6 +a1fc101 crushtool: send --tree to stdout +4fb688d osd: write journal header by force when journal write close +31a2fc4 common/bit_vector: use hard-coded value for block size +3352b14 ceph.in: Notify user that 'tell' can't be used in interactive mode +14b5fea mon/LogMonitor: use the configured facility if log to syslog +10d29da os/LevelDBStore:fix bug when compact_on_mount +d5ba063 OSDMap: reset osd_primary_affinity shared_ptr when deepish_copy_from +9e0a165 OSD::consume_map: correctly remove pg shards which are no 
longer acting +5a450e6 mon: add mon_config_key prefix when sync full +b9a4ad9 Fixed the ceph get mdsmap assertion. 9ab5fd9 rgw-admin: document orphans commands in usage 0e1378e [backport] rgw: fix wrong etag calculation during POST on S3 bucket. 5c8d1d7 [backport] rgw: Make RGW_MAX_PUT_SIZE configurable f2ca42b doc: add orphans commands to radosgw-admin(8) e42ed6d man: rebuild manpages a8fc6a9 fsx: checkout old version until it compiles properly on miras +eb048a3 qa/workunits/post-file.sh: sudo +e9039f4 qa/workunits/post-file: pick a dir that's readable by world +902abe7 qa/workunits/post-file.sh: use /etc/default +1c8c708 librbd: ensure librados callbacks are flushed prior to destroying image +f892566 librbd: simplify IO flush handling +e5dfd3d WorkQueue: PointerWQ drain no longer waits for other queues +edf60b4 test: new librbd flatten test case +88ffcc2 ceph-disk: use blkid instead of sgdisk -i 1b02859 qa/fsstress.sh: fix 'cp not writing through dangling symlink' +f209819 [ceph-fuse] fix ceph-fuse writing to stale log file after log rotation 9109304 mon: compact full epochs also 2817ffc Check for full before changing the cached obc ae56de0 osd: recency should look at newest (not oldest) hitsets @@ -13,6 +292,8 @@ ae56de0 osd: recency should look at newest (not oldest) hitsets a5e4f70 man: document listwatchers cmd in "rados" manpage 46d626d rbd: remove canceled tasks from timer thread 24c0b27 rbd-replay: handle EOF gracefully +3d84420 PG::activate(): handle unexpected cached_removed_snaps more gracefully +ad4df3b rgw: warn on suspicious civetweb frontend parameters 70f1ba3 tools: monstore: add 'show-versions' command. 9260171 tools: ceph_monstore_tool: add inflate-pgmap command a1d5728 tools:support printing the crushmap in readable fashion. 
@@ -37,17 +318,28 @@ f7acd44 qa: remove legacy OS support from rbd/qemu-iotests 53742bd ceph_osd: Add required feature bits related to this branch to osd_required mask 3066231 osd: CEPH_FEATURE_CHUNKY_SCRUB feature now required 6379ff1 configure.ac: no use to add "+" before ac_ext=c +5c92d1d rgw: Add default quota config f96c812 rgw: fix reload on non Debian systems. cbb5c1f Fixing NULL pointer dereference +17d1b0d rgw: radosgw-admin bucket check --fix not work b2961ce rbd: fix bench-write 9cee89b Check that delta_sum.stats.sum.num_object_copies and delta_sum.stats.sum.num_object are greater than zero 1ab2b48 ReplicatedPG: fix sparse-read result code checking logic 86f5cf6 osd: clear pg_stat_queue after stopping pgs b0856ee osd: Test osd_find_best_info_ignore_history_les config in another assert b2f1e76 Compare parted output with the dereferenced path +df3f971 auth: return error code from encrypt/decrypt; make error string optional +224bb39 auth: optimize crypto++ key context +f11718d auth/Crypto: optimize libnss key +d1b6096 auth: refactor crypto key context +3249f48 auth/cephx: optimize signature check +51eaf98 auth/cephx: move signature calc into helper +c240da9 auth/Crypto: avoid memcpy on libnss crypto operation +86cc0f0 auth: make CryptoHandler implementations totally private 5264bc6 mon: OSDMonitor: do not assume a session exists in send_incremental() 4d0b9a1 log: Log.cc: Assign LOG_DEBUG priority to syslog calls 26e832e librbd: fix merge-diff for >2GB diff-files +f04e007 osd: log inconsistent shard sizes a9d3f07 osd/osd_types: skip encoding newer object_info_t fields if they are unused 1548a3f osd/ReplicatedPG: do not set local_mtime on non-tiered pool 98bdb09 osd/PGBackend: use mtime for digest decision if local_mtime is empty @@ -71,6 +363,7 @@ d02beff tracing: dynamic tracepoint provider helper e53d66e packaging: add new tracepoint probe shared libraries bb7c0f5 ceph.spec.in: add new tracepoint probe shared libraries e1da271 lttng: move tracepoint probes 
to dynamic libraries +b2393dc client: add InodeRef.h to make dist 8358fb8 revert: osd: use GMT time for hitsets 4420929 rgw: fix modification to index attrs when setting acls 8378aaf build/ops: rbd-replay moved from ceph-test-dbg to ceph-common-dbg @@ -83,6 +376,14 @@ a322317 librbd: resize should only update image size within header 6a40e4f ceph.spec.in: lttng in SLES12 only e508a44 ceph.spec.in: fix lttng/babeltrace conditionals 19c9546 packaging: move rbd-replay* to ceph-common +fa79bd9 client: use smart pointer to track 'cwd' and 'root_parents' +fcafc65 client: convert Inode::snapdir_parent to smart pointer +78cca4e client: use smart pointer to track temporary inode reference +bad6f33 client: convert CapSnap::in to smart pointer +4bb48ee client: convert Fh::inode to smart pointer +5bebb3a client: use smart pointers in MetaRequest +e18f1ae client: convert Dentry::inode to smart pointer +a7f6d2f client: hold reference for returned inode 3d3595f krbd: remove deprecated --quiet param from udevadm 4d81cd1 run_cmd: close parent process console file descriptors 2052187 init-rbdmap: Rewrite to use logger + clean-up @@ -3354,7 +3655,7 @@ fa0bd06 ceph-disk: bootstrap-osd keyring ignores --statedir 19be358 PG::actingset should be used when checking the number of acting OSDs for a given PG. Signed-off-by: Guang Yang 8253ead osdc/Objecter: use SafeTimer; make callbacks race-tolerant 6c37984 mailmap: Yehuda Sadeh name normalization -beff616 ceph-disk: set guid if reusing a journal partition +beff616f ceph-disk: set guid if reusing a journal partition 50e8040 tools: rados put /dev/null should write() and not create() 0b0a373 mailmap: update email address 188370a doc/release-notes: fix attributions for 8702 fix @@ -5078,7 +5379,7 @@ e720314 doc: Updated the OS Recommendations for Firefly. 2e3302c doc: Updated the example configuration. 5a31df2 doc: Updated doc for more recent versions. 
2eab1c1 Update RBD doc for OpenStack -a290d34 test_librbd_fsx: fix sign-compare gcc warning +a290d349 test_librbd_fsx: fix sign-compare gcc warning 40c48bc qa: add script to test krbd setting ro/rw ioctl b2542f8 rgw: set a default data extra pool name 94c8f70 doc: Made mention of "incomplete" status. @@ -10425,7 +10726,7 @@ ffe7045 install rules for init-rbdmap cfc1f2e rgw: modify RGWBucketInfo::creation_time encoding / decoding 4089001 rgw: Fix return value for swift user not found c73a24b rgw: end-marker serves as last value, not as upperbound -c414030 rgw: relax marker requirements for log apis +c4140303 rgw: relax marker requirements for log apis b21a41a rgw: update log APIs to use markers ce7d816 rgw: cls_log_entry has id field 064e92f Makefile.am: some more makefile rules fixes @@ -16404,7 +16705,7 @@ f69d025 conf: make dup lines override previous value 5f3ef77 mon: make pool snap creation ops idempotent 53aa959 objecter: return ENOENT/EEXIST on pool snap delete/create 507f99e librados: make snap create/destroy handle client-side errors -3715d20 mon: check for invalid pool snap creates in preprocess_op, too +3715d205 mon: check for invalid pool snap creates in preprocess_op, too 640e5fd qa: simple tests for 'ceph osd create|rm' commands 6f7837a mon: make 'osd rm ...' 
idempotent 4788567 qa: simple test for pool create/delete commands @@ -19888,7 +20189,7 @@ a50fbe2 PG: merge_old_entry: merged delete might not be in missing e07b956 rgw: implement some missing swift api, other swift fixes 5790a36 rgw: encoded swift key contains full user name f883e63 rgw: multiple swift keys per user -9b18e55 PG,OSD: clean up xlist::items on pg removal +9b18e55e PG,OSD: clean up xlist::items on pg removal b43b864 osd: fix race between op requeueing and _dispatch f7ec9f7 thread: whitespace fc3aac6 filestore: clean up error output @@ -27929,7 +28230,7 @@ bc9b863 kclient: include fs/{Kconfig,Makefile} in patchset 3a3ccd8 kclient: checkpatch cleanups 522f570 mds: fix default layout settings 38dbaa5 (tag: v0.16) v0.16 -e678fbc msgr: authorizer get/verify callbacks +e678fbc1 msgr: authorizer get/verify callbacks faa5fb5 msgr: get_authorizer hook? 56f45b4 objecter: Session type 8b04f9a auth: authorizer for osd diff --git a/ceph/ceph.spec b/ceph/ceph.spec index f0bd3ad4..0648d498 100644 --- a/ceph/ceph.spec +++ b/ceph/ceph.spec @@ -1,6 +1,11 @@ %bcond_with ocf %bcond_without cephfs_java +# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12 +%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315 +%bcond_without lttng +%endif + %if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600)) %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")} %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")} @@ -8,13 +13,8 @@ %{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d} -# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12 -%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315 -%global _with_lttng 1 -%endif - Name: ceph -Version: 0.94.6 +Version: 0.94.9 Release: 0%{?dist} Epoch: 1 Summary: User space components of the Ceph file system @@ -103,7 +103,10 @@ 
BuildRequires: %insserv_prereq BuildRequires: mozilla-nss-devel BuildRequires: keyutils-devel BuildRequires: libatomic-ops-devel -%else +Requires: lsb-release +BuildRequires: lsb-release +%endif +%if 0%{?fedora} || 0%{?rhel} Requires: gdisk BuildRequires: nss-devel BuildRequires: keyutils-libs-devel @@ -114,9 +117,11 @@ Requires(preun):chkconfig Requires(preun):initscripts BuildRequires: gperftools-devel Requires: python-flask +Requires: redhat-lsb-core +BuildRequires: redhat-lsb-core %endif # lttng and babeltrace for rbd-replay-prep -%if 0%{?_with_lttng} +%if %{with lttng} %if 0%{?fedora} || 0%{?rhel} BuildRequires: lttng-ust-devel BuildRequires: libbabeltrace-devel @@ -461,7 +466,7 @@ done %endif ./autogen.sh -MY_CONF_OPT="" +MY_CONF_OPT="$CEPH_EXTRA_CONFIGURE_ARGS" MY_CONF_OPT="$MY_CONF_OPT --with-radosgw" @@ -481,6 +486,10 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'` %endif --with-librocksdb-static=check \ $MY_CONF_OPT \ +%if %{without lttng} + --without-lttng \ + --without-babeltrace \ +%endif %{?_with_ocf} \ CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS" @@ -627,7 +636,7 @@ fi %{_libdir}/rados-classes/libcls_version.so* %dir %{_libdir}/ceph/erasure-code %{_libdir}/ceph/erasure-code/libec_*.so* -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/libos_tp.so* %{_libdir}/libosd_tp.so* %endif @@ -680,7 +689,7 @@ fi %{_bindir}/rbd %{_bindir}/rbd-replay %{_bindir}/rbd-replay-many -%if 0%{?_with_lttng} +%if %{with lttng} %{_bindir}/rbd-replay-prep %endif %{_bindir}/ceph-post-file @@ -786,7 +795,7 @@ fi %files -n librados2 %defattr(-,root,root,-) %{_libdir}/librados.so.* -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librados_tp.so.* %endif @@ -809,7 +818,7 @@ fi %{_includedir}/rados/rados_types.hpp %{_includedir}/rados/memory.h %{_libdir}/librados.so -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librados_tp.so %endif @@ -841,7 +850,7 @@ fi %files -n librbd1 %defattr(-,root,root,-) %{_libdir}/librbd.so.* -%if 0%{?_with_lttng} 
+%if %{with lttng} %{_libdir}/librbd_tp.so.* %endif @@ -861,7 +870,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %{_includedir}/rbd/librbd.hpp %{_includedir}/rbd/features.h %{_libdir}/librbd.so -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librbd_tp.so %endif diff --git a/ceph/ceph.spec.in b/ceph/ceph.spec.in index 3f9a126c..befd9114 100644 --- a/ceph/ceph.spec.in +++ b/ceph/ceph.spec.in @@ -1,6 +1,11 @@ %bcond_with ocf %bcond_without cephfs_java +# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12 +%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315 +%bcond_without lttng +%endif + %if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600)) %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")} %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")} @@ -8,11 +13,6 @@ %{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d} -# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12 -%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315 -%global _with_lttng 1 -%endif - Name: ceph Version: @VERSION@ Release: @RPM_RELEASE@%{?dist} @@ -103,7 +103,10 @@ BuildRequires: %insserv_prereq BuildRequires: mozilla-nss-devel BuildRequires: keyutils-devel BuildRequires: libatomic-ops-devel -%else +Requires: lsb-release +BuildRequires: lsb-release +%endif +%if 0%{?fedora} || 0%{?rhel} Requires: gdisk BuildRequires: nss-devel BuildRequires: keyutils-libs-devel @@ -114,9 +117,11 @@ Requires(preun):chkconfig Requires(preun):initscripts BuildRequires: gperftools-devel Requires: python-flask +Requires: redhat-lsb-core +BuildRequires: redhat-lsb-core %endif # lttng and babeltrace for rbd-replay-prep -%if 0%{?_with_lttng} +%if %{with lttng} %if 0%{?fedora} || 0%{?rhel} BuildRequires: lttng-ust-devel BuildRequires: libbabeltrace-devel @@ -461,7 +466,7 @@ done %endif 
./autogen.sh -MY_CONF_OPT="" +MY_CONF_OPT="$CEPH_EXTRA_CONFIGURE_ARGS" MY_CONF_OPT="$MY_CONF_OPT --with-radosgw" @@ -481,6 +486,10 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'` %endif --with-librocksdb-static=check \ $MY_CONF_OPT \ +%if %{without lttng} + --without-lttng \ + --without-babeltrace \ +%endif %{?_with_ocf} \ CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS" @@ -627,7 +636,7 @@ fi %{_libdir}/rados-classes/libcls_version.so* %dir %{_libdir}/ceph/erasure-code %{_libdir}/ceph/erasure-code/libec_*.so* -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/libos_tp.so* %{_libdir}/libosd_tp.so* %endif @@ -680,7 +689,7 @@ fi %{_bindir}/rbd %{_bindir}/rbd-replay %{_bindir}/rbd-replay-many -%if 0%{?_with_lttng} +%if %{with lttng} %{_bindir}/rbd-replay-prep %endif %{_bindir}/ceph-post-file @@ -786,7 +795,7 @@ fi %files -n librados2 %defattr(-,root,root,-) %{_libdir}/librados.so.* -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librados_tp.so.* %endif @@ -809,7 +818,7 @@ fi %{_includedir}/rados/rados_types.hpp %{_includedir}/rados/memory.h %{_libdir}/librados.so -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librados_tp.so %endif @@ -841,7 +850,7 @@ fi %files -n librbd1 %defattr(-,root,root,-) %{_libdir}/librbd.so.* -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librbd_tp.so.* %endif @@ -861,7 +870,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %{_includedir}/rbd/librbd.hpp %{_includedir}/rbd/features.h %{_libdir}/librbd.so -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librbd_tp.so %endif diff --git a/ceph/configure b/ceph/configure index c2278d73..2de57c21 100755 --- a/ceph/configure +++ b/ceph/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for ceph 0.94.6. +# Generated by GNU Autoconf 2.69 for ceph 0.94.9. # # Report bugs to . # @@ -590,8 +590,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='ceph' PACKAGE_TARNAME='ceph' -PACKAGE_VERSION='0.94.6' -PACKAGE_STRING='ceph 0.94.6' +PACKAGE_VERSION='0.94.9' +PACKAGE_STRING='ceph 0.94.9' PACKAGE_BUGREPORT='ceph-devel@vger.kernel.org' PACKAGE_URL='' @@ -1542,7 +1542,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures ceph 0.94.6 to adapt to many kinds of systems. +\`configure' configures ceph 0.94.9 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1613,7 +1613,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of ceph 0.94.6:";; + short | recursive ) echo "Configuration of ceph 0.94.9:";; esac cat <<\_ACEOF @@ -1786,7 +1786,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -ceph configure 0.94.6 +ceph configure 0.94.9 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2862,7 +2862,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by ceph $as_me 0.94.6, which was +It was created by ceph $as_me 0.94.9, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -4979,7 +4979,7 @@ fi # Define the identity of the package. PACKAGE='ceph' - VERSION='0.94.6' + VERSION='0.94.9' cat >>confdefs.h <<_ACEOF @@ -12883,7 +12883,7 @@ fi # Define the identity of the package. 
PACKAGE='ceph' - VERSION='0.94.6' + VERSION='0.94.9' cat >>confdefs.h <<_ACEOF @@ -13153,6 +13153,7 @@ $as_echo "#define DARWIN 1" >>confdefs.h ;; linux*) linux="yes" + CFLAGS="-D_LARGEFILE64_SOURCE ${CFLAGS}" ;; freebsd*) freebsd="yes" @@ -20028,7 +20029,7 @@ else JAVA_TEST=Test.java CLASS_TEST=Test.class cat << \EOF > $JAVA_TEST -/* #line 20031 "configure" */ +/* #line 20032 "configure" */ public class Test { } EOF @@ -22744,6 +22745,19 @@ $as_echo "#define CEPH_HAVE_FALLOCATE /**/" >>confdefs.h fi +# getgrouplist +for ac_func in getgrouplist +do : + ac_fn_c_check_func "$LINENO" "getgrouplist" "ac_cv_func_getgrouplist" +if test "x$ac_cv_func_getgrouplist" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_GETGROUPLIST 1 +_ACEOF + +fi +done + + # # Test for time-related `struct stat` members. # @@ -24700,7 +24714,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by ceph $as_me 0.94.6, which was +This file was extended by ceph $as_me 0.94.9, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -24766,7 +24780,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -ceph config.status 0.94.6 +ceph config.status 0.94.9 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/ceph/configure.ac b/ceph/configure.ac index 67ed08db..65939b74 100644 --- a/ceph/configure.ac +++ b/ceph/configure.ac @@ -8,7 +8,7 @@ AC_PREREQ(2.59) # VERSION define is not used by the code. It gets a version string # from 'git describe'; see src/ceph_ver.[ch] -AC_INIT([ceph], [0.94.6], [ceph-devel@vger.kernel.org]) +AC_INIT([ceph], [0.94.9], [ceph-devel@vger.kernel.org]) # Create release string. Used with VERSION for RPMs. 
RPM_RELEASE=0 @@ -51,6 +51,7 @@ darwin*) ;; linux*) linux="yes" + CFLAGS="-D_LARGEFILE64_SOURCE ${CFLAGS}" ;; freebsd*) freebsd="yes" @@ -932,6 +933,9 @@ AC_CHECK_FUNC([fallocate], [AC_DEFINE([CEPH_HAVE_FALLOCATE], [], [fallocate(2) is supported])], []) +# getgrouplist +AC_CHECK_FUNCS([getgrouplist]) + # # Test for time-related `struct stat` members. # diff --git a/ceph/man/ceph-disk.8 b/ceph/man/ceph-disk.8 index ac87bdb1..4256eac1 100644 --- a/ceph/man/ceph-disk.8 +++ b/ceph/man/ceph-disk.8 @@ -190,7 +190,7 @@ ceph\-disk activate\-journal [\-\-activate\-key PATH] [\-\-mark\-init INITSYSTEM .SS activate\-all .sp Activate all tagged OSD partitions. \fBactivate\-all\fP relies on -\fB/dev/disk/by\-parttype\-uuid/$typeuuid.$uuid\fP to find all partitions. Special +\fB/dev/disk/by\-parttypeuuid/$typeuuid.$uuid\fP to find all partitions. Special \fBudev\fP rules are installed to create these links. It is triggered on ceph service start or run directly. .sp diff --git a/ceph/src/.git_version b/ceph/src/.git_version index 20215e8e..e8cbc574 100644 --- a/ceph/src/.git_version +++ b/ceph/src/.git_version @@ -1,2 +1,2 @@ -e832001feaf8c176593e0325c8298e3f16dfb403 -v0.94.6 +fe6d859066244b97b24f09d46552afc2071e6f90 +v0.94.9 diff --git a/ceph/src/Makefile.in b/ceph/src/Makefile.in index ff34f7fa..5e7bdb06 100644 --- a/ceph/src/Makefile.in +++ b/ceph/src/Makefile.in @@ -380,7 +380,8 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \ @ENABLE_CLIENT_TRUE@ client/SyntheticClient.h \ @ENABLE_CLIENT_TRUE@ client/Trace.h \ @ENABLE_CLIENT_TRUE@ client/ioctl.h \ -@ENABLE_CLIENT_TRUE@ client/ObjecterWriteback.h +@ENABLE_CLIENT_TRUE@ client/ObjecterWriteback.h \ +@ENABLE_CLIENT_TRUE@ client/InodeRef.h @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@am__append_61 = libclient_fuse.la @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@am__append_62 = client/fuse_ll.h @@ -6002,8 +6003,8 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \ client/Dir.h client/Fh.h client/Inode.h 
client/MetaRequest.h \ client/MetaSession.h client/ClientSnapRealm.h \ client/SyntheticClient.h client/Trace.h client/ioctl.h \ - client/ObjecterWriteback.h client/fuse_ll.h global/pidfile.h \ - global/global_init.h global/global_context.h \ + client/ObjecterWriteback.h client/InodeRef.h client/fuse_ll.h \ + global/pidfile.h global/global_init.h global/global_context.h \ global/signal_handler.h json_spirit/json_spirit.h \ json_spirit/json_spirit_error_position.h \ json_spirit/json_spirit_reader.h \ @@ -7027,10 +7028,10 @@ check_SCRIPTS = $(am__append_117) test/ceph_objectstore_tool.py \ test/mon/osd-pool-create.sh test/mon/misc.sh \ test/mon/osd-crush.sh test/mon/osd-erasure-code-profile.sh \ test/mon/mkfs.sh test/osd/osd-scrub-repair.sh \ - test/osd/osd-config.sh test/osd/osd-bench.sh \ - test/osd/osd-copy-from.sh test/mon/mon-handle-forward.sh \ - $(am__append_181) $(am__append_182) \ - test/pybind/test_ceph_argparse.py + test/osd/osd-scrub-snaps.sh test/osd/osd-config.sh \ + test/osd/osd-bench.sh test/osd/osd-copy-from.sh \ + test/mon/mon-handle-forward.sh $(am__append_181) \ + $(am__append_182) test/pybind/test_ceph_argparse.py ################################## AM_COMMON_CPPFLAGS = \ @@ -22239,6 +22240,13 @@ test/osd/osd-scrub-repair.sh.log: test/osd/osd-scrub-repair.sh --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) +test/osd/osd-scrub-snaps.sh.log: test/osd/osd-scrub-snaps.sh + @p='test/osd/osd-scrub-snaps.sh'; \ + b='test/osd/osd-scrub-snaps.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) test/osd/osd-config.sh.log: test/osd/osd-config.sh @p='test/osd/osd-config.sh'; \ b='test/osd/osd-config.sh'; \ diff --git a/ceph/src/acconfig.h.in b/ceph/src/acconfig.h.in index 
2e8dbfd5..989f9c56 100644 --- a/ceph/src/acconfig.h.in +++ b/ceph/src/acconfig.h.in @@ -68,6 +68,9 @@ /* Define to 1 if you have the `fuse_getgroups' function. */ #undef HAVE_FUSE_GETGROUPS +/* Define to 1 if you have the `getgrouplist' function. */ +#undef HAVE_GETGROUPLIST + /* we have a recent yasm and are x86_64 */ #undef HAVE_GOOD_YASM_ELF64 diff --git a/ceph/src/auth/Crypto.cc b/ceph/src/auth/Crypto.cc index e401c960..7d7e4d50 100644 --- a/ceph/src/auth/Crypto.cc +++ b/ceph/src/auth/Crypto.cc @@ -62,160 +62,272 @@ uint64_t get_random(uint64_t min_val, uint64_t max_val) return r; } + // --------------------------------------------------- -int CryptoNone::create(bufferptr& secret) -{ - return 0; -} +class CryptoNoneKeyHandler : public CryptoKeyHandler { +public: + int encrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + out = in; + return 0; + } + int decrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + out = in; + return 0; + } +}; + +class CryptoNone : public CryptoHandler { +public: + CryptoNone() { } + ~CryptoNone() {} + int get_type() const { + return CEPH_CRYPTO_NONE; + } + int create(bufferptr& secret) { + return 0; + } + int validate_secret(const bufferptr& secret) { + return 0; + } + CryptoKeyHandler *get_key_handler(const bufferptr& secret, string& error) { + return new CryptoNoneKeyHandler; + } +}; -int CryptoNone::validate_secret(bufferptr& secret) -{ - return 0; -} -void CryptoNone::encrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const -{ - out = in; -} +// --------------------------------------------------- -void CryptoNone::decrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const -{ - out = in; -} +class CryptoAES : public CryptoHandler { +public: + CryptoAES() { } + ~CryptoAES() {} + int get_type() const { + return CEPH_CRYPTO_AES; + } + int create(bufferptr& secret); + int validate_secret(const bufferptr& 
secret); + CryptoKeyHandler *get_key_handler(const bufferptr& secret, string& error); +}; -// --------------------------------------------------- #ifdef USE_CRYPTOPP # define AES_KEY_LEN ((size_t)CryptoPP::AES::DEFAULT_KEYLENGTH) # define AES_BLOCK_LEN ((size_t)CryptoPP::AES::BLOCKSIZE) -#elif USE_NSS -// when we say AES, we mean AES-128 -# define AES_KEY_LEN 16 -# define AES_BLOCK_LEN 16 -static void nss_aes_operation(CK_ATTRIBUTE_TYPE op, const bufferptr& secret, - const bufferlist& in, bufferlist& out, std::string &error) -{ - const CK_MECHANISM_TYPE mechanism = CKM_AES_CBC_PAD; +class CryptoAESKeyHandler : public CryptoKeyHandler { +public: + CryptoPP::AES::Encryption *enc_key; + CryptoPP::AES::Decryption *dec_key; + + CryptoAESKeyHandler() + : enc_key(NULL), + dec_key(NULL) {} + ~CryptoAESKeyHandler() { + delete enc_key; + delete dec_key; + } - // sample source said this has to be at least size of input + 8, - // but i see 15 still fail with SEC_ERROR_OUTPUT_LEN - bufferptr out_tmp(in.length()+16); + int init(const bufferptr& s, ostringstream& err) { + secret = s; - PK11SlotInfo *slot; + enc_key = new CryptoPP::AES::Encryption( + (byte*)secret.c_str(), CryptoPP::AES::DEFAULT_KEYLENGTH); + dec_key = new CryptoPP::AES::Decryption( + (byte*)secret.c_str(), CryptoPP::AES::DEFAULT_KEYLENGTH); - slot = PK11_GetBestSlot(mechanism, NULL); - if (!slot) { - ostringstream oss; - oss << "cannot find NSS slot to use: " << PR_GetError(); - error = oss.str(); - goto err; + return 0; } - SECItem keyItem; - - keyItem.type = siBuffer; - keyItem.data = (unsigned char*)secret.c_str(); - keyItem.len = secret.length(); - - PK11SymKey *key; + int encrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + string ciphertext; + CryptoPP::StringSink *sink = new CryptoPP::StringSink(ciphertext); + CryptoPP::CBC_Mode_ExternalCipher::Encryption cbc( + *enc_key, (const byte*)CEPH_AES_IV); + CryptoPP::StreamTransformationFilter stfEncryptor(cbc, sink); - key = 
PK11_ImportSymKey(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT, - &keyItem, NULL); - if (!key) { - ostringstream oss; - oss << "cannot convert AES key for NSS: " << PR_GetError(); - error = oss.str(); - goto err_slot; + for (std::list::const_iterator it = in.buffers().begin(); + it != in.buffers().end(); ++it) { + const unsigned char *in_buf = (const unsigned char *)it->c_str(); + stfEncryptor.Put(in_buf, it->length()); + } + try { + stfEncryptor.MessageEnd(); + } catch (CryptoPP::Exception& e) { + if (error) { + ostringstream oss; + oss << "encryptor.MessageEnd::Exception: " << e.GetWhat(); + *error = oss.str(); + } + return -1; + } + out.append((const char *)ciphertext.c_str(), ciphertext.length()); + return 0; } - SECItem ivItem; - - ivItem.type = siBuffer; - // losing constness due to SECItem.data; IV should never be - // modified, regardless - ivItem.data = (unsigned char*)CEPH_AES_IV; - ivItem.len = sizeof(CEPH_AES_IV); + int decrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + string decryptedtext; + CryptoPP::StringSink *sink = new CryptoPP::StringSink(decryptedtext); + CryptoPP::CBC_Mode_ExternalCipher::Decryption cbc( + *dec_key, (const byte*)CEPH_AES_IV ); + CryptoPP::StreamTransformationFilter stfDecryptor(cbc, sink); + for (std::list::const_iterator it = in.buffers().begin(); + it != in.buffers().end(); ++it) { + const unsigned char *in_buf = (const unsigned char *)it->c_str(); + stfDecryptor.Put(in_buf, it->length()); + } - SECItem *param; + try { + stfDecryptor.MessageEnd(); + } catch (CryptoPP::Exception& e) { + if (error) { + ostringstream oss; + oss << "decryptor.MessageEnd::Exception: " << e.GetWhat(); + *error = oss.str(); + } + return -1; + } - param = PK11_ParamFromIV(mechanism, &ivItem); - if (!param) { - ostringstream oss; - oss << "cannot set NSS IV param: " << PR_GetError(); - error = oss.str(); - goto err_key; + out.append((const char *)decryptedtext.c_str(), decryptedtext.length()); + return 0; } +}; - 
PK11Context *ctx; +#elif USE_NSS +// when we say AES, we mean AES-128 +# define AES_KEY_LEN 16 +# define AES_BLOCK_LEN 16 - ctx = PK11_CreateContextBySymKey(mechanism, op, key, param); - if (!ctx) { - ostringstream oss; - oss << "cannot create NSS context: " << PR_GetError(); - error = oss.str(); - goto err_param; - } +static int nss_aes_operation(CK_ATTRIBUTE_TYPE op, + CK_MECHANISM_TYPE mechanism, + PK11SymKey *key, + SECItem *param, + const bufferlist& in, bufferlist& out, + std::string *error) +{ + // sample source said this has to be at least size of input + 8, + // but i see 15 still fail with SEC_ERROR_OUTPUT_LEN + bufferptr out_tmp(in.length()+16); + bufferlist incopy; SECStatus ret; int written; - // in is const, and PK11_CipherOp is not; C++ makes this hard to cheat, - // so just copy it to a temp buffer, at least for now - unsigned in_len; unsigned char *in_buf; - in_len = in.length(); - in_buf = (unsigned char*)malloc(in_len); - if (!in_buf) - throw std::bad_alloc(); - in.copy(0, in_len, (char*)in_buf); - ret = PK11_CipherOp(ctx, (unsigned char*)out_tmp.c_str(), &written, out_tmp.length(), + + PK11Context *ectx; + ectx = PK11_CreateContextBySymKey(mechanism, op, key, param); + assert(ectx); + + incopy = in; // it's a shallow copy! 
+ in_buf = (unsigned char*)incopy.c_str(); + ret = PK11_CipherOp(ectx, + (unsigned char*)out_tmp.c_str(), &written, out_tmp.length(), in_buf, in.length()); - free(in_buf); if (ret != SECSuccess) { - ostringstream oss; - oss << "NSS AES failed: " << PR_GetError(); - error = oss.str(); - goto err_op; + PK11_DestroyContext(ectx, PR_TRUE); + if (error) { + ostringstream oss; + oss << "NSS AES failed: " << PR_GetError(); + *error = oss.str(); + } + return -1; } unsigned int written2; - ret = PK11_DigestFinal(ctx, (unsigned char*)out_tmp.c_str()+written, &written2, + ret = PK11_DigestFinal(ectx, + (unsigned char*)out_tmp.c_str()+written, &written2, out_tmp.length()-written); + PK11_DestroyContext(ectx, PR_TRUE); if (ret != SECSuccess) { - ostringstream oss; - oss << "NSS AES final round failed: " << PR_GetError(); - error = oss.str(); - goto err_op; + if (error) { + ostringstream oss; + oss << "NSS AES final round failed: " << PR_GetError(); + *error = oss.str(); + } + return -1; } out_tmp.set_length(written + written2); out.append(out_tmp); - - PK11_DestroyContext(ctx, PR_TRUE); - SECITEM_FreeItem(param, PR_TRUE); - PK11_FreeSymKey(key); - PK11_FreeSlot(slot); - return; - - err_op: - PK11_DestroyContext(ctx, PR_TRUE); - err_param: - SECITEM_FreeItem(param, PR_TRUE); - err_key: - PK11_FreeSymKey(key); - err_slot: - PK11_FreeSlot(slot); - err: - ; + return 0; } +class CryptoAESKeyHandler : public CryptoKeyHandler { + CK_MECHANISM_TYPE mechanism; + PK11SlotInfo *slot; + PK11SymKey *key; + SECItem *param; + +public: + CryptoAESKeyHandler() + : mechanism(CKM_AES_CBC_PAD), + slot(NULL), + key(NULL), + param(NULL) {} + ~CryptoAESKeyHandler() { + SECITEM_FreeItem(param, PR_TRUE); + PK11_FreeSymKey(key); + PK11_FreeSlot(slot); + } + + int init(const bufferptr& s, ostringstream& err) { + secret = s; + + slot = PK11_GetBestSlot(mechanism, NULL); + if (!slot) { + err << "cannot find NSS slot to use: " << PR_GetError(); + return -1; + } + + SECItem keyItem; + keyItem.type = 
siBuffer; + keyItem.data = (unsigned char*)secret.c_str(); + keyItem.len = secret.length(); + key = PK11_ImportSymKey(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT, + &keyItem, NULL); + if (!key) { + err << "cannot convert AES key for NSS: " << PR_GetError(); + return -1; + } + + SECItem ivItem; + ivItem.type = siBuffer; + // losing constness due to SECItem.data; IV should never be + // modified, regardless + ivItem.data = (unsigned char*)CEPH_AES_IV; + ivItem.len = sizeof(CEPH_AES_IV); + + param = PK11_ParamFromIV(mechanism, &ivItem); + if (!param) { + err << "cannot set NSS IV param: " << PR_GetError(); + return -1; + } + + return 0; + } + + int encrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + return nss_aes_operation(CKA_ENCRYPT, mechanism, key, param, in, out, error); + } + int decrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + return nss_aes_operation(CKA_DECRYPT, mechanism, key, param, in, out, error); + } +}; + #else # error "No supported crypto implementation found." 
#endif + + +// ------------------------------------------------------------ + int CryptoAES::create(bufferptr& secret) { bufferlist bl; @@ -226,7 +338,7 @@ int CryptoAES::create(bufferptr& secret) return 0; } -int CryptoAES::validate_secret(bufferptr& secret) +int CryptoAES::validate_secret(const bufferptr& secret) { if (secret.length() < (size_t)AES_KEY_LEN) { return -EINVAL; @@ -235,140 +347,105 @@ int CryptoAES::validate_secret(bufferptr& secret) return 0; } -void CryptoAES::encrypt(const bufferptr& secret, const bufferlist& in, bufferlist& out, - std::string &error) const +CryptoKeyHandler *CryptoAES::get_key_handler(const bufferptr& secret, + string& error) { - if (secret.length() < AES_KEY_LEN) { - error = "key is too short"; - return; - } -#ifdef USE_CRYPTOPP - { - const unsigned char *key = (const unsigned char *)secret.c_str(); - - string ciphertext; - CryptoPP::AES::Encryption aesEncryption(key, CryptoPP::AES::DEFAULT_KEYLENGTH); - CryptoPP::CBC_Mode_ExternalCipher::Encryption cbcEncryption( aesEncryption, (const byte*)CEPH_AES_IV ); - CryptoPP::StringSink *sink = new CryptoPP::StringSink(ciphertext); - CryptoPP::StreamTransformationFilter stfEncryptor(cbcEncryption, sink); - - for (std::list::const_iterator it = in.buffers().begin(); - it != in.buffers().end(); ++it) { - const unsigned char *in_buf = (const unsigned char *)it->c_str(); - stfEncryptor.Put(in_buf, it->length()); - } - try { - stfEncryptor.MessageEnd(); - } catch (CryptoPP::Exception& e) { - ostringstream oss; - oss << "encryptor.MessageEnd::Exception: " << e.GetWhat(); - error = oss.str(); - return; - } - out.append((const char *)ciphertext.c_str(), ciphertext.length()); + CryptoAESKeyHandler *ckh = new CryptoAESKeyHandler; + ostringstream oss; + if (ckh->init(secret, oss) < 0) { + error = oss.str(); + return NULL; } -#elif USE_NSS - nss_aes_operation(CKA_ENCRYPT, secret, in, out, error); -#else -# error "No supported crypto implementation found." 
-#endif + return ckh; } -void CryptoAES::decrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const -{ -#ifdef USE_CRYPTOPP - const unsigned char *key = (const unsigned char *)secret.c_str(); - CryptoPP::AES::Decryption aesDecryption(key, CryptoPP::AES::DEFAULT_KEYLENGTH); - CryptoPP::CBC_Mode_ExternalCipher::Decryption cbcDecryption( aesDecryption, (const byte*)CEPH_AES_IV ); - string decryptedtext; - CryptoPP::StringSink *sink = new CryptoPP::StringSink(decryptedtext); - CryptoPP::StreamTransformationFilter stfDecryptor(cbcDecryption, sink); - for (std::list::const_iterator it = in.buffers().begin(); - it != in.buffers().end(); ++it) { - const unsigned char *in_buf = (const unsigned char *)it->c_str(); - stfDecryptor.Put(in_buf, it->length()); - } - try { - stfDecryptor.MessageEnd(); - } catch (CryptoPP::Exception& e) { - ostringstream oss; - oss << "decryptor.MessageEnd::Exception: " << e.GetWhat(); - error = oss.str(); - return; - } - - out.append((const char *)decryptedtext.c_str(), decryptedtext.length()); -#elif USE_NSS - nss_aes_operation(CKA_DECRYPT, secret, in, out, error); -#else -# error "No supported crypto implementation found." 
-#endif -} +// -- // --------------------------------------------------- -int CryptoKey::set_secret(CephContext *cct, int type, bufferptr& s) -{ - this->type = type; - created = ceph_clock_now(cct); - CryptoHandler *h = cct->get_crypto_handler(type); - if (!h) { - lderr(cct) << "ERROR: cct->get_crypto_handler(type=" << type << ") returned NULL" << dendl; - return -EOPNOTSUPP; - } - int ret = h->validate_secret(s); - - if (ret < 0) - return ret; +void CryptoKey::encode(bufferlist& bl) const +{ + ::encode(type, bl); + ::encode(created, bl); + __u16 len = secret.length(); + ::encode(len, bl); + bl.append(secret); +} - secret = s; +void CryptoKey::decode(bufferlist::iterator& bl) +{ + ::decode(type, bl); + ::decode(created, bl); + __u16 len; + ::decode(len, bl); + bufferptr tmp; + bl.copy(len, tmp); + if (_set_secret(type, tmp) < 0) + throw buffer::malformed_input("malformed secret"); +} +int CryptoKey::set_secret(int type, const bufferptr& s, utime_t c) +{ + int r = _set_secret(type, s); + if (r < 0) + return r; + this->created = c; return 0; } -int CryptoKey::create(CephContext *cct, int t) +int CryptoKey::_set_secret(int t, const bufferptr& s) { - type = t; - created = ceph_clock_now(cct); - - CryptoHandler *h = cct->get_crypto_handler(type); - if (!h) { - lderr(cct) << "ERROR: cct->get_crypto_handler(type=" << type << ") returned NULL" << dendl; - return -EOPNOTSUPP; + if (s.length() == 0) { + secret = s; + ckh.reset(); + return 0; } - return h->create(secret); -} -void CryptoKey::encrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const -{ - if (!ch || ch->get_type() != type) { - ch = cct->get_crypto_handler(type); - if (!ch) { - ostringstream oss; - oss << "CryptoKey::encrypt: key type " << type << " not supported."; - return; + CryptoHandler *ch = CryptoHandler::create(t); + if (ch) { + int ret = ch->validate_secret(s); + if (ret < 0) { + delete ch; + return ret; + } + string error; + ckh.reset(ch->get_key_handler(s, error)); + 
delete ch; + if (error.length()) { + return -EIO; } + } else { + return -EOPNOTSUPP; } - ch->encrypt(this->secret, in, out, error); + type = t; + secret = s; + return 0; } -void CryptoKey::decrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const +int CryptoKey::create(CephContext *cct, int t) { - if (!ch || ch->get_type() != type) { - ch = cct->get_crypto_handler(type); - if (!ch) { - ostringstream oss; - oss << "CryptoKey::decrypt: key type " << type << " not supported."; - return; - } + CryptoHandler *ch = CryptoHandler::create(t); + if (!ch) { + if (cct) + lderr(cct) << "ERROR: cct->get_crypto_handler(type=" << t << ") returned NULL" << dendl; + return -EOPNOTSUPP; } - ch->decrypt(this->secret, in, out, error); + bufferptr s; + int r = ch->create(s); + delete ch; + if (r < 0) + return r; + + r = _set_secret(t, s); + if (r < 0) + return r; + created = ceph_clock_now(cct); + return r; } void CryptoKey::print(std::ostream &out) const @@ -396,3 +473,18 @@ void CryptoKey::encode_plaintext(bufferlist &bl) { bl.append(encode_base64()); } + + +// ------------------ + +CryptoHandler *CryptoHandler::create(int type) +{ + switch (type) { + case CEPH_CRYPTO_NONE: + return new CryptoNone; + case CEPH_CRYPTO_AES: + return new CryptoAES; + default: + return NULL; + } +} diff --git a/ceph/src/auth/Crypto.h b/ceph/src/auth/Crypto.h index c8112220..3bfc5aab 100644 --- a/ceph/src/auth/Crypto.h +++ b/ceph/src/auth/Crypto.h @@ -17,6 +17,7 @@ #include "include/types.h" #include "include/utime.h" +#include "include/memory.h" #include "common/Formatter.h" #include "include/buffer.h" @@ -25,6 +26,22 @@ class CephContext; class CryptoHandler; +class CryptoKeyContext; + +/* + * some per-key context that is specific to a particular crypto backend + */ +class CryptoKeyHandler { +public: + bufferptr secret; + + virtual ~CryptoKeyHandler() {} + + virtual int encrypt(const bufferlist& in, + bufferlist& out, std::string *error) const = 0; + virtual int 
decrypt(const bufferlist& in, + bufferlist& out, std::string *error) const = 0; +}; /* * match encoding of struct ceph_secret @@ -33,38 +50,32 @@ class CryptoKey { protected: __u16 type; utime_t created; - bufferptr secret; + bufferptr secret; // must set this via set_secret()! - // cache a pointer to the handler, so we don't have to look it up - // for each crypto operation - mutable CryptoHandler *ch; + // cache a pointer to the implementation-specific key handler, so we + // don't have to create it for every crypto operation. + mutable ceph::shared_ptr ckh; + + int _set_secret(int type, const bufferptr& s); public: - CryptoKey() : type(0), ch(NULL) { } - CryptoKey(int t, utime_t c, bufferptr& s) : type(t), created(c), secret(s), ch(NULL) { } - - void encode(bufferlist& bl) const { - ::encode(type, bl); - ::encode(created, bl); - __u16 len = secret.length(); - ::encode(len, bl); - bl.append(secret); + CryptoKey() : type(0) { } + CryptoKey(int t, utime_t c, bufferptr& s) + : created(c) { + _set_secret(t, s); } - void decode(bufferlist::iterator& bl) { - ::decode(type, bl); - ::decode(created, bl); - __u16 len; - ::decode(len, bl); - bl.copy(len, secret); - secret.c_str(); // make sure it's a single buffer! 
+ ~CryptoKey() { } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + int get_type() const { return type; } utime_t get_created() const { return created; } void print(std::ostream& out) const; - int set_secret(CephContext *cct, int type, bufferptr& s); - bufferptr& get_secret() { return secret; } + int set_secret(int type, const bufferptr& s, utime_t created); + const bufferptr& get_secret() { return secret; } const bufferptr& get_secret() const { return secret; } void encode_base64(string& s) const { @@ -94,8 +105,14 @@ public: // -- int create(CephContext *cct, int type); - void encrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const; - void decrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const; + int encrypt(CephContext *cct, const bufferlist& in, bufferlist& out, + std::string *error) const { + return ckh->encrypt(in, out, error); + } + int decrypt(CephContext *cct, const bufferlist& in, bufferlist& out, + std::string *error) const { + return ckh->decrypt(in, out, error); + } void to_str(std::string& s) const; }; @@ -119,44 +136,14 @@ public: virtual ~CryptoHandler() {} virtual int get_type() const = 0; virtual int create(bufferptr& secret) = 0; - virtual int validate_secret(bufferptr& secret) = 0; - virtual void encrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const = 0; - virtual void decrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const = 0; + virtual int validate_secret(const bufferptr& secret) = 0; + virtual CryptoKeyHandler *get_key_handler(const bufferptr& secret, + string& error) = 0; + + static CryptoHandler *create(int type); }; extern int get_random_bytes(char *buf, int len); extern uint64_t get_random(uint64_t min_val, uint64_t max_val); -class CryptoNone : public CryptoHandler { -public: - CryptoNone() { } - ~CryptoNone() {} - int get_type() const { - return 
CEPH_CRYPTO_NONE; - } - int create(bufferptr& secret); - int validate_secret(bufferptr& secret); - void encrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const; - void decrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const; -}; - -class CryptoAES : public CryptoHandler { -public: - CryptoAES() { } - ~CryptoAES() {} - int get_type() const { - return CEPH_CRYPTO_AES; - } - int create(bufferptr& secret); - int validate_secret(bufferptr& secret); - void encrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const; - void decrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const; -}; - #endif diff --git a/ceph/src/auth/cephx/CephxKeyServer.cc b/ceph/src/auth/cephx/CephxKeyServer.cc index b2c0c672..81c0a66b 100644 --- a/ceph/src/auth/cephx/CephxKeyServer.cc +++ b/ceph/src/auth/cephx/CephxKeyServer.cc @@ -268,7 +268,7 @@ bool KeyServer::generate_secret(CryptoKey& secret) if (crypto->create(bp) < 0) return false; - secret.set_secret(cct, CEPH_CRYPTO_AES, bp); + secret.set_secret(CEPH_CRYPTO_AES, bp, ceph_clock_now(NULL)); return true; } diff --git a/ceph/src/auth/cephx/CephxProtocol.h b/ceph/src/auth/cephx/CephxProtocol.h index d72a23d8..f08f07d8 100644 --- a/ceph/src/auth/cephx/CephxProtocol.h +++ b/ceph/src/auth/cephx/CephxProtocol.h @@ -433,8 +433,7 @@ void decode_decrypt_enc_bl(CephContext *cct, T& t, CryptoKey key, bufferlist& bl uint64_t magic; bufferlist bl; - key.decrypt(cct, bl_enc, bl, error); - if (!error.empty()) + if (key.decrypt(cct, bl_enc, bl, &error) < 0) return; bufferlist::iterator iter2 = bl.begin(); @@ -462,7 +461,7 @@ void encode_encrypt_enc_bl(CephContext *cct, const T& t, const CryptoKey& key, ::encode(magic, bl); ::encode(t, bl); - key.encrypt(cct, bl, out, error); + key.encrypt(cct, bl, out, &error); } template diff --git a/ceph/src/auth/cephx/CephxSessionHandler.cc 
b/ceph/src/auth/cephx/CephxSessionHandler.cc index b2d402d2..eaebd152 100644 --- a/ceph/src/auth/cephx/CephxSessionHandler.cc +++ b/ceph/src/auth/cephx/CephxSessionHandler.cc @@ -24,47 +24,65 @@ #define dout_subsys ceph_subsys_auth +int CephxSessionHandler::_calc_signature(Message *m, uint64_t *psig) +{ + const ceph_msg_header& header = m->get_header(); + const ceph_msg_footer& footer = m->get_footer(); + + // optimized signature calculation + // - avoid temporary allocated buffers from encode_encrypt[_enc_bl] + // - skip the leading 4 byte wrapper from encode_encrypt + struct { + __u8 v; + __le64 magic; + __le32 len; + __le32 header_crc; + __le32 front_crc; + __le32 middle_crc; + __le32 data_crc; + } __attribute__ ((packed)) sigblock = { + 1, AUTH_ENC_MAGIC, 4*4, + header.crc, footer.front_crc, footer.middle_crc, footer.data_crc + }; + bufferlist bl_plaintext; + bl_plaintext.append(buffer::create_static(sizeof(sigblock), (char*)&sigblock)); + + bufferlist bl_ciphertext; + if (key.encrypt(cct, bl_plaintext, bl_ciphertext, NULL) < 0) { + lderr(cct) << __func__ << " failed to encrypt signature block" << dendl; + return -1; + } + + bufferlist::iterator ci = bl_ciphertext.begin(); + ::decode(*psig, ci); + + ldout(cct, 10) << __func__ << " seq " << m->get_seq() + << " front_crc_ = " << footer.front_crc + << " middle_crc = " << footer.middle_crc + << " data_crc = " << footer.data_crc + << " sig = " << *psig + << dendl; + return 0; +} + int CephxSessionHandler::sign_message(Message *m) { // If runtime signing option is off, just return success without signing. 
if (!cct->_conf->cephx_sign_messages) { return 0; } - bufferlist bl_plaintext, bl_encrypted; - ceph_msg_header header = m->get_header(); - std::string error; - - ceph_msg_footer& en_footer = m->get_footer(); - - ::encode(header.crc, bl_plaintext); - ::encode(en_footer.front_crc, bl_plaintext); - ::encode(en_footer.middle_crc, bl_plaintext); - ::encode(en_footer.data_crc, bl_plaintext); - - ldout(cct, 10) << "sign_message: seq # " << header.seq << " CRCs are: header " << header.crc - << " front " << en_footer.front_crc << " middle " << en_footer.middle_crc - << " data " << en_footer.data_crc << dendl; - - if (encode_encrypt(cct, bl_plaintext, key, bl_encrypted, error)) { - ldout(cct, 0) << "error encrypting message signature: " << error << dendl; - ldout(cct, 0) << "no signature put on message" << dendl; - return SESSION_SIGNATURE_FAILURE; - } - - bufferlist::iterator ci = bl_encrypted.begin(); - // Skip the magic number up front. PLR - ci.advance(4); - ::decode(en_footer.sig, ci); - - // There's potentially an issue with whether the encoding and decoding done here will work - // properly when a big endian and little endian machine are talking. We think it's OK, - // but it should be tested to be sure. PLR - - // Receiver won't trust this flag to decide if msg should have been signed. It's primarily - // to debug problems where sender and receiver disagree on need to sign msg. 
PLR - en_footer.flags = (unsigned)en_footer.flags | CEPH_MSG_FOOTER_SIGNED; + + uint64_t sig; + int r = _calc_signature(m, &sig); + if (r < 0) + return r; + + ceph_msg_footer& f = m->get_footer(); + f.sig = sig; + f.flags = (unsigned)f.flags | CEPH_MSG_FOOTER_SIGNED; messages_signed++; - ldout(cct, 20) << "Putting signature in client message(seq # " << header.seq << "): sig = " << en_footer.sig << dendl; + ldout(cct, 20) << "Putting signature in client message(seq # " << m->get_seq() + << "): sig = " << sig << dendl; return 0; } @@ -74,57 +92,34 @@ int CephxSessionHandler::check_message_signature(Message *m) if (!cct->_conf->cephx_sign_messages) { return 0; } - - bufferlist bl_plaintext, bl_ciphertext; - std::string sig_error; - ceph_msg_header& header = m->get_header(); - ceph_msg_footer& footer = m->get_footer(); - if ((features & CEPH_FEATURE_MSG_AUTH) == 0) { // it's fine, we didn't negotiate this feature. return 0; } - signatures_checked++; + uint64_t sig; + int r = _calc_signature(m, &sig); + if (r < 0) + return r; - ldout(cct, 10) << "check_message_signature: seq # = " << m->get_seq() << " front_crc_ = " << footer.front_crc - << " middle_crc = " << footer.middle_crc << " data_crc = " << footer.data_crc << dendl; - ::encode(header.crc, bl_plaintext); - ::encode(footer.front_crc, bl_plaintext); - ::encode(footer.middle_crc, bl_plaintext); - ::encode(footer.data_crc, bl_plaintext); - - // Encrypt the buffer containing the checksums to calculate the signature. PLR - if (encode_encrypt(cct, bl_plaintext, key, bl_ciphertext, sig_error)) { - ldout(cct, 0) << "error in encryption for checking message signature: " << sig_error << dendl; - return (SESSION_SIGNATURE_FAILURE); - } - - bufferlist::iterator ci = bl_ciphertext.begin(); - // Skip the magic number at the front. 
PLR - ci.advance(4); - uint64_t sig_check; - ::decode(sig_check, ci); - - // There's potentially an issue with whether the encoding and decoding done here will work - // properly when a big endian and little endian machine are talking. We think it's OK, - // but it should be tested to be sure. PLR + signatures_checked++; - if (sig_check != footer.sig) { + if (sig != m->get_footer().sig) { // Should have been signed, but signature check failed. PLR - if (!(footer.flags & CEPH_MSG_FOOTER_SIGNED)) { - ldout(cct, 0) << "SIGN: MSG " << header.seq << " Sender did not set CEPH_MSG_FOOTER_SIGNED." << dendl; + if (!(m->get_footer().flags & CEPH_MSG_FOOTER_SIGNED)) { + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " Sender did not set CEPH_MSG_FOOTER_SIGNED." << dendl; } - ldout(cct, 0) << "SIGN: MSG " << header.seq << " Message signature does not match contents." << dendl; - ldout(cct, 0) << "SIGN: MSG " << header.seq << "Signature on message:" << dendl; - ldout(cct, 0) << "SIGN: MSG " << header.seq << " sig: " << footer.sig << dendl; - ldout(cct, 0) << "SIGN: MSG " << header.seq << "Locally calculated signature:" << dendl; - ldout(cct, 0) << "SIGN: MSG " << header.seq << " sig_check:" << sig_check << dendl; - - // For the moment, printing an error message to the log and returning failure is sufficient. - // In the long term, we should probably have code parsing the log looking for this kind - // of security failure, particularly when there are large numbers of them, since the latter - // is a potential sign of an attack. PLR + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " Message signature does not match contents." 
<< dendl; + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << "Signature on message:" << dendl; + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " sig: " << m->get_footer().sig << dendl; + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << "Locally calculated signature:" << dendl; + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " sig_check:" << sig << dendl; + + // For the moment, printing an error message to the log and + // returning failure is sufficient. In the long term, we should + // probably have code parsing the log looking for this kind of + // security failure, particularly when there are large numbers of + // them, since the latter is a potential sign of an attack. PLR signatures_failed++; ldout(cct, 0) << "Signature failed." << dendl; diff --git a/ceph/src/auth/cephx/CephxSessionHandler.h b/ceph/src/auth/cephx/CephxSessionHandler.h index 52a112e2..7b46e076 100644 --- a/ceph/src/auth/cephx/CephxSessionHandler.h +++ b/ceph/src/auth/cephx/CephxSessionHandler.h @@ -31,8 +31,9 @@ public: return false; } - int sign_message(Message *m); + int _calc_signature(Message *m, uint64_t *psig); + int sign_message(Message *m); int check_message_signature(Message *m) ; // Cephx does not currently encrypt messages, so just return 0 if called. PLR diff --git a/ceph/src/ceph-disk b/ceph/src/ceph-disk index a32200cd..fa8e2e1e 100755 --- a/ceph/src/ceph-disk +++ b/ceph/src/ceph-disk @@ -59,7 +59,7 @@ We rely on /dev/disk/by-partuuid to find partitions by their UUID; this is what the journal symlink inside the osd data volume normally points to. -activate-all relies on /dev/disk/by-parttype-uuid/$typeuuid.$uuid to +activate-all relies on /dev/disk/by-parttypeuuid/$typeuuid.$uuid to find all partitions. We install special udev rules to create these links. 
@@ -2440,6 +2440,28 @@ def get_partition_type(part): def get_partition_uuid(dev): + # + # blkid is prefered + # + what = 'ID_PART_ENTRY_UUID' + out, _ = command( + [ + 'blkid', + '-o', + 'udev', + '-p', + dev, + ] + ) + p = {} + for line in out.splitlines(): + (key, value) = line.split('=') + p[key] = value + if what in p: + return p[what] + # + # if blkid does not deliver, fallback to sgdisk + # (base, partnum) = split_dev_base_partnum(dev) out, _ = command(['sgdisk', '-i', partnum, base]) for line in out.splitlines(): diff --git a/ceph/src/ceph.in b/ceph/src/ceph.in index 9f857ec6..54140126 100755 --- a/ceph/src/ceph.in +++ b/ceph/src/ceph.in @@ -447,6 +447,10 @@ def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose): print >> sys.stderr, \ 'error handling command target: {0}'.format(e) return 1, '', '' + if len(cmdargs) and cmdargs[0] == 'tell': + print >> sys.stderr, \ + 'Can not use \'tell\' in interactive mode.' + continue valid_dict = validate_command(sigdict, cmdargs, verbose) if valid_dict: if parsed_args.output_format: diff --git a/ceph/src/ceph_fuse.cc b/ceph/src/ceph_fuse.cc index 4d46639b..84d4128b 100644 --- a/ceph/src/ceph_fuse.cc +++ b/ceph/src/ceph_fuse.cc @@ -31,6 +31,7 @@ using namespace std; #include "common/Timer.h" #include "common/ceph_argparse.h" #include "global/global_init.h" +#include "global/signal_handler.h" #include "common/safe_io.h" #ifndef DARWIN @@ -211,6 +212,9 @@ int main(int argc, const char **argv, const char *envp[]) { goto out_client_unmount; } + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + cerr << "ceph-fuse[" << getpid() << "]: starting fuse" << std::endl; tester.init(cfuse, client); tester.create(); @@ -249,6 +253,9 @@ int main(int argc, const char **argv, const char *envp[]) { free(newargv); delete mc; + + unregister_async_signal_handler(SIGHUP, sighup_handler); + shutdown_async_signal_handler(); //cout << "child done" << std::endl; return r; diff --git 
a/ceph/src/client/Client.cc b/ceph/src/client/Client.cc index 0d85db29..717c4d2b 100644 --- a/ceph/src/client/Client.cc +++ b/ceph/src/client/Client.cc @@ -90,6 +90,11 @@ using namespace std; #include "include/assert.h" #include "include/stat.h" +#if HAVE_GETGROUPLIST +#include +#include +#endif + #undef dout_prefix #define dout_prefix *_dout << "client." << whoami << " " @@ -148,7 +153,6 @@ dir_result_t::dir_result_t(Inode *in) : inode(in), offset(0), this_offset(2), next_offset(2), release_count(0), ordered_count(0), start_shared_gen(0), buffer(0) { - inode->get(); } // cons/des @@ -185,8 +189,6 @@ Client::Client(Messenger *m, MonClient *mc) last_tid = 0; last_flush_seq = 0; - cwd = NULL; - // root = 0; @@ -272,11 +274,8 @@ void Client::tear_down_cache() delete root; root = 0; root_ancestor = 0; - while (!root_parents.empty()) { - Inode *in = root_parents.begin()->second; + while (!root_parents.empty()) root_parents.erase(root_parents.begin()); - delete in; - } inode_map.clear(); } @@ -330,7 +329,7 @@ void Client::dump_inode(Formatter *f, Inode *in, set& did, bool disconne f->close_section(); } if (it->second->inode) - dump_inode(f, it->second->inode, did, false); + dump_inode(f, it->second->inode.get(), did, false); } } } @@ -567,11 +566,8 @@ void Client::trim_cache() delete root; root = 0; root_ancestor = 0; - while (!root_parents.empty()) { - Inode *in = root_parents.begin()->second; + while (!root_parents.empty()) root_parents.erase(root_parents.begin()); - delete in; - } inode_map.clear(); } } @@ -740,17 +736,15 @@ Inode * Client::add_update_inode(InodeStat *st, utime_t from, in = inode_map[st->vino]; ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl; } else { - in = new Inode(cct, st->vino, &st->layout); + in = new Inode(this, st->vino, &st->layout); inode_map[st->vino] = in; if (!root) { root = in; root_ancestor = in; cwd = root; - cwd->get(); } else if (!mounted) { root_parents[root_ancestor] = in; 
root_ancestor = in; - in->get(); } // immutable bits @@ -892,8 +886,8 @@ Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dl } } - if (!dn || dn->inode == 0) { - in->get(); + if (!dn || !dn->inode) { + InodeRef tmp_ref(in); if (old_dentry) { if (old_dentry->dir != dir) { old_dentry->dir->ordered_count++; @@ -911,7 +905,6 @@ Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dl dir->parent_inode->flags &= ~I_DIR_ORDERED; } dn = link(dir, dname, in, dn); - put_inode(in); } update_dentry_lease(dn, dlease, from, session); @@ -1059,8 +1052,7 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, dn->offset = dir_result_t::make_fpos(fg, i + readdir_offset); // add to cached result list - in->get(); - request->readdir_result.push_back(pair(dname, in)); + request->readdir_result.push_back(pair(dname, in)); ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl; } @@ -1245,7 +1237,7 @@ mds_rank_t Client::choose_target_mds(MetaRequest *req) } } else if (de) { if (de->inode) { - in = de->inode; + in = de->inode.get(); ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl; } else { in = de->dir->parent_inode; @@ -1261,12 +1253,12 @@ mds_rank_t Client::choose_target_mds(MetaRequest *req) ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl; while (in->snapid != CEPH_NOSNAP) { if (in->snapid == CEPH_SNAPDIR) - in = in->snapdir_parent; + in = in->snapdir_parent.get(); else if (!in->dn_set.empty()) /* In most cases there will only be one dentry, so getting it * will be the correct action. 
If there are multiple hard links, * I think the MDS should be able to redirect as needed*/ - in = in->get_first_parent()->dir->parent_inode; + in = in->get_first_parent()->dir->parent_inode; else { ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl; break; @@ -1353,7 +1345,7 @@ void Client::dump_mds_requests(Formatter *f) int Client::verify_reply_trace(int r, MetaRequest *request, MClientReply *reply, - Inode **ptarget, bool *pcreated, + InodeRef *ptarget, bool *pcreated, int uid, int gid) { // check whether this request actually did the create, and set created flag @@ -1376,17 +1368,17 @@ int Client::verify_reply_trace(int r, *pcreated = got_created_ino; if (request->target) { - *ptarget = request->target; - ldout(cct, 20) << "make_request target is " << *request->target << dendl; + ptarget->swap(request->target); + ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl; } else { if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) { (*ptarget) = p->second; - ldout(cct, 20) << "make_request created, target is " << **ptarget << dendl; + ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl; } else { // we got a traceless reply, and need to look up what we just // created. for now, do this by name. someday, do this by the // ino... which we know! FIXME. - Inode *target = 0; // ptarget may be NULL + InodeRef target; Dentry *d = request->dentry(); if (d) { if (d->dir) { @@ -1408,15 +1400,14 @@ int Client::verify_reply_trace(int r, target = in; } if (r >= 0) { - if (ptarget) - *ptarget = target; - // verify ino returned in reply and trace_dist are the same if (got_created_ino && created_ino.val != target->ino.val) { ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" 
<< dendl; r = -EINTR; } + if (ptarget) + ptarget->swap(target); } } } @@ -1447,7 +1438,7 @@ int Client::verify_reply_trace(int r, */ int Client::make_request(MetaRequest *request, int uid, int gid, - Inode **ptarget, bool *pcreated, + InodeRef *ptarget, bool *pcreated, int use_mds, bufferlist *pdirbl) { @@ -1582,15 +1573,8 @@ int Client::make_request(MetaRequest *request, void Client::put_request(MetaRequest *request) { - if (request->_put()) { - if (request->inode()) - put_inode(request->take_inode()); - if (request->old_inode()) - put_inode(request->take_old_inode()); - if (request->other_inode()) - put_inode(request->take_other_inode()); + if (request->_put()) delete request; - } } int Client::encode_inode_release(Inode *in, MetaRequest *req, @@ -2476,19 +2460,15 @@ void Client::put_inode(Inode *in, int n) bool unclean = objectcacher->release_set(&in->oset); assert(!unclean); put_qtree(in); - if (in->snapdir_parent) - put_inode(in->snapdir_parent); inode_map.erase(in->vino()); in->cap_item.remove_myself(); in->snaprealm_item.remove_myself(); + in->snapdir_parent.reset(); if (in == root) { root = 0; root_ancestor = 0; - while (!root_parents.empty()) { - Inode *in = root_parents.begin()->second; + while (!root_parents.empty()) root_parents.erase(root_parents.begin()); - put_inode(in); - } } if (!in->oset.objects.empty()) { @@ -2546,7 +2526,6 @@ Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn) if (in) { // link to inode dn->inode = in; - in->get(); if (in->is_dir()) { if (in->dir) dn->get(); // dir -> dn pin @@ -2573,13 +2552,14 @@ Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn) void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry) { - Inode *in = dn->inode; + InodeRef in; + in.swap(dn->inode); ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn << " inode " << dn->inode << dendl; // unlink from inode if (in) { - invalidate_quota_tree(in); + 
invalidate_quota_tree(in.get()); if (in->is_dir()) { if (in->dir) dn->put(); // dir -> dn pin @@ -2590,7 +2570,6 @@ void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry) assert(in->dn_set.count(dn)); in->dn_set.erase(dn); ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl; - put_inode(in); } if (keepdentry) { @@ -2611,7 +2590,6 @@ void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry) } } - /**** * caps */ @@ -2966,7 +2944,6 @@ void Client::queue_cap_snap(Inode *in, snapid_t seq) } else if (in->caps_dirty() || (used & CEPH_CAP_FILE_WR) || (dirty & CEPH_CAP_ANY_WR)) { - in->get(); CapSnap *capsnap = new CapSnap(in); in->cap_snaps[seq] = capsnap; capsnap->context = in->snaprealm->get_snap_context(); @@ -3130,13 +3107,12 @@ void Client::wake_inode_waiters(MetaSession *s) class C_Client_CacheInvalidate : public Context { private: Client *client; - Inode *inode; + InodeRef inode; int64_t offset, length; bool keep_caps; public: C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len, bool keep) : client(c), inode(in), offset(off), length(len), keep_caps(keep) { - inode->get(); } void finish(int r) { // _async_invalidate takes the lock when it needs to, call this back from outside of lock. @@ -3145,15 +3121,15 @@ public: } }; -void Client::_async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_caps) +void Client::_async_invalidate(InodeRef& in, int64_t off, int64_t len, bool keep_caps) { ldout(cct, 10) << "_async_invalidate " << off << "~" << len << (keep_caps ? " keep_caps" : "") << dendl; ino_invalidate_cb(callback_handle, in->vino(), off, len); client_lock.Lock(); if (!keep_caps) - check_caps(in, false); - put_inode(in); + check_caps(in.get(), false); + in.reset(); // put inode inside client_lock client_lock.Unlock(); ldout(cct, 10) << "_async_invalidate " << off << "~" << len << (keep_caps ? 
" keep_caps" : "") << " done" << dendl; } @@ -3482,11 +3458,13 @@ void Client::_invalidate_kernel_dcache() void Client::trim_caps(MetaSession *s, int max) { mds_rank_t mds = s->mds_num; - ldout(cct, 10) << "trim_caps mds." << mds << " max " << max << dendl; + int caps_size = s->caps.size(); + ldout(cct, 10) << "trim_caps mds." << mds << " max " << max + << " caps " << caps_size << dendl; int trimmed = 0; xlist::iterator p = s->caps.begin(); - while ((s->caps.size() - trimmed) > max && !p.end()) { + while ((caps_size - trimmed) > max && !p.end()) { Cap *cap = *p; s->s_cap_iterator = cap; Inode *in = cap->inode; @@ -3504,7 +3482,7 @@ void Client::trim_caps(MetaSession *s, int max) ldout(cct, 20) << " trying to trim dentries for " << *in << dendl; bool all = true; set::iterator q = in->dn_set.begin(); - in->get(); + InodeRef tmp_ref(in); while (q != in->dn_set.end()) { Dentry *dn = *q++; if (dn->lru_is_expireable()) { @@ -3525,8 +3503,6 @@ void Client::trim_caps(MetaSession *s, int max) ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl; trimmed++; } - - put_inode(in); } ++p; @@ -3641,10 +3617,10 @@ void Client::kick_flushing_caps(MetaSession *session) for (xlist::iterator p = session->flushing_capsnaps.begin(); !p.end(); ++p) { CapSnap *capsnap = *p; - Inode *in = capsnap->in; + InodeRef& in = capsnap->in; ldout(cct, 20) << " reflushing capsnap " << capsnap << " on " << *in << " to mds." << mds << dendl; - flush_snaps(in, false, capsnap); + flush_snaps(in.get(), false, capsnap); } for (xlist::iterator p = session->flushing_caps.begin(); !p.end(); ++p) { Inode *in = *p; @@ -4165,10 +4141,9 @@ void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCa } else { ldout(cct, 5) << "handle_cap_flushedsnap mds." 
<< mds << " flushed snap follows " << follows << " on " << *in << dendl; + in->cap_snaps.erase(follows); capsnap->flushing_item.remove_myself(); delete capsnap; - in->cap_snaps.erase(follows); - put_inode(in); } } else { ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows @@ -4249,13 +4224,10 @@ void Client::_try_to_trim_inode(Inode *in) class C_Client_FlushComplete : public Context { private: Client *client; - Inode *inode; + InodeRef inode; public: - C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) - { - inode->get(); - } + C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { } void finish(int r) { assert(client->client_lock.is_locked_by_me()); @@ -4266,7 +4238,6 @@ class C_Client_FlushComplete : public Context { << ": " << r << "(" << cpp_strerror(r) << ")" << dendl; inode->async_err = r; } - client->put_inode(inode); } }; @@ -4386,17 +4357,54 @@ int Client::check_permissions(Inode *in, int flags, int uid, int gid) gid_t *sgids = NULL; int sgid_count = 0; if (getgroups_cb) { - sgid_count = getgroups_cb(callback_handle, uid, &sgids); - if (sgid_count < 0) { + sgid_count = getgroups_cb(callback_handle, &sgids); + if (sgid_count > 0) { ldout(cct, 3) << "getgroups failed!" << dendl; - return sgid_count; } } +#if HAVE_GETGROUPLIST + if (sgid_count <= 0) { + // use PAM to get the group list + // initial number of group entries, defaults to posix standard of 16 + // PAM implementations may provide more than 16 groups.... 
+ sgid_count = 16; + sgids = (gid_t*)malloc(sgid_count * sizeof(gid_t)); + if (sgids == NULL) { + ldout(cct, 3) << "allocating group memory failed" << dendl; + return -EACCES; + } + struct passwd *pw; + pw = getpwuid(uid); + if (pw == NULL) { + ldout(cct, 3) << "getting user entry failed" << dendl; + return -EACCES; + } + while (1) { + if (getgrouplist(pw->pw_name, gid, sgids, &sgid_count) == -1) { + // we need to resize the group list and try again + void *_realloc = NULL; + if ((_realloc = realloc(sgids, sgid_count * sizeof(gid_t))) == NULL) { + ldout(cct, 3) << "allocating group memory failed" << dendl; + free(sgids); + return -EACCES; + } + sgids = (gid_t*)_realloc; + continue; + } + // list was successfully retrieved + break; + } + } +#endif + // check permissions before doing anything else + int ret = 0; if (uid != 0 && !in->check_mode(uid, gid, sgids, sgid_count, flags)) { - return -EACCES; + ret = -EACCES; } - return 0; + if (sgids) + free(sgids); + return ret; } vinodeno_t Client::_get_vino(Inode *in) @@ -4714,9 +4722,7 @@ void Client::unmount() timer.cancel_event(tick_event); tick_event = 0; - if (cwd) - put_inode(cwd); - cwd = NULL; + cwd.reset(); // clean up any unclosed files while (!fd_map.empty()) { @@ -4747,10 +4753,9 @@ void Client::unmount() assert(in); } if (!in->caps.empty()) { - in->get(); + InodeRef tmp_ref(in); _release(in); _flush(in, new C_Client_FlushComplete(this, in)); - put_inode(in); } } } @@ -4914,7 +4919,7 @@ void Client::renew_caps(MetaSession *session) // =============================================================== // high level (POSIXy) interface -int Client::_do_lookup(Inode *dir, const string& name, Inode **target) +int Client::_do_lookup(Inode *dir, const string& name, InodeRef *target) { int op = dir->snapid == CEPH_SNAPDIR ? 
CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; MetaRequest *req = new MetaRequest(op); @@ -4931,7 +4936,7 @@ int Client::_do_lookup(Inode *dir, const string& name, Inode **target) return r; } -int Client::_lookup(Inode *dir, const string& dname, Inode **target) +int Client::_lookup(Inode *dir, const string& dname, InodeRef *target) { int r = 0; Dentry *dn = NULL; @@ -5058,10 +5063,10 @@ int Client::get_or_create(Inode *dir, const char* name, return 0; } -int Client::path_walk(const filepath& origpath, Inode **final, bool followsym) +int Client::path_walk(const filepath& origpath, InodeRef *end, bool followsym) { filepath path = origpath; - Inode *cur; + InodeRef cur; if (origpath.absolute()) cur = root; else @@ -5077,8 +5082,8 @@ int Client::path_walk(const filepath& origpath, Inode **final, bool followsym) const string &dname = path[i]; ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl; ldout(cct, 20) << " (path is " << path << ")" << dendl; - Inode *next; - int r = _lookup(cur, dname, &next); + InodeRef next; + int r = _lookup(cur.get(), dname, &next); if (r < 0) return r; // only follow trailing symlink if followsym. 
always follow @@ -5118,13 +5123,13 @@ int Client::path_walk(const filepath& origpath, Inode **final, bool followsym) continue; } } - cur = next; + cur.swap(next); i++; } if (!cur) return -ENOENT; - if (final) - *final = cur; + if (end) + end->swap(cur); return 0; } @@ -5143,18 +5148,15 @@ int Client::link(const char *relexisting, const char *relpath) string name = path.last_dentry(); path.pop_dentry(); - Inode *in, *dir; + InodeRef in, dir; int r; r = path_walk(existing, &in); if (r < 0) goto out; - in->get(); r = path_walk(path, &dir); if (r < 0) - goto out_unlock; - r = _link(in, dir, name.c_str()); - out_unlock: - put_inode(in); + goto out; + r = _link(in.get(), dir.get(), name.c_str()); out: return r; } @@ -5168,11 +5170,11 @@ int Client::unlink(const char *relpath) filepath path(relpath); string name = path.last_dentry(); path.pop_dentry(); - Inode *dir; + InodeRef dir; int r = path_walk(path, &dir); if (r < 0) return r; - return _unlink(dir, name.c_str()); + return _unlink(dir.get(), name.c_str()); } int Client::rename(const char *relfrom, const char *relto) @@ -5189,21 +5191,16 @@ int Client::rename(const char *relfrom, const char *relto) string toname = to.last_dentry(); to.pop_dentry(); - Inode *fromdir, *todir; + InodeRef fromdir, todir; int r; r = path_walk(from, &fromdir); if (r < 0) goto out; - fromdir->get(); r = path_walk(to, &todir); if (r < 0) - goto out_unlock; - todir->get(); - r = _rename(fromdir, fromname.c_str(), todir, toname.c_str()); - put_inode(todir); - out_unlock: - put_inode(fromdir); + goto out; + r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str()); out: return r; } @@ -5221,12 +5218,12 @@ int Client::mkdir(const char *relpath, mode_t mode) filepath path(relpath); string name = path.last_dentry(); path.pop_dentry(); - Inode *dir; + InodeRef dir; int r = path_walk(path, &dir); if (r < 0) { return r; } - return _mkdir(dir, name.c_str(), mode); + return _mkdir(dir.get(), name.c_str(), mode); } int 
Client::mkdirs(const char *relpath, mode_t mode) @@ -5241,12 +5238,12 @@ int Client::mkdirs(const char *relpath, mode_t mode) filepath path(relpath); unsigned int i; int r=0; - Inode *cur = cwd; - Inode *next; + InodeRef cur, next; + cur = cwd; for (i=0; iino).get_path() << dendl; } @@ -5280,11 +5277,11 @@ int Client::rmdir(const char *relpath) filepath path(relpath); string name = path.last_dentry(); path.pop_dentry(); - Inode *dir; + InodeRef dir; int r = path_walk(path, &dir); if (r < 0) return r; - return _rmdir(dir, name.c_str()); + return _rmdir(dir.get(), name.c_str()); } int Client::mknod(const char *relpath, mode_t mode, dev_t rdev) @@ -5297,11 +5294,11 @@ int Client::mknod(const char *relpath, mode_t mode, dev_t rdev) filepath path(relpath); string name = path.last_dentry(); path.pop_dentry(); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - return _mknod(in, name.c_str(), mode, rdev); + return _mknod(in.get(), name.c_str(), mode, rdev); } // symlinks @@ -5316,11 +5313,11 @@ int Client::symlink(const char *target, const char *relpath) filepath path(relpath); string name = path.last_dentry(); path.pop_dentry(); - Inode *dir; + InodeRef dir; int r = path_walk(path, &dir); if (r < 0) return r; - return _symlink(dir, name.c_str(), target); + return _symlink(dir.get(), name.c_str(), target); } int Client::readlink(const char *relpath, char *buf, loff_t size) @@ -5330,12 +5327,12 @@ int Client::readlink(const char *relpath, char *buf, loff_t size) tout(cct) << relpath << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in, false); if (r < 0) return r; - return _readlink(in, buf, size); + return _readlink(in.get(), buf, size); } int Client::_readlink(Inode *in, char *buf, size_t size) @@ -5375,7 +5372,7 @@ int Client::_getattr(Inode *in, int mask, int uid, int gid, bool force) } int Client::_setattr(Inode *in, struct stat *attr, int mask, int uid, int gid, - Inode **inp) + InodeRef *inp) { 
int issued = in->caps_issued(); @@ -5504,11 +5501,11 @@ int Client::setattr(const char *relpath, struct stat *attr, int mask) tout(cct) << mask << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - return _setattr(in, attr, mask); + return _setattr(in, attr, mask); } int Client::fsetattr(int fd, struct stat *attr, int mask) @@ -5525,7 +5522,7 @@ int Client::fsetattr(int fd, struct stat *attr, int mask) if (f->flags & O_PATH) return -EBADF; #endif - return _setattr(f->inode, attr, mask); + return _setattr(f->inode, attr, mask); } int Client::stat(const char *relpath, struct stat *stbuf, @@ -5536,7 +5533,7 @@ int Client::stat(const char *relpath, struct stat *stbuf, tout(cct) << "stat" << std::endl; tout(cct) << relpath << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; @@ -5558,7 +5555,7 @@ int Client::lstat(const char *relpath, struct stat *stbuf, tout(cct) << "lstat" << std::endl; tout(cct) << relpath << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; // don't follow symlinks int r = path_walk(path, &in, false); if (r < 0) @@ -5627,7 +5624,7 @@ int Client::chmod(const char *relpath, mode_t mode) tout(cct) << relpath << std::endl; tout(cct) << mode << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; @@ -5661,7 +5658,7 @@ int Client::lchmod(const char *relpath, mode_t mode) tout(cct) << relpath << std::endl; tout(cct) << mode << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; // don't follow symlinks int r = path_walk(path, &in, false); if (r < 0) @@ -5679,7 +5676,7 @@ int Client::chown(const char *relpath, int uid, int gid) tout(cct) << uid << std::endl; tout(cct) << gid << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; @@ -5723,7 +5720,7 @@ int Client::lchown(const char 
*relpath, int uid, int gid) tout(cct) << uid << std::endl; tout(cct) << gid << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; // don't follow symlinks int r = path_walk(path, &in, false); if (r < 0) @@ -5745,7 +5742,7 @@ int Client::utime(const char *relpath, struct utimbuf *buf) tout(cct) << buf->modtime << std::endl; tout(cct) << buf->actime << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; @@ -5765,7 +5762,7 @@ int Client::lutime(const char *relpath, struct utimbuf *buf) tout(cct) << buf->modtime << std::endl; tout(cct) << buf->actime << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; // don't follow symlinks int r = path_walk(path, &in, false); if (r < 0) @@ -5784,11 +5781,11 @@ int Client::opendir(const char *relpath, dir_result_t **dirpp) tout(cct) << "opendir" << std::endl; tout(cct) << relpath << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - r = _opendir(in, dirpp); + r = _opendir(in.get(), dirpp); tout(cct) << (unsigned long)*dirpp << std::endl; return r; } @@ -5826,8 +5823,7 @@ void Client::_closedir(dir_result_t *dirp) ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl; if (dirp->inode) { ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl; - put_inode(dirp->inode); - dirp->inode = 0; + dirp->inode.reset(); } _readdir_drop_dirp_buffer(dirp); delete dirp; @@ -5928,8 +5924,6 @@ void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp) { ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl; if (dirp->buffer) { - for (unsigned i = 0; i < dirp->buffer->size(); i++) - put_inode((*dirp->buffer)[i].second); delete dirp->buffer; dirp->buffer = NULL; } @@ -5951,13 +5945,13 @@ int Client::_readdir_get_frag(dir_result_t *dirp) if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR) op = CEPH_MDS_OP_LSSNAP; - Inode *diri = dirp->inode; + InodeRef& diri = dirp->inode; 
MetaRequest *req = new MetaRequest(op); filepath path; diri->make_nosnap_relative_path(path); req->set_filepath(path); - req->set_inode(diri); + req->set_inode(diri.get()); req->head.args.readdir.frag = fg; if (dirp->last_name.length()) { req->path2.set_path(dirp->last_name.c_str()); @@ -5982,7 +5976,7 @@ int Client::_readdir_get_frag(dir_result_t *dirp) _readdir_drop_dirp_buffer(dirp); - dirp->buffer = new vector >; + dirp->buffer = new vector >; dirp->buffer->swap(req->readdir_result); if (fg != req->readdir_reply_frag) { @@ -6061,7 +6055,7 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p) struct stat st; struct dirent de; - int stmask = fill_stat(dn->inode, &st); + int stmask = fill_stat(dn->inode, &st); fill_dirent(&de, dn->name.c_str(), st.st_mode, st.st_ino, dirp->offset + 1); uint64_t next_off = dn->offset + 1; @@ -6112,7 +6106,7 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) frag_t fg = dirp->frag(); uint32_t off = dirp->fragpos(); - Inode *diri = dirp->inode; + InodeRef& diri = dirp->inode; if (dirp->at_end()) return 0; @@ -6140,7 +6134,7 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) if (dirp->offset == 1) { ldout(cct, 15) << " including .." 
<< dendl; if (!diri->dn_set.empty()) { - Inode* in = diri->get_first_parent()->inode; + InodeRef& in = diri->get_first_parent()->inode; fill_dirent(&de, "..", S_IFDIR, in->ino, 2); fill_stat(in, &st); } else { @@ -6201,9 +6195,9 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) dirp->offset = dir_result_t::make_fpos(fg, off); while (off >= dirp->this_offset && off - dirp->this_offset < dirp->buffer->size()) { - pair& ent = (*dirp->buffer)[off - dirp->this_offset]; + pair& ent = (*dirp->buffer)[off - dirp->this_offset]; - int stmask = fill_stat(ent.second, &st); + int stmask = fill_stat(ent.second, &st); fill_dirent(&de, ent.first.c_str(), st.st_mode, st.st_ino, dirp->offset + 1); client_lock.Unlock(); @@ -6455,7 +6449,7 @@ int Client::open(const char *relpath, int flags, mode_t mode, int stripe_unit, #endif filepath path(relpath); - Inode *in; + InodeRef in; bool created = false; /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */ bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL))); @@ -6475,11 +6469,11 @@ int Client::open(const char *relpath, int flags, mode_t mode, int stripe_unit, filepath dirpath = path; string dname = dirpath.last_dentry(); dirpath.pop_dentry(); - Inode *dir; + InodeRef dir; r = path_walk(dirpath, &dir); if (r < 0) return r; - r = _create(dir, dname.c_str(), flags, mode, &in, &fh, stripe_unit, + r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit, stripe_count, object_size, data_pool, &created); } if (r < 0) @@ -6489,17 +6483,16 @@ int Client::open(const char *relpath, int flags, mode_t mode, int stripe_unit, // posix says we can only check permissions of existing files uid_t uid = geteuid(); gid_t gid = getegid(); - r = check_permissions(in, flags, uid, gid); + r = check_permissions(in.get(), flags, uid, gid); if (r < 0) goto out; } if (!fh) - r = _open(in, flags, mode, &fh); + r = _open(in.get(), flags, mode, &fh); if (r >= 0) { // allocate a integer file descriptor 
assert(fh); - assert(in); r = get_fd(); assert(fd_map.count(r) == 0); fd_map[r] = fh; @@ -6589,11 +6582,12 @@ int Client::lookup_parent(Inode *ino, Inode **parent) req->set_filepath(path); req->set_inode(ino); - int r = make_request(req, -1, -1, NULL, NULL, rand() % mdsmap->get_num_in_mds()); + InodeRef target; + int r = make_request(req, -1, -1, &target, NULL, rand() % mdsmap->get_num_in_mds()); // Give caller a reference to the parent ino if they provided a pointer. if (parent != NULL) { if (r == 0) { - *parent = req->target; + *parent = target.get(); _ll_get(*parent); ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl; } else { @@ -6637,7 +6631,6 @@ Fh *Client::_create_fh(Inode *in, int flags, int cmode) // inode assert(in); f->inode = in; - f->inode->get(); ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl; @@ -6671,7 +6664,7 @@ int Client::_release_fh(Fh *f) { //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl; //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl; - Inode *in = f->inode; + Inode *in = f->inode.get(); ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl; if (in->snapid == CEPH_NOSNAP) { @@ -6709,10 +6702,8 @@ int Client::_release_fh(Fh *f) void Client::_put_fh(Fh *f) { int left = f->put(); - if (!left) { - put_inode(f->inode); + if (!left) delete f; - } } int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid, int gid) @@ -6800,7 +6791,7 @@ loff_t Client::lseek(int fd, loff_t offset, int whence) loff_t Client::_lseek(Fh *f, loff_t offset, int whence) { - Inode *in = f->inode; + Inode *in = f->inode.get(); int r; switch (whence) { @@ -6931,7 +6922,7 @@ int Client::read(int fd, char *buf, loff_t size, loff_t offset) int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl) { const md_config_t *conf = cct->_conf; - Inode *in = f->inode; + Inode *in = f->inode.get(); //bool lazy = f->mode == 
CEPH_FILE_MODE_LAZY; @@ -7065,14 +7056,14 @@ Client::C_Readahead::~C_Readahead() { void Client::C_Readahead::finish(int r) { lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl; - client->put_cap_ref(f->inode, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE); + client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE); f->readahead.dec_pending(); } int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl) { const md_config_t *conf = cct->_conf; - Inode *in = f->inode; + Inode *in = f->inode.get(); ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl; @@ -7136,7 +7127,7 @@ int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl) int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl, bool *checkeof) { - Inode *in = f->inode; + Inode *in = f->inode.get(); uint64_t pos = off; int left = len; int read = 0; @@ -7207,11 +7198,9 @@ int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl, */ class C_Client_SyncCommit : public Context { Client *cl; - Inode *in; + InodeRef in; public: - C_Client_SyncCommit(Client *c, Inode *i) : cl(c), in(i) { - in->get(); - } + C_Client_SyncCommit(Client *c, Inode *i) : cl(c), in(i) {} void finish(int) { // Called back by Filter, then Client is responsible for taking its own lock assert(!cl->client_lock.is_locked_by_me()); @@ -7219,14 +7208,14 @@ public: } }; -void Client::sync_write_commit(Inode *in) +void Client::sync_write_commit(InodeRef& in) { Mutex::Locker l(client_lock); assert(unsafe_sync_write > 0); unsafe_sync_write--; - put_cap_ref(in, CEPH_CAP_FILE_BUFFER); + put_cap_ref(in.get(), CEPH_CAP_FILE_BUFFER); ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl; if (unsafe_sync_write == 0 && unmounting) { @@ -7234,7 +7223,7 @@ void Client::sync_write_commit(Inode *in) mount_cond.Signal(); } - put_inode(in); + in.reset(); // 
put inode inside client_lock } int Client::write(int fd, const char *buf, loff_t size, loff_t offset) @@ -7268,7 +7257,7 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf) } //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl; - Inode *in = f->inode; + Inode *in = f->inode.get(); assert(in->snapid == CEPH_NOSNAP); @@ -7466,7 +7455,7 @@ done: int Client::_flush(Fh *f) { - Inode *in = f->inode; + Inode *in = f->inode.get(); int err = in->async_err; if (err != 0) { ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = " @@ -7526,20 +7515,20 @@ int Client::fsync(int fd, bool syncdataonly) int Client::_fsync(Fh *f, bool syncdataonly) { int r = 0; - - Inode *in = f->inode; + Inode *in = f->inode.get(); ceph_tid_t wait_on_flush = 0; bool flushed_metadata = false; Mutex lock("Client::_fsync::lock"); Cond cond; bool done = false; C_SafeCond *object_cacher_completion = NULL; + InodeRef tmp_ref; ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? 
"dataonly)":"data+metadata)") << dendl; if (cct->_conf->client_oc) { object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r); - in->get(); // take a reference; C_SafeCond doesn't and _flush won't either + tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either _flush(in, object_cacher_completion); ldout(cct, 15) << "using return-valued form of _fsync" << dendl; } @@ -7564,7 +7553,6 @@ int Client::_fsync(Fh *f, bool syncdataonly) cond.Wait(lock); lock.Unlock(); client_lock.Lock(); - put_inode(in); ldout(cct, 15) << "got " << r << " from flush writeback" << dendl; } else { // FIXME: this can starve @@ -7621,15 +7609,12 @@ int Client::chdir(const char *relpath) tout(cct) << "chdir" << std::endl; tout(cct) << relpath << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - if (cwd != in) { - in->get(); - put_inode(cwd); - cwd = in; - } + if (cwd != in) + cwd.swap(in); ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl; return 0; } @@ -7639,7 +7624,7 @@ void Client::getcwd(string& dir) filepath path; ldout(cct, 10) << "getcwd " << *cwd << dendl; - Inode *in = cwd; + Inode *in = cwd.get(); while (in != root) { assert(in->dn_set.size() < 2); // dirs can't be hard-linked Dentry *dn = in->get_first_parent(); @@ -7656,7 +7641,7 @@ void Client::getcwd(string& dir) // start over path = filepath(); - in = cwd; + in = cwd.get(); continue; } path.push_front_dentry(dn->name); @@ -7877,7 +7862,7 @@ void Client::_release_filelocks(Fh *fh) if (!fh->fcntl_locks && !fh->flock_locks) return; - Inode *in = fh->inode; + Inode *in = fh->inode.get(); ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl; list > to_release; @@ -7948,7 +7933,7 @@ void Client::_update_lock_state(struct flock *fl, uint64_t owner, int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner) { - Inode *in = fh->inode; + Inode *in = fh->inode.get(); ldout(cct, 10) << "_getlk 
" << fh << " ino " << in->ino << dendl; int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner); return ret; @@ -7956,7 +7941,7 @@ int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner) int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req) { - Inode *in = fh->inode; + Inode *in = fh->inode.get(); ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl; int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner, fuse_req); ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl; @@ -7965,7 +7950,7 @@ int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fu int Client::_flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req) { - Inode *in = fh->inode; + Inode *in = fh->inode.get(); ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl; int sleep = !(cmd & LOCK_NB); @@ -8119,7 +8104,7 @@ int Client::lazyio_synchronize(int fd, loff_t offset, size_t count) Fh *f = get_filehandle(fd); if (!f) return -EBADF; - Inode *in = f->inode; + Inode *in = f->inode.get(); _fsync(f, true); _release(in); @@ -8134,22 +8119,22 @@ int Client::mksnap(const char *relpath, const char *name) { Mutex::Locker l(client_lock); filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - Inode *snapdir = open_snapdir(in); + Inode *snapdir = open_snapdir(in.get()); return _mkdir(snapdir, name, 0); } int Client::rmsnap(const char *relpath, const char *name) { Mutex::Locker l(client_lock); filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - Inode *snapdir = open_snapdir(in); + Inode *snapdir = open_snapdir(in.get()); return _rmdir(snapdir, name); } @@ -8171,7 +8156,7 @@ int Client::get_caps_issued(const char *path) { Mutex::Locker lock(client_lock); filepath p(path); - Inode *in; + InodeRef in; int r = path_walk(p, &in, true); if (r 
< 0) return r; @@ -8186,7 +8171,7 @@ Inode *Client::open_snapdir(Inode *diri) Inode *in; vinodeno_t vino(diri->ino, CEPH_SNAPDIR); if (!inode_map.count(vino)) { - in = new Inode(cct, vino, &diri->layout); + in = new Inode(this, vino, &diri->layout); in->ino = diri->ino; in->snapid = CEPH_SNAPDIR; @@ -8200,7 +8185,6 @@ Inode *Client::open_snapdir(Inode *diri) in->dirfragtree.clear(); inode_map[vino] = in; in->snapdir_parent = diri; - diri->get(); ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl; } else { in = inode_map[vino]; @@ -8218,7 +8202,7 @@ int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr, tout(cct) << name << std::endl; string dname(name); - Inode *in; + InodeRef in; int r = 0; r = _lookup(parent, dname, &in); @@ -8229,40 +8213,38 @@ int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr, assert(in); fill_stat(in, attr); - _ll_get(in); + _ll_get(in.get()); out: ldout(cct, 3) << "ll_lookup " << parent << " " << name << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; tout(cct) << attr->st_ino << std::endl; - *out = in; + *out = in.get(); return r; } -int Client::ll_walk(const char* name, Inode **i, struct stat *attr) +int Client::ll_walk(const char* name, Inode **out, struct stat *attr) { Mutex::Locker lock(client_lock); filepath fp(name, 0); - Inode *destination = NULL; + InodeRef in; int rc; ldout(cct, 3) << "ll_walk" << name << dendl; tout(cct) << "ll_walk" << std::endl; tout(cct) << name << std::endl; - rc = path_walk(fp, &destination, false); - if (rc < 0) - { - attr->st_ino = 0; - *i = NULL; - return rc; - } - else - { - fill_stat(destination, attr); - *i = destination; - return 0; - } + rc = path_walk(fp, &in, false); + if (rc < 0) { + attr->st_ino = 0; + *out = NULL; + return rc; + } else { + assert(in); + fill_stat(in, attr); + *out = in.get(); + return 0; + } } @@ -8406,12 +8388,13 @@ int Client::ll_setattr(Inode *in, struct stat *attr, int mask, int uid, tout(cct) 
<< attr->st_atime << std::endl; tout(cct) << mask << std::endl; - Inode *target = in; + InodeRef target(in); int res = _setattr(in, attr, mask, uid, gid, &target); if (res == 0) { - assert(in == target); + assert(in == target.get()); fill_stat(in, attr); } + ldout(cct, 3) << "ll_setattr " << vino << " = " << res << dendl; return res; } @@ -8423,81 +8406,81 @@ int Client::ll_setattr(Inode *in, struct stat *attr, int mask, int uid, int Client::getxattr(const char *path, const char *name, void *value, size_t size) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, true); + InodeRef in; + int r = Client::path_walk(path, &in, true); if (r < 0) return r; - return Client::_getxattr(ceph_inode, name, value, size, getuid(), getgid()); + return Client::_getxattr(in.get(), name, value, size, getuid(), getgid()); } int Client::lgetxattr(const char *path, const char *name, void *value, size_t size) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, false); + InodeRef in; + int r = Client::path_walk(path, &in, false); if (r < 0) return r; - return Client::_getxattr(ceph_inode, name, value, size, getuid(), getgid()); + return Client::_getxattr(in.get(), name, value, size, getuid(), getgid()); } int Client::listxattr(const char *path, char *list, size_t size) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, true); + InodeRef in; + int r = Client::path_walk(path, &in, true); if (r < 0) return r; - return Client::_listxattr(ceph_inode, list, size, getuid(), getgid()); + return Client::_listxattr(in.get(), list, size, getuid(), getgid()); } int Client::llistxattr(const char *path, char *list, size_t size) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, false); + InodeRef in; + int r = Client::path_walk(path, &in, false); if (r < 0) return r; - return 
Client::_listxattr(ceph_inode, list, size, getuid(), getgid()); + return Client::_listxattr(in.get(), list, size, getuid(), getgid()); } int Client::removexattr(const char *path, const char *name) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, true); + InodeRef in; + int r = Client::path_walk(path, &in, true); if (r < 0) return r; - return Client::_removexattr(ceph_inode, name, getuid(), getgid()); + return Client::_removexattr(in.get(), name, getuid(), getgid()); } int Client::lremovexattr(const char *path, const char *name) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, false); + InodeRef in; + int r = Client::path_walk(path, &in, false); if (r < 0) return r; - return Client::_removexattr(ceph_inode, name, getuid(), getgid()); + return Client::_removexattr(in.get(), name, getuid(), getgid()); } int Client::setxattr(const char *path, const char *name, const void *value, size_t size, int flags) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, true); + InodeRef in; + int r = Client::path_walk(path, &in, true); if (r < 0) return r; - return Client::_setxattr(ceph_inode, name, value, size, flags, getuid(), getgid()); + return Client::_setxattr(in.get(), name, value, size, flags, getuid(), getgid()); } int Client::lsetxattr(const char *path, const char *name, const void *value, size_t size, int flags) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, false); + InodeRef in; + int r = Client::path_walk(path, &in, false); if (r < 0) return r; - return Client::_setxattr(ceph_inode, name, value, size, flags, getuid(), getgid()); + return Client::_setxattr(in.get(), name, value, size, flags, getuid(), getgid()); } int Client::_getxattr(Inode *in, const char *name, void *value, size_t size, @@ -8953,7 +8936,7 @@ int Client::ll_readlink(Inode *in, char *buf, 
size_t buflen, int uid, int gid) } int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, - int uid, int gid, Inode **inp) + int uid, int gid, InodeRef *inp) { ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct << mode << dec << ", " << rdev << ", uid " << uid << ", gid " @@ -9014,21 +8997,21 @@ int Client::ll_mknod(Inode *parent, const char *name, mode_t mode, tout(cct) << mode << std::endl; tout(cct) << rdev << std::endl; - Inode *in = NULL; + InodeRef in; int r = _mknod(parent, name, mode, rdev, uid, gid, &in); if (r == 0) { fill_stat(in, attr); - _ll_get(in); + _ll_get(in.get()); } tout(cct) << attr->st_ino << std::endl; ldout(cct, 3) << "ll_mknod " << vparent << " " << name << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - *out = in; + *out = in.get(); return r; } int Client::_create(Inode *dir, const char *name, int flags, mode_t mode, - Inode **inp, Fh **fhp, int stripe_unit, int stripe_count, + InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count, int object_size, const char *data_pool, bool *created, int uid, int gid) { @@ -9093,7 +9076,7 @@ int Client::_create(Inode *dir, const char *name, int flags, mode_t mode, /* If the caller passed a value in fhp, do the open */ if(fhp) { (*inp)->get_open_ref(cmode); - *fhp = _create_fh(*inp, flags, cmode); + *fhp = _create_fh(inp->get(), flags, cmode); } reply_error: @@ -9113,7 +9096,7 @@ int Client::_create(Inode *dir, const char *name, int flags, mode_t mode, int Client::_mkdir(Inode *dir, const char *name, mode_t mode, int uid, int gid, - Inode **inp) + InodeRef *inp) { ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct << mode << dec << ", uid " << uid << ", gid " << gid << ")" @@ -9173,21 +9156,21 @@ int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode, tout(cct) << name << std::endl; tout(cct) << mode << std::endl; - Inode *in = NULL; + InodeRef in; int r = _mkdir(parent, name, mode, uid, gid, &in); if (r == 0) { 
fill_stat(in, attr); - _ll_get(in); + _ll_get(in.get()); } tout(cct) << attr->st_ino << std::endl; ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - *out = in; + *out = in.get(); return r; } int Client::_symlink(Inode *dir, const char *name, const char *target, int uid, - int gid, Inode **inp) + int gid, InodeRef *inp) { ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target << ", uid " << uid << ", gid " << gid << ")" << dendl; @@ -9245,16 +9228,16 @@ int Client::ll_symlink(Inode *parent, const char *name, const char *value, tout(cct) << name << std::endl; tout(cct) << value << std::endl; - Inode *in = NULL; + InodeRef in; int r = _symlink(parent, name, value, uid, gid, &in); if (r == 0) { fill_stat(in, attr); - _ll_get(in); + _ll_get(in.get()); } tout(cct) << attr->st_ino << std::endl; ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - *out = in; + *out = in.get(); return r; } @@ -9273,6 +9256,8 @@ int Client::_unlink(Inode *dir, const char *name, int uid, int gid) path.push_dentry(name); req->set_filepath(path); + InodeRef otherin; + Dentry *de; int res = get_or_create(dir, name, &de); if (res < 0) @@ -9281,11 +9266,10 @@ int Client::_unlink(Inode *dir, const char *name, int uid, int gid) req->dentry_drop = CEPH_CAP_FILE_SHARED; req->dentry_unless = CEPH_CAP_FILE_EXCL; - Inode *otherin; res = _lookup(dir, name, &otherin); if (res < 0) goto fail; - req->set_other_inode(otherin); + req->set_other_inode(otherin.get()); req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; req->set_inode(dir); @@ -9334,17 +9318,18 @@ int Client::_rmdir(Inode *dir, const char *name, int uid, int gid) req->dentry_unless = CEPH_CAP_FILE_EXCL; req->inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + InodeRef in; + Dentry *de; int res = get_or_create(dir, name, &de); if (res < 0) goto fail; - Inode 
*in; res = _lookup(dir, name, &in); if (res < 0) goto fail; if (req->get_op() == CEPH_MDS_OP_RMDIR) { req->set_dentry(de); - req->set_inode(in); + req->set_other_inode(in.get()); } else { unlink(de, true, true); } @@ -9391,6 +9376,7 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch return -EXDEV; } + InodeRef target; MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RENAME); filepath from; @@ -9418,27 +9404,26 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch req->dentry_drop = CEPH_CAP_FILE_SHARED; req->dentry_unless = CEPH_CAP_FILE_EXCL; - Inode *oldin; - res = _lookup(fromdir, fromname, &oldin); - if (res < 0) - goto fail; - req->set_old_inode(oldin); - req->old_inode_drop = CEPH_CAP_LINK_SHARED; + { + InodeRef oldin, otherin; + res = _lookup(fromdir, fromname, &oldin); + if (res < 0) + goto fail; + req->set_old_inode(oldin.get()); + req->old_inode_drop = CEPH_CAP_LINK_SHARED; - Inode *otherin; - res = _lookup(todir, toname, &otherin); - if (res != 0 && res != -ENOENT) { - goto fail; - } else if (res == 0) { - req->set_other_inode(otherin); - req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; - } + res = _lookup(todir, toname, &otherin); + if (res != 0 && res != -ENOENT) { + goto fail; + } else if (res == 0) { + req->set_other_inode(otherin.get()); + req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + } - req->set_inode(todir); + req->set_inode(todir); + } - Inode *target; res = make_request(req, uid, gid, &target); - ldout(cct, 10) << "rename result is " << res << dendl; // renamed item from our cache @@ -9471,7 +9456,7 @@ int Client::ll_rename(Inode *parent, const char *name, Inode *newparent, return _rename(parent, name, newparent, newname, uid, gid); } -int Client::_link(Inode *in, Inode *dir, const char *newname, int uid, int gid, Inode **inp) +int Client::_link(Inode *in, Inode *dir, const char *newname, int uid, int gid, InodeRef *inp) { ldout(cct, 3) << 
"_link(" << in->ino << " to " << dir->ino << " " << newname << " uid " << uid << " gid " << gid << ")" << dendl; @@ -9530,10 +9515,12 @@ int Client::ll_link(Inode *parent, Inode *newparent, const char *newname, tout(cct) << vnewparent << std::endl; tout(cct) << newname << std::endl; - int r = _link(parent, newparent, newname, uid, gid, &parent); + InodeRef target; + int r = _link(parent, newparent, newname, uid, gid, &target); if (r == 0) { - fill_stat(parent, attr); - _ll_get(parent); + assert(target); + fill_stat(target, attr); + _ll_get(target.get()); } return r; } @@ -9583,6 +9570,11 @@ int Client::ll_file_layout(Inode *in, ceph_file_layout *layout) return 0; } +int Client::ll_file_layout(Fh *fh, ceph_file_layout *layout) +{ + return ll_file_layout(fh->inode.get(), layout); +} + /* Currently we cannot take advantage of redundancy in reads, since we would have to go through all possible placement groups (a potentially quite large number determined by a hash), and use CRUSH @@ -9713,7 +9705,7 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode, tout(cct) << flags << std::endl; bool created = false; - Inode *in = NULL; + InodeRef in; int r = _lookup(parent, name, &in); if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL)) @@ -9724,9 +9716,6 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode, 0, 0, 0, NULL, &created, uid, gid); if (r < 0) goto out; - - if ((!in) && fhp) - in = (*fhp)->inode; } if (r < 0) @@ -9737,7 +9726,7 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode, ldout(cct, 20) << "ll_create created = " << created << dendl; if (!created) { - r = check_permissions(in, flags, uid, gid); + r = check_permissions(in.get(), flags, uid, gid); if (r < 0) { if (fhp && *fhp) { int release_r = _release_fh(*fhp); @@ -9746,7 +9735,7 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode, goto out; } if (fhp && (*fhp == NULL)) { - r = _open(in, flags, mode, fhp); + r = _open(in.get(), flags, mode, 
fhp); if (r < 0) goto out; } @@ -9766,8 +9755,8 @@ out: // passing an Inode in outp requires an additional ref if (outp) { if (in) - _ll_get(in); - *outp = in; + _ll_get(in.get()); + *outp = in.get(); } return r; @@ -9984,7 +9973,7 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length) if (objecter->osdmap_full_flag() && !(mode & FALLOC_FL_PUNCH_HOLE)) return -ENOSPC; - Inode *in = fh->inode; + Inode *in = fh->inode.get(); if (in->snapid != CEPH_NOSNAP) return -EROFS; @@ -10216,7 +10205,7 @@ int Client::describe_layout(const char *relpath, ceph_file_layout *lp) Mutex::Locker lock(client_lock); filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; @@ -10234,7 +10223,7 @@ int Client::fdescribe_layout(int fd, ceph_file_layout *lp) Fh *f = get_filehandle(fd); if (!f) return -EBADF; - Inode *in = f->inode; + Inode *in = f->inode.get(); *lp = in->layout; @@ -10285,7 +10274,7 @@ int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector& o Fh *f = get_filehandle(fd); if (!f) return -EBADF; - Inode *in = f->inode; + Inode *in = f->inode.get(); vector extents; Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents); @@ -10339,7 +10328,7 @@ int Client::get_file_stripe_address(int fd, loff_t offset, vector Fh *f = get_filehandle(fd); if (!f) return -EBADF; - Inode *in = f->inode; + Inode *in = f->inode.get(); // which object? 
vector extents; @@ -10385,7 +10374,7 @@ int Client::enumerate_layout(int fd, vector& result, Fh *f = get_filehandle(fd); if (!f) return -EBADF; - Inode *in = f->inode; + Inode *in = f->inode.get(); // map to a list of extents Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result); @@ -10534,7 +10523,7 @@ Inode *Client::get_quota_root(Inode *in) if (!in->dn_set.empty()) in = in->get_first_parent()->dir->parent_inode; else if (root_parents.count(in)) - in = root_parents[in]; + in = root_parents[in].get(); else in = NULL; } @@ -10760,3 +10749,12 @@ void Client::set_cap_epoch_barrier(epoch_t e) cap_epoch_barrier = e; } +void intrusive_ptr_add_ref(Inode *in) +{ + in->get(); +} + +void intrusive_ptr_release(Inode *in) +{ + in->client->put_inode(in); +} diff --git a/ceph/src/client/Client.h b/ceph/src/client/Client.h index b476f5ef..ff3875e6 100644 --- a/ceph/src/client/Client.h +++ b/ceph/src/client/Client.h @@ -47,12 +47,13 @@ using std::fstream; #include "common/Mutex.h" #include "common/Timer.h" #include "common/Finisher.h" - #include "common/compiler_extensions.h" #include "common/cmdparse.h" #include "osdc/ObjectCacher.h" +#include "InodeRef.h" + class MDSMap; class MonClient; @@ -119,7 +120,6 @@ struct DirEntry { DirEntry(const string &n, struct stat& s, int stm) : d_name(n), st(s), stmask(stm) {} }; -struct Inode; struct Cap; class Dir; class Dentry; @@ -138,7 +138,7 @@ typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino, vinodeno_t ino, string& name); typedef int (*client_remount_callback_t)(void *handle); -typedef int (*client_getgroups_callback_t)(void *handle, uid_t uid, gid_t **sgids); +typedef int (*client_getgroups_callback_t)(void *handle, gid_t **sgids); typedef void(*client_switch_interrupt_callback_t)(void *req, void *data); struct client_callback_args { @@ -169,7 +169,7 @@ struct dir_result_t { } - Inode *inode; + InodeRef inode; int64_t offset; // high bits: frag_t, low bits: an offset @@ 
-182,7 +182,7 @@ struct dir_result_t { int start_shared_gen; // dir shared_gen at start of readdir frag_t buffer_frag; - vector > *buffer; + vector > *buffer; string at_cache_name; // last entry we successfully returned @@ -300,12 +300,12 @@ public: int make_request(MetaRequest *req, int uid, int gid, //MClientRequest *req, int uid, int gid, - Inode **ptarget = 0, bool *pcreated = 0, + InodeRef *ptarget = 0, bool *pcreated = 0, int use_mds=-1, bufferlist *pdirbl=0); void put_request(MetaRequest *request); int verify_reply_trace(int r, MetaRequest *request, MClientReply *reply, - Inode **ptarget, bool *pcreated, int uid, int gid); + InodeRef *ptarget, bool *pcreated, int uid, int gid); void encode_cap_releases(MetaRequest *request, mds_rank_t mds); int encode_inode_release(Inode *in, MetaRequest *req, mds_rank_t mds, int drop, @@ -334,7 +334,7 @@ public: public: entity_name_t get_myname() { return messenger->get_myname(); } - void sync_write_commit(Inode *in); + void sync_write_commit(InodeRef& in); protected: Filer *filer; @@ -345,7 +345,7 @@ protected: // cache ceph::unordered_map inode_map; Inode* root; - map root_parents; + map root_parents; Inode* root_ancestor; LRU lru; // lru list of Dentry's in our local metadata cache. 
@@ -422,6 +422,7 @@ protected: friend class C_Client_SyncCommit; // Asserts on client_lock friend class C_Client_RequestInterrupt; friend class C_Client_Remount; + friend void intrusive_ptr_release(Inode *in); //int get_cache_size() { return lru.lru_get_size(); } //void set_cache_size(int m) { lru.lru_set_max(m); } @@ -435,9 +436,12 @@ protected: void unlink(Dentry *dn, bool keepdir, bool keepdentry); // path traversal for high-level interface - Inode *cwd; - int path_walk(const filepath& fp, Inode **end, bool followsym=true); + InodeRef cwd; + int path_walk(const filepath& fp, InodeRef *end, bool followsym=true); int fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0); + int fill_stat(InodeRef& in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0) { + return fill_stat(in.get(), st, dirstat, rstat); + } void touch_dn(Dentry *dn); // trim cache. @@ -559,7 +563,7 @@ protected: void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps); void _invalidate_inode_cache(Inode *in); void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len); - void _async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_caps); + void _async_invalidate(InodeRef& in, int64_t off, int64_t len, bool keep_caps); void _release(Inode *in); /** @@ -644,27 +648,34 @@ private: // internal interface // call these with client_lock held! 
- int _do_lookup(Inode *dir, const string& name, Inode **target); - int _lookup(Inode *dir, const string& dname, Inode **target); + int _do_lookup(Inode *dir, const string& name, InodeRef *target); + int _lookup(Inode *dir, const string& dname, InodeRef *target); - int _link(Inode *in, Inode *dir, const char *name, int uid=-1, int gid=-1, Inode **inp = 0); + int _link(Inode *in, Inode *dir, const char *name, int uid=-1, int gid=-1, InodeRef *inp = 0); int _unlink(Inode *dir, const char *name, int uid=-1, int gid=-1); int _rename(Inode *olddir, const char *oname, Inode *ndir, const char *nname, int uid=-1, int gid=-1); - int _mkdir(Inode *dir, const char *name, mode_t mode, int uid=-1, int gid=-1, Inode **inp = 0); + int _mkdir(Inode *dir, const char *name, mode_t mode, int uid=-1, int gid=-1, InodeRef *inp = 0); int _rmdir(Inode *dir, const char *name, int uid=-1, int gid=-1); - int _symlink(Inode *dir, const char *name, const char *target, int uid=-1, int gid=-1, Inode **inp = 0); - int _mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, int uid=-1, int gid=-1, Inode **inp = 0); - int _setattr(Inode *in, struct stat *attr, int mask, int uid=-1, int gid=-1, Inode **inp = 0); + int _symlink(Inode *dir, const char *name, const char *target, int uid=-1, int gid=-1, InodeRef *inp = 0); + int _mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, int uid=-1, int gid=-1, InodeRef *inp = 0); + int _setattr(Inode *in, struct stat *attr, int mask, int uid=-1, int gid=-1, InodeRef *inp = 0); + int _setattr(InodeRef &in, struct stat *attr, int mask, int uid=-1, int gid=-1, InodeRef *inp = 0) { + return _setattr(in.get(), attr, mask, uid, gid, inp); + } int _getattr(Inode *in, int mask, int uid=-1, int gid=-1, bool force=false); + int _getattr(InodeRef &in, int mask, int uid=-1, int gid=-1, bool force=false) { + return _getattr(in.get(), mask, uid, gid, force); + } int _readlink(Inode *in, char *buf, size_t size); int _getxattr(Inode *in, const char *name, 
void *value, size_t len, int uid=-1, int gid=-1); int _listxattr(Inode *in, char *names, size_t len, int uid=-1, int gid=-1); int _setxattr(Inode *in, const char *name, const void *value, size_t len, int flags, int uid=-1, int gid=-1); int _removexattr(Inode *in, const char *nm, int uid=-1, int gid=-1); int _open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid=-1, int gid=-1); - int _create(Inode *in, const char *name, int flags, mode_t mode, Inode **inp, Fh **fhp, + int _create(Inode *in, const char *name, int flags, mode_t mode, InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count, int object_size, const char *data_pool, bool *created = NULL, int uid=-1, int gid=-1); + loff_t _lseek(Fh *fh, loff_t offset, int whence); int _read(Fh *fh, int64_t offset, uint64_t size, bufferlist *bl); int _write(Fh *fh, int64_t offset, uint64_t size, const char *buf); @@ -952,6 +963,7 @@ public: int ll_getlk(Fh *fh, struct flock *fl, uint64_t owner); int ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req); int ll_flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req); + int ll_file_layout(Fh *fh, ceph_file_layout *layout); void ll_interrupt(void *d); int ll_get_stripe_osd(struct Inode *in, uint64_t blockno, ceph_file_layout* layout); diff --git a/ceph/src/client/Dentry.h b/ceph/src/client/Dentry.h index aad6343f..198b375c 100644 --- a/ceph/src/client/Dentry.h +++ b/ceph/src/client/Dentry.h @@ -5,17 +5,18 @@ #include "include/xlist.h" #include "mds/mdstypes.h" +#include "InodeRef.h" class Dir; struct Inode; class Dentry : public LRUObject { public: - string name; // sort of lame + string name; // sort of lame //const char *name; - Dir *dir; - Inode *inode; - int ref; // 1 if there's a dir beneath me. + Dir *dir; + InodeRef inode; + int ref; // 1 if there's a dir beneath me. 
uint64_t offset; mds_rank_t lease_mds; utime_t lease_ttl; @@ -47,7 +48,7 @@ class Dentry : public LRUObject { void dump(Formatter *f) const; Dentry() : - dir(0), inode(0), ref(1), offset(0), + dir(0), ref(1), offset(0), lease_mds(-1), lease_gen(0), lease_seq(0), cap_shared_gen(0), item_dentry_list(this) { } private: diff --git a/ceph/src/client/Fh.h b/ceph/src/client/Fh.h index dcf70cdb..db3a28c4 100644 --- a/ceph/src/client/Fh.h +++ b/ceph/src/client/Fh.h @@ -3,16 +3,16 @@ #include "common/Readahead.h" #include "include/types.h" +#include "InodeRef.h" -struct Inode; class Cond; class ceph_lock_state_t; // file handle for any open file state struct Fh { + InodeRef inode; int _ref; - Inode *inode; loff_t pos; int mds; // have to talk to mds we opened with (for now) int mode; // the mode i opened the file with @@ -27,7 +27,7 @@ struct Fh { ceph_lock_state_t *fcntl_locks; ceph_lock_state_t *flock_locks; - Fh() : _ref(1), inode(0), pos(0), mds(0), mode(0), flags(0), pos_locked(false), + Fh() : _ref(1), pos(0), mds(0), mode(0), flags(0), pos_locked(false), readahead(), fcntl_locks(NULL), flock_locks(NULL) {} void get() { ++_ref; } int put() { return --_ref; } diff --git a/ceph/src/client/Inode.cc b/ceph/src/client/Inode.cc index c63ba1c9..219af9fd 100644 --- a/ceph/src/client/Inode.cc +++ b/ceph/src/client/Inode.cc @@ -1,10 +1,11 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#include "MetaSession.h" +#include "Client.h" #include "Inode.h" #include "Dentry.h" #include "Dir.h" +#include "MetaSession.h" #include "ClientSnapRealm.h" ostream& operator<<(ostream &out, Inode &in) @@ -126,7 +127,7 @@ int Inode::put_cap_ref(int cap) if (cap & 1) { int c = 1 << n; if (cap_refs[c] <= 0) { - lderr(cct) << "put_cap_ref " << ccap_string(c) << " went negative on " << *this << dendl; + lderr(client->cct) << "put_cap_ref " << ccap_string(c) << " went negative on " << *this << dendl; assert(cap_refs[c] > 0); } if (--cap_refs[c] == 
0) @@ -151,7 +152,7 @@ bool Inode::cap_is_valid(Cap* cap) << "cap expire " << cap->session->cap_ttl << std::endl << "cur time " << ceph_clock_now(cct) << std::endl;*/ if ((cap->session->cap_gen <= cap->gen) - && (ceph_clock_now(cct) < cap->session->cap_ttl)) { + && (ceph_clock_now(client->cct) < cap->session->cap_ttl)) { return true; } return true; @@ -268,7 +269,7 @@ Dir *Inode::open_dir() { if (!dir) { dir = new Dir(this); - lsubdout(cct, mds, 15) << "open_dir " << dir << " on " << this << dendl; + lsubdout(client->cct, client, 15) << "open_dir " << dir << " on " << this << dendl; assert(dn_set.size() < 2); // dirs can't be hard-linked if (!dn_set.empty()) (*dn_set.begin())->get(); // pin dentry @@ -307,6 +308,21 @@ bool Inode::check_mode(uid_t ruid, gid_t rgid, gid_t *sgids, int sgids_count, ui return (mode & fmode) == fmode; } +void Inode::get() { + _ref++; + lsubdout(client->cct, client, 15) << "inode.get on " << this << " " << ino << '.' << snapid + << " now " << _ref << dendl; +} + +//private method to put a reference; see Client::put_inode() +int Inode::_put(int n) { + _ref -= n; + lsubdout(client->cct, client, 15) << "inode.put on " << this << " " << ino << '.' 
<< snapid + << " now " << _ref << dendl; + assert(_ref >= 0); + return _ref; +} + void Inode::dump(Formatter *f) const { diff --git a/ceph/src/client/Inode.h b/ceph/src/client/Inode.h index 4a274026..90107ec0 100644 --- a/ceph/src/client/Inode.h +++ b/ceph/src/client/Inode.h @@ -13,6 +13,9 @@ #include "osdc/ObjectCacher.h" #include "include/assert.h" +#include "InodeRef.h" + +class Client; struct MetaSession; class Dentry; class Dir; @@ -41,7 +44,7 @@ struct Cap { struct CapSnap { //snapid_t follows; // map key - Inode *in; + InodeRef in; SnapContext context; int issued, dirty; @@ -147,7 +150,7 @@ public: #define I_DIR_ORDERED 2 struct Inode { - CephContext *cct; + Client *client; // -- the actual inode -- inodeno_t ino; @@ -236,7 +239,7 @@ struct Inode { SnapRealm *snaprealm; xlist::item snaprealm_item; - Inode *snapdir_parent; // only if we are a snapdir inode + InodeRef snapdir_parent; // only if we are a snapdir inode map cap_snaps; // pending flush to mds //int open_by_mode[CEPH_FILE_MODE_NUM]; @@ -267,19 +270,8 @@ struct Inode { void make_long_path(filepath& p); void make_nosnap_relative_path(filepath& p); - void get() { - _ref++; - lsubdout(cct, mds, 15) << "inode.get on " << this << " " << ino << '.' << snapid - << " now " << _ref << dendl; - } - /// private method to put a reference; see Client::put_inode() - int _put(int n=1) { - _ref -= n; - lsubdout(cct, mds, 15) << "inode.put on " << this << " " << ino << '.' 
<< snapid - << " now " << _ref << dendl; - assert(_ref >= 0); - return _ref; - } + void get(); + int _put(int n=1); int get_num_ref() { return _ref; @@ -297,8 +289,8 @@ struct Inode { ceph_lock_state_t *fcntl_locks; ceph_lock_state_t *flock_locks; - Inode(CephContext *cct_, vinodeno_t vino, ceph_file_layout *newlayout) - : cct(cct_), ino(vino.ino), snapid(vino.snapid), + Inode(Client *c, vinodeno_t vino, ceph_file_layout *newlayout) + : client(c), ino(vino.ino), snapid(vino.snapid), rdev(0), mode(0), uid(0), gid(0), nlink(0), size(0), truncate_seq(1), truncate_size(-1), time_warp_seq(0), max_size(0), version(0), xattr_version(0), @@ -309,7 +301,7 @@ struct Inode { dirty_caps(0), flushing_caps(0), flushing_cap_seq(0), shared_gen(0), cache_gen(0), snap_caps(0), snap_cap_refs(0), cap_item(this), flushing_cap_item(this), last_flush_tid(0), - snaprealm(0), snaprealm_item(this), snapdir_parent(0), + snaprealm(0), snaprealm_item(this), oset((void *)this, newlayout->fl_pg_pool, ino), reported_size(0), wanted_max_size(0), requested_max_size(0), _ref(0), ll_ref(0), dir(0), dn_set(), diff --git a/ceph/src/client/InodeRef.h b/ceph/src/client/InodeRef.h new file mode 100644 index 00000000..822ec0ff --- /dev/null +++ b/ceph/src/client/InodeRef.h @@ -0,0 +1,12 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CLIENT_INODEREF_H +#define CEPH_CLIENT_INODEREF_H + +#include +class Inode; +void intrusive_ptr_add_ref(Inode *in); +void intrusive_ptr_release(Inode *in); +typedef boost::intrusive_ptr InodeRef; +#endif diff --git a/ceph/src/client/Makefile.am b/ceph/src/client/Makefile.am index 60dd2278..0d2a5fa0 100644 --- a/ceph/src/client/Makefile.am +++ b/ceph/src/client/Makefile.am @@ -22,7 +22,8 @@ noinst_HEADERS += \ client/SyntheticClient.h \ client/Trace.h \ client/ioctl.h \ - client/ObjecterWriteback.h + client/ObjecterWriteback.h \ + client/InodeRef.h if WITH_FUSE libclient_fuse_la_SOURCES = client/fuse_ll.cc 
diff --git a/ceph/src/client/MetaRequest.cc b/ceph/src/client/MetaRequest.cc index c8c4552d..330edde1 100644 --- a/ceph/src/client/MetaRequest.cc +++ b/ceph/src/client/MetaRequest.cc @@ -57,9 +57,6 @@ void MetaRequest::dump(Formatter *f) const MetaRequest::~MetaRequest() { - assert(!_inode); - assert(!_old_inode); - assert(!_other_inode); if (_dentry) _dentry->put(); if (_old_dentry) @@ -68,33 +65,6 @@ MetaRequest::~MetaRequest() reply->put(); } -void MetaRequest::set_inode(Inode *in) { - assert(_inode == NULL); - _inode = in; - _inode->get(); -} -Inode *MetaRequest::inode() { - return _inode; -} - -void MetaRequest::set_old_inode(Inode *in) { - assert(_old_inode == NULL); - _old_inode = in; - _old_inode->get(); -} -Inode *MetaRequest::old_inode() { - return _old_inode; -} - -void MetaRequest::set_other_inode(Inode *in) { - assert(_other_inode == NULL); - _other_inode = in; - _other_inode->get(); -} -Inode *MetaRequest::other_inode() { - return _other_inode; -} - void MetaRequest::set_dentry(Dentry *d) { assert(_dentry == NULL); _dentry = d; diff --git a/ceph/src/client/MetaRequest.h b/ceph/src/client/MetaRequest.h index e3b6bd16..aeb68cb4 100644 --- a/ceph/src/client/MetaRequest.h +++ b/ceph/src/client/MetaRequest.h @@ -11,19 +11,18 @@ #include "include/filepath.h" #include "include/atomic.h" #include "mds/mdstypes.h" +#include "InodeRef.h" #include "common/Mutex.h" #include "messages/MClientRequest.h" class MClientReply; -struct Inode; class Dentry; struct MetaRequest { private: - Inode *_inode; - Inode *_old_inode, *_other_inode; + InodeRef _inode, _old_inode, _other_inode; Dentry *_dentry; //associated with path Dentry *_old_dentry; //associated with path2 public: @@ -61,7 +60,7 @@ public: uint64_t readdir_offset; frag_t readdir_reply_frag; - vector > readdir_result; + vector > readdir_result; bool readdir_end; int readdir_num; string readdir_last_name; @@ -76,10 +75,9 @@ public: Cond *caller_cond; // who to take up Cond *dispatch_cond; // who to kick back - 
Inode *target; + InodeRef target; MetaRequest(int op) : - _inode(NULL), _old_inode(NULL), _other_inode(NULL), _dentry(NULL), _old_dentry(NULL), tid(0), inode_drop(0), inode_unless(0), @@ -95,33 +93,38 @@ public: readdir_offset(0), readdir_end(false), readdir_num(0), got_unsafe(false), item(this), unsafe_item(this), lock("MetaRequest lock"), - caller_cond(0), dispatch_cond(0), - target(0) { + caller_cond(0), dispatch_cond(0) { memset(&head, 0, sizeof(ceph_mds_request_head)); head.op = op; } ~MetaRequest(); - void set_inode(Inode *in); - Inode *inode(); - Inode *take_inode() { - Inode *i = _inode; - _inode = 0; - return i; - } - void set_old_inode(Inode *in); - Inode *old_inode(); - Inode *take_old_inode() { - Inode *i = _old_inode; - _old_inode = NULL; - return i; - } - void set_other_inode(Inode *in); - Inode *other_inode(); - Inode *take_other_inode() { - Inode *i = _other_inode; - _other_inode = 0; - return i; + void set_inode(Inode *in) { + _inode = in; + } + Inode *inode() { + return _inode.get(); + } + void take_inode(InodeRef *out) { + out->swap(_inode); + } + void set_old_inode(Inode *in) { + _old_inode = in; + } + Inode *old_inode() { + return _old_inode.get(); + } + void take_old_inode(InodeRef *out) { + out->swap(_old_inode); + } + void set_other_inode(Inode *in) { + _other_inode = in; + } + Inode *other_inode() { + return _other_inode.get(); + } + void take_other_inode(InodeRef *out) { + out->swap(_other_inode); } void set_dentry(Dentry *d); Dentry *dentry(); diff --git a/ceph/src/client/fuse_ll.cc b/ceph/src/client/fuse_ll.cc index f9c380c9..caf6f0f6 100644 --- a/ceph/src/client/fuse_ll.cc +++ b/ceph/src/client/fuse_ll.cc @@ -90,12 +90,23 @@ public: ceph::unordered_map snap_stag_map; ceph::unordered_map stag_snap_map; + pthread_key_t fuse_req_key; + void set_fuse_req(fuse_req_t); + fuse_req_t get_fuse_req(); + struct fuse_args args; }; -static void fuse_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) +static CephFuse::Handle 
*fuse_ll_req_prepare(fuse_req_t req) { CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + cfuse->set_fuse_req(req); + return cfuse; +} + +static void fuse_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); struct fuse_entry_param fe; Inode *i2, *i1 = cfuse->iget(parent); // see below @@ -119,7 +130,7 @@ static void fuse_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) static void fuse_ll_forget(fuse_req_t req, fuse_ino_t ino, long unsigned nlookup) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); cfuse->client->ll_forget(cfuse->iget(ino), nlookup+1); fuse_reply_none(req); } @@ -127,7 +138,7 @@ static void fuse_ll_forget(fuse_req_t req, fuse_ino_t ino, static void fuse_ll_getattr(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(ino); struct stat stbuf; @@ -148,7 +159,7 @@ static void fuse_ll_getattr(fuse_req_t req, fuse_ino_t ino, static void fuse_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, int to_set, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(ino); @@ -174,7 +185,7 @@ static void fuse_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, static void fuse_ll_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, const char *value, size_t size, int flags) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = 
fuse_req_ctx(req); Inode *in = cfuse->iget(ino); @@ -187,7 +198,7 @@ static void fuse_ll_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, static void fuse_ll_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(ino); char buf[size]; @@ -206,7 +217,7 @@ static void fuse_ll_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) static void fuse_ll_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, size_t size) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(ino); char buf[size]; @@ -225,7 +236,7 @@ static void fuse_ll_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, static void fuse_ll_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(ino); @@ -239,13 +250,13 @@ static void fuse_ll_removexattr(fuse_req_t req, fuse_ino_t ino, static void fuse_ll_opendir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(ino); void *dirp; - int r = cfuse->client->ll_opendir(in, (dir_result_t **) &dirp, ctx->uid, - ctx->gid); + int r = cfuse->client->ll_opendir(in, (dir_result_t **)&dirp, + ctx->uid, ctx->gid); if (r >= 0) { fi->fh = (long)dirp; fuse_reply_open(req, fi); @@ -258,7 +269,7 @@ static void fuse_ll_opendir(fuse_req_t req, fuse_ino_t ino, static void fuse_ll_readlink(fuse_req_t req, 
fuse_ino_t ino) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(ino); char buf[PATH_MAX + 1]; // leave room for a null terminator @@ -277,7 +288,7 @@ static void fuse_ll_readlink(fuse_req_t req, fuse_ino_t ino) static void fuse_ll_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, mode_t mode, dev_t rdev) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *i2, *i1 = cfuse->iget(parent); struct fuse_entry_param fe; @@ -302,7 +313,7 @@ static void fuse_ll_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, static void fuse_ll_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, mode_t mode) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *i2, *i1 = cfuse->iget(parent); struct fuse_entry_param fe; @@ -326,7 +337,7 @@ static void fuse_ll_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, static void fuse_ll_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(parent); @@ -338,7 +349,7 @@ static void fuse_ll_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) static void fuse_ll_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(parent); @@ -351,7 +362,7 @@ static void fuse_ll_rmdir(fuse_req_t req, fuse_ino_t parent, 
const char *name) static void fuse_ll_symlink(fuse_req_t req, const char *existing, fuse_ino_t parent, const char *name) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *i2, *i1 = cfuse->iget(parent); struct fuse_entry_param fe; @@ -376,7 +387,7 @@ static void fuse_ll_symlink(fuse_req_t req, const char *existing, static void fuse_ll_rename(fuse_req_t req, fuse_ino_t parent, const char *name, fuse_ino_t newparent, const char *newname) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(parent); Inode *nin = cfuse->iget(newparent); @@ -391,7 +402,7 @@ static void fuse_ll_rename(fuse_req_t req, fuse_ino_t parent, const char *name, static void fuse_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, const char *newname) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(ino); Inode *nin = cfuse->iget(newparent); @@ -416,7 +427,7 @@ static void fuse_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, static void fuse_ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *in = cfuse->iget(ino); Fh *fh = NULL; @@ -439,7 +450,7 @@ static void fuse_ll_open(fuse_req_t req, fuse_ino_t ino, static void fuse_ll_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); Fh *fh = 
reinterpret_cast(fi->fh); bufferlist bl; int r = cfuse->client->ll_read(fh, off, size, &bl); @@ -452,7 +463,7 @@ static void fuse_ll_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, static void fuse_ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size, off_t off, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); Fh *fh = reinterpret_cast(fi->fh); int r = cfuse->client->ll_write(fh, off, size, buf); if (r >= 0) @@ -464,7 +475,7 @@ static void fuse_ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf, static void fuse_ll_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); Fh *fh = reinterpret_cast(fi->fh); int r = cfuse->client->ll_flush(fh); fuse_reply_err(req, -r); @@ -474,7 +485,7 @@ static void fuse_ll_flush(fuse_req_t req, fuse_ino_t ino, static void fuse_ll_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd, void *arg, struct fuse_file_info *fi, unsigned flags, const void *in_buf, size_t in_bufsz, size_t out_bufsz) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); if (flags & FUSE_IOCTL_COMPAT) { fuse_reply_err(req, ENOSYS); @@ -486,7 +497,7 @@ static void fuse_ll_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd, void *arg, st struct ceph_file_layout layout; struct ceph_ioctl_layout l; Fh *fh = (Fh*)fi->fh; - cfuse->client->ll_file_layout(fh->inode, &layout); + cfuse->client->ll_file_layout(fh, &layout); l.stripe_unit = layout.fl_stripe_unit; l.stripe_count = layout.fl_stripe_count; l.object_size = layout.fl_object_size; @@ -506,7 +517,7 @@ static void fuse_ll_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, off_t length, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle 
*)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); Fh *fh = (Fh*)fi->fh; int r = cfuse->client->ll_fallocate(fh, mode, offset, length); fuse_reply_err(req, -r); @@ -517,7 +528,7 @@ static void fuse_ll_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, static void fuse_ll_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); Fh *fh = reinterpret_cast(fi->fh); int r = cfuse->client->ll_release(fh); fuse_reply_err(req, -r); @@ -526,7 +537,7 @@ static void fuse_ll_release(fuse_req_t req, fuse_ino_t ino, static void fuse_ll_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); Fh *fh = reinterpret_cast(fi->fh); int r = cfuse->client->ll_fsync(fh, datasync); fuse_reply_err(req, -r); @@ -567,7 +578,7 @@ static int fuse_ll_add_dirent(void *p, struct dirent *de, struct stat *st, static void fuse_ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); dir_result_t *dirp = reinterpret_cast(fi->fh); cfuse->client->seekdir(dirp, off); @@ -590,7 +601,7 @@ static void fuse_ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, static void fuse_ll_releasedir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); dir_result_t *dirp = reinterpret_cast(fi->fh); cfuse->client->ll_releasedir(dirp); fuse_reply_err(req, 0); @@ -604,7 +615,7 @@ static void fuse_ll_access(fuse_req_t req, fuse_ino_t ino, int mask) static void fuse_ll_create(fuse_req_t req, fuse_ino_t parent, 
const char *name, mode_t mode, struct fuse_file_info *fi) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); const struct fuse_ctx *ctx = fuse_req_ctx(req); Inode *i1 = cfuse->iget(parent), *i2; struct fuse_entry_param fe; @@ -629,7 +640,7 @@ static void fuse_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name, static void fuse_ll_statfs(fuse_req_t req, fuse_ino_t ino) { struct statvfs stbuf; - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); Inode *in = cfuse->iget(ino); int r = cfuse->client->ll_statfs(in, &stbuf); @@ -644,7 +655,7 @@ static void fuse_ll_statfs(fuse_req_t req, fuse_ino_t ino) static void fuse_ll_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, struct flock *lock) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); Fh *fh = reinterpret_cast(fi->fh); int r = cfuse->client->ll_getlk(fh, lock, fi->lock_owner); @@ -657,7 +668,7 @@ static void fuse_ll_getlk(fuse_req_t req, fuse_ino_t ino, static void fuse_ll_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, struct flock *lock, int sleep) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); Fh *fh = reinterpret_cast(fi->fh); // must use multithread if operation may block @@ -673,7 +684,7 @@ static void fuse_ll_setlk(fuse_req_t req, fuse_ino_t ino, static void fuse_ll_interrupt(fuse_req_t req, void* data) { - CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); cfuse->client->ll_interrupt(data); } @@ -689,7 +700,7 @@ static void switch_interrupt_cb(void *req, void* data) static void fuse_ll_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, int cmd) { - CephFuse::Handle *cfuse = 
(CephFuse::Handle *)fuse_req_userdata(req); + CephFuse::Handle *cfuse = fuse_ll_req_prepare(req); Fh *fh = (Fh*)fi->fh; // must use multithread if operation may block @@ -704,12 +715,14 @@ static void fuse_ll_flock(fuse_req_t req, fuse_ino_t ino, } #endif -#if 0 -static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids) +static int getgroups_cb(void *handle, gid_t **sgids) { -#ifdef HAVE_FUSE_GETGROUPS +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) + CephFuse::Handle *cfuse = (CephFuse::Handle *)handle; + fuse_req_t req = cfuse->get_fuse_req(); + assert(sgids); - int c = fuse_getgroups(0, NULL); + int c = fuse_req_getgroups(req, 0, NULL); if (c < 0) { return c; } @@ -721,16 +734,16 @@ static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids) if (!*sgids) { return -ENOMEM; } - c = fuse_getgroups(c, *sgids); + c = fuse_req_getgroups(req, c, *sgids); if (c < 0) { free(*sgids); return c; } return c; +#else + return -ENOSYS; #endif - return 0; } -#endif static void ino_invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len) @@ -878,10 +891,18 @@ void CephFuse::Handle::finalize() if (ch) fuse_unmount(mountpoint, ch); + pthread_key_delete(fuse_req_key); } int CephFuse::Handle::init(int argc, const char *argv[]) { + + int r = pthread_key_create(&fuse_req_key, NULL); + if (r) { + derr << "pthread_key_create failed." << dendl; + return r; + } + // set up fuse argc/argv int newargc = 0; const char **newargv = (const char **) malloc((argc + 10) * sizeof(char *)); @@ -960,17 +981,7 @@ int CephFuse::Handle::start() dentry_cb: dentry_invalidate_cb, switch_intr_cb: switch_interrupt_cb, remount_cb: remount_cb, - /* - * this is broken: - * - * - the cb needs the request handle to be useful; we should get the - * gids in the method here in fuse_ll.c and pass the gid list in, - * not use a callback. - * - the callback mallocs the list but it is not free()'d - * - * so disable it for now... 
- getgroups_cb: getgroups_cb, - */ + getgroups_cb: getgroups_cb, }; client->ll_register_callbacks(&args); @@ -1031,6 +1042,17 @@ uint64_t CephFuse::Handle::make_fake_ino(inodeno_t ino, snapid_t snapid) return fino; } +void CephFuse::Handle::set_fuse_req(fuse_req_t req) +{ + pthread_setspecific(fuse_req_key, (void*)req); +} + +fuse_req_t CephFuse::Handle::get_fuse_req() +{ + return (fuse_req_t) pthread_getspecific(fuse_req_key); +} + + CephFuse::CephFuse(Client *c, int fd) : _handle(new CephFuse::Handle(c, fd)) { } diff --git a/ceph/src/cls/rbd/cls_rbd.cc b/ceph/src/cls/rbd/cls_rbd.cc index ae2a4325..c10263c9 100644 --- a/ceph/src/cls/rbd/cls_rbd.cc +++ b/ceph/src/cls/rbd/cls_rbd.cc @@ -104,6 +104,7 @@ cls_method_handle_t h_old_snapshot_remove; #define RBD_SNAP_KEY_PREFIX "snapshot_" #define RBD_DIR_ID_KEY_PREFIX "id_" #define RBD_DIR_NAME_KEY_PREFIX "name_" +#define RBD_MAX_OBJECT_MAP_OBJECT_COUNT 256000000 static int snap_read_header(cls_method_context_t hctx, bufferlist& bl) { @@ -1996,6 +1997,12 @@ int object_map_resize(cls_method_context_t hctx, bufferlist *in, bufferlist *out return -EINVAL; } + // protect against excessive memory requirements + if (object_count > RBD_MAX_OBJECT_MAP_OBJECT_COUNT) { + CLS_ERR("object map too large: %" PRIu64, object_count); + return -EINVAL; + } + BitVector<2> object_map; int r = object_map_read(hctx, object_map); if ((r < 0) && (r != -ENOENT)) { diff --git a/ceph/src/cls/rgw/cls_rgw.cc b/ceph/src/cls/rgw/cls_rgw.cc index 980884a9..a06b0d83 100644 --- a/ceph/src/cls/rgw/cls_rgw.cc +++ b/ceph/src/cls/rgw/cls_rgw.cc @@ -1063,6 +1063,9 @@ public: initialized = true; } + void set_epoch(uint64_t epoch) { + instance_entry.versioned_epoch = epoch; + } int unlink_list_entry() { string list_idx; @@ -1540,12 +1543,27 @@ static int rgw_bucket_unlink_instance(cls_method_context_t hctx, bufferlist *in, return ret; } - ret = olh.init(NULL); + bool olh_found; + ret = olh.init(&olh_found); if (ret < 0) { CLS_LOG(0, "ERROR: olh.init() returned 
ret=%d", ret); return ret; } + if (!olh_found) { + bool instance_only = false; + cls_rgw_obj_key key(dest_key.name); + ret = convert_plain_entry_to_versioned(hctx, key, true, instance_only); + if (ret < 0) { + CLS_LOG(0, "ERROR: convert_plain_entry_to_versioned ret=%d", ret); + return ret; + } + olh.update(dest_key, false); + olh.set_tag(op.olh_tag); + + obj.set_epoch(1); + } + if (!olh.start_modify(op.olh_epoch)) { ret = obj.unlink_list_entry(); if (ret < 0) { diff --git a/ceph/src/cls/rgw/cls_rgw_client.cc b/ceph/src/cls/rgw/cls_rgw_client.cc index e6ac56b8..49bb840c 100644 --- a/ceph/src/cls/rgw/cls_rgw_client.cc +++ b/ceph/src/cls/rgw/cls_rgw_client.cc @@ -310,13 +310,14 @@ int cls_rgw_bucket_link_olh(librados::IoCtx& io_ctx, const string& oid, const cl int cls_rgw_bucket_unlink_instance(librados::IoCtx& io_ctx, const string& oid, const cls_rgw_obj_key& key, const string& op_tag, - uint64_t olh_epoch, bool log_op) + const string& olh_tag, uint64_t olh_epoch, bool log_op) { bufferlist in, out; struct rgw_cls_unlink_instance_op call; call.key = key; call.op_tag = op_tag; call.olh_epoch = olh_epoch; + call.olh_tag = olh_tag; call.log_op = log_op; ::encode(call, in); int r = io_ctx.exec(oid, "rgw", "bucket_unlink_instance", in, out); diff --git a/ceph/src/cls/rgw/cls_rgw_client.h b/ceph/src/cls/rgw/cls_rgw_client.h index ecec6791..a3555e79 100644 --- a/ceph/src/cls/rgw/cls_rgw_client.h +++ b/ceph/src/cls/rgw/cls_rgw_client.h @@ -329,7 +329,7 @@ int cls_rgw_bucket_link_olh(librados::IoCtx& io_ctx, const string& oid, const cl bool delete_marker, const string& op_tag, struct rgw_bucket_dir_entry_meta *meta, uint64_t olh_epoch, bool log_op); int cls_rgw_bucket_unlink_instance(librados::IoCtx& io_ctx, const string& oid, const cls_rgw_obj_key& key, const string& op_tag, - uint64_t olh_epoch, bool log_op); + const string& olh_tag, uint64_t olh_epoch, bool log_op); int cls_rgw_get_olh_log(librados::IoCtx& io_ctx, string& oid, librados::ObjectReadOperation& op, const 
cls_rgw_obj_key& olh, uint64_t ver_marker, const string& olh_tag, map > *log, bool *is_truncated); diff --git a/ceph/src/cls/rgw/cls_rgw_ops.h b/ceph/src/cls/rgw/cls_rgw_ops.h index 0a0686fb..dece239f 100644 --- a/ceph/src/cls/rgw/cls_rgw_ops.h +++ b/ceph/src/cls/rgw/cls_rgw_ops.h @@ -204,26 +204,31 @@ struct rgw_cls_unlink_instance_op { uint64_t olh_epoch; bool log_op; uint16_t bilog_flags; + string olh_tag; rgw_cls_unlink_instance_op() : olh_epoch(0), log_op(false), bilog_flags(0) {} void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(key, bl); ::encode(op_tag, bl); ::encode(olh_epoch, bl); ::encode(log_op, bl); ::encode(bilog_flags, bl); + ::encode(olh_tag, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { - DECODE_START(1, bl); + DECODE_START(2, bl); ::decode(key, bl); ::decode(op_tag, bl); ::decode(olh_epoch, bl); ::decode(log_op, bl); ::decode(bilog_flags, bl); + if (struct_v >= 2) { + ::decode(olh_tag, bl); + } DECODE_FINISH(bl); } diff --git a/ceph/src/common/Cycles.cc b/ceph/src/common/Cycles.cc index b0b687e4..656f08b4 100644 --- a/ceph/src/common/Cycles.cc +++ b/ceph/src/common/Cycles.cc @@ -38,14 +38,15 @@ #include "Cycles.h" double Cycles::cycles_per_sec = 0; -static Initialize _(Cycles::init); /** * Perform once-only overall initialization for the Cycles class, such - * as calibrating the clock frequency. This method is invoked automatically - * during initialization, but it may be invoked explicitly by other modules - * to ensure that initialization occurs before those modules initialize - * themselves. + * as calibrating the clock frequency. This method must be called + * before using the Cycles module. + * + * It is not initialized by default because the timing loops cause + * general process startup times to balloon + * (http://tracker.ceph.com/issues/15225). 
*/ void Cycles::init() { diff --git a/ceph/src/common/WorkQueue.h b/ceph/src/common/WorkQueue.h index 7d453e6e..1c2e475d 100644 --- a/ceph/src/common/WorkQueue.h +++ b/ceph/src/common/WorkQueue.h @@ -310,13 +310,22 @@ public: class PointerWQ : public WorkQueue_ { public: PointerWQ(string n, time_t ti, time_t sti, ThreadPool* p) - : WorkQueue_(n, ti, sti), m_pool(p) { + : WorkQueue_(n, ti, sti), m_pool(p), m_processing(0) { m_pool->add_work_queue(this); } ~PointerWQ() { m_pool->remove_work_queue(this); + assert(m_processing == 0); } void drain() { + { + // if this queue is empty and not processing, don't wait for other + // queues to finish processing + Mutex::Locker l(m_pool->_lock); + if (m_processing == 0 && m_items.empty()) { + return; + } + } m_pool->drain(this); } void queue(T *item) { @@ -324,6 +333,10 @@ public: m_items.push_back(item); m_pool->_cond.SignalOne(); } + bool empty() { + Mutex::Locker l(m_pool->_lock); + return _empty(); + } protected: virtual void _clear() { assert(m_pool->_lock.is_locked()); @@ -339,6 +352,7 @@ public: return NULL; } + ++m_processing; T *item = m_items.front(); m_items.pop_front(); return item; @@ -347,6 +361,9 @@ public: process(reinterpret_cast(item)); } virtual void _void_process_finish(void *item) { + assert(m_pool->_lock.is_locked()); + assert(m_processing > 0); + --m_processing; } virtual void process(T *item) = 0; @@ -365,6 +382,7 @@ public: private: ThreadPool *m_pool; std::list m_items; + uint32_t m_processing; }; private: vector work_queues; diff --git a/ceph/src/common/bit_vector.hpp b/ceph/src/common/bit_vector.hpp index f66294b5..06600e9d 100644 --- a/ceph/src/common/bit_vector.hpp +++ b/ceph/src/common/bit_vector.hpp @@ -35,6 +35,7 @@ private: BOOST_STATIC_ASSERT((_bit_count != 0) && !(_bit_count & (_bit_count - 1))); BOOST_STATIC_ASSERT(_bit_count <= BITS_PER_BYTE); public: + static const uint32_t BLOCK_SIZE; class ConstReference { public: @@ -110,6 +111,9 @@ private: }; +template +const uint32_t 
BitVector<_b>::BLOCK_SIZE = 4096; + template BitVector<_b>::BitVector() : m_size(0), m_crc_enabled(true) { @@ -135,7 +139,7 @@ void BitVector<_b>::resize(uint64_t size) { } m_size = size; - uint64_t block_count = (buffer_size + CEPH_PAGE_SIZE - 1) / CEPH_PAGE_SIZE; + uint64_t block_count = (buffer_size + BLOCK_SIZE - 1) / BLOCK_SIZE; m_data_crcs.resize(block_count); } @@ -190,26 +194,26 @@ uint64_t BitVector<_b>::get_header_length() const { template void BitVector<_b>::encode_data(bufferlist& bl, uint64_t byte_offset, uint64_t byte_length) const { - assert(byte_offset % CEPH_PAGE_SIZE == 0); + assert(byte_offset % BLOCK_SIZE == 0); assert(byte_offset + byte_length == m_data.length() || - byte_length % CEPH_PAGE_SIZE == 0); + byte_length % BLOCK_SIZE == 0); uint64_t end_offset = byte_offset + byte_length; while (byte_offset < end_offset) { - uint64_t len = MIN(CEPH_PAGE_SIZE, end_offset - byte_offset); + uint64_t len = MIN(BLOCK_SIZE, end_offset - byte_offset); bufferlist bit; bit.substr_of(m_data, byte_offset, len); - m_data_crcs[byte_offset / CEPH_PAGE_SIZE] = bit.crc32c(0); + m_data_crcs[byte_offset / BLOCK_SIZE] = bit.crc32c(0); bl.claim_append(bit); - byte_offset += CEPH_PAGE_SIZE; + byte_offset += BLOCK_SIZE; } } template void BitVector<_b>::decode_data(bufferlist::iterator& it, uint64_t byte_offset) { - assert(byte_offset % CEPH_PAGE_SIZE == 0); + assert(byte_offset % BLOCK_SIZE == 0); if (it.end()) { return; } @@ -225,12 +229,12 @@ void BitVector<_b>::decode_data(bufferlist::iterator& it, uint64_t byte_offset) } while (byte_offset < end_offset) { - uint64_t len = MIN(CEPH_PAGE_SIZE, end_offset - byte_offset); + uint64_t len = MIN(BLOCK_SIZE, end_offset - byte_offset); bufferlist bit; it.copy(len, bit); if (m_crc_enabled && - m_data_crcs[byte_offset / CEPH_PAGE_SIZE] != bit.crc32c(0)) { + m_data_crcs[byte_offset / BLOCK_SIZE] != bit.crc32c(0)) { throw buffer::malformed_input("invalid data block CRC"); } data.append(bit); @@ -250,15 +254,15 @@ template void 
BitVector<_b>::get_data_extents(uint64_t offset, uint64_t length, uint64_t *byte_offset, uint64_t *byte_length) const { - // read CEPH_PAGE_SIZE-aligned chunks + // read BLOCK_SIZE-aligned chunks assert(length > 0 && offset + length <= m_size); uint64_t shift; compute_index(offset, byte_offset, &shift); - *byte_offset -= (*byte_offset % CEPH_PAGE_SIZE); + *byte_offset -= (*byte_offset % BLOCK_SIZE); uint64_t end_offset; compute_index(offset + length - 1, &end_offset, &shift); - end_offset += (CEPH_PAGE_SIZE - (end_offset % CEPH_PAGE_SIZE)); + end_offset += (BLOCK_SIZE - (end_offset % BLOCK_SIZE)); assert(*byte_offset <= end_offset); *byte_length = end_offset - *byte_offset; @@ -292,7 +296,7 @@ void BitVector<_b>::decode_footer(bufferlist::iterator& it) { throw buffer::malformed_input("incorrect header CRC"); } - uint64_t block_count = (m_data.length() + CEPH_PAGE_SIZE - 1) / CEPH_PAGE_SIZE; + uint64_t block_count = (m_data.length() + BLOCK_SIZE - 1) / BLOCK_SIZE; ::decode(m_data_crcs, footer_it); if (m_data_crcs.size() != block_count) { throw buffer::malformed_input("invalid data block CRCs"); diff --git a/ceph/src/common/ceph_context.cc b/ceph/src/common/ceph_context.cc index dd1f4881..4fc7eeef 100644 --- a/ceph/src/common/ceph_context.cc +++ b/ceph/src/common/ceph_context.cc @@ -448,8 +448,8 @@ CephContext::CephContext(uint32_t module_type_) _admin_socket->register_command("log dump", "log dump", _admin_hook, "dump recent log entries to log file"); _admin_socket->register_command("log reopen", "log reopen", _admin_hook, "reopen log file"); - _crypto_none = new CryptoNone; - _crypto_aes = new CryptoAES; + _crypto_none = CryptoHandler::create(CEPH_CRYPTO_NONE); + _crypto_aes = CryptoHandler::create(CEPH_CRYPTO_AES); } CephContext::~CephContext() diff --git a/ceph/src/common/ceph_context.h b/ceph/src/common/ceph_context.h index 1fc26687..fd321c49 100644 --- a/ceph/src/common/ceph_context.h +++ b/ceph/src/common/ceph_context.h @@ -34,8 +34,6 @@ class md_config_obs_t; 
struct md_config_t; class CephContextHook; class CephContextObs; -class CryptoNone; -class CryptoAES; class CryptoHandler; namespace ceph { @@ -186,8 +184,8 @@ private: std::map _associated_objs; // crypto - CryptoNone *_crypto_none; - CryptoAES *_crypto_aes; + CryptoHandler *_crypto_none; + CryptoHandler *_crypto_aes; // experimental CephContextObs *_cct_obs; diff --git a/ceph/src/common/config_opts.h b/ceph/src/common/config_opts.h index fd53899a..c55694e8 100644 --- a/ceph/src/common/config_opts.h +++ b/ceph/src/common/config_opts.h @@ -204,6 +204,7 @@ OPTION(mon_clock_drift_allowed, OPT_FLOAT, .050) // allowed clock drift between OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for clock drift warnings OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds) OPTION(mon_accept_timeout, OPT_FLOAT, 10.0) // on leader, if paxos update isn't accepted +OPTION(mon_timecheck_skew_interval, OPT_FLOAT, 30.0) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds) OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info) OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30) // min # pgs per (in) osd before we warn the admin @@ -235,6 +236,8 @@ OPTION(mon_daemon_bytes, OPT_U64, 400ul << 20) // mds, osd message memory cap ( OPTION(mon_max_log_entries_per_event, OPT_INT, 4096) OPTION(mon_reweight_min_pgs_per_osd, OPT_U64, 10) // min pgs per osd for reweight-by-pg command OPTION(mon_reweight_min_bytes_per_osd, OPT_U64, 100*1024*1024) // min bytes per osd for reweight-by-utilization command +OPTION(mon_reweight_max_osds, OPT_INT, 4) // max osds to change per reweight-by-* command +OPTION(mon_reweight_max_change, OPT_DOUBLE, 0.05) OPTION(mon_health_data_update_interval, OPT_FLOAT, 
60.0) OPTION(mon_health_to_clog, OPT_BOOL, true) OPTION(mon_health_to_clog_interval, OPT_INT, 3600) @@ -497,6 +500,7 @@ OPTION(osd_client_message_cap, OPT_U64, 100) // num client messages OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host +OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it. OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET) OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes @@ -1045,6 +1049,8 @@ OPTION(rgw_replica_log_obj_prefix, OPT_STR, "replica_log") // OPTION(rgw_bucket_quota_ttl, OPT_INT, 600) // time for cached bucket stats to be cached within rgw instance OPTION(rgw_bucket_quota_soft_threshold, OPT_DOUBLE, 0.95) // threshold from which we don't rely on cached info for quota decisions OPTION(rgw_bucket_quota_cache_size, OPT_INT, 10000) // number of entries in bucket quota cache +OPTION(rgw_bucket_default_quota_max_objects, OPT_INT, -1) // number of objects allowed +OPTION(rgw_bucket_default_quota_max_size, OPT_LONGLONG, -1) // Max size of object in kB OPTION(rgw_expose_bucket, OPT_BOOL, false) // Return the bucket name in the 'Bucket' response header @@ -1054,6 +1060,8 @@ OPTION(rgw_user_quota_bucket_sync_interval, OPT_INT, 180) // time period for acc OPTION(rgw_user_quota_sync_interval, OPT_INT, 3600 * 24) // time period for accumulating modified buckets before syncing entire user stats OPTION(rgw_user_quota_sync_idle_users, OPT_BOOL, false) // whether stats for idle users be fully synced OPTION(rgw_user_quota_sync_wait_time, OPT_INT, 3600 * 24) // min time between two full stats sync for non-idle users +OPTION(rgw_user_default_quota_max_objects, OPT_INT, -1) // 
number of objects allowed +OPTION(rgw_user_default_quota_max_size, OPT_LONGLONG, -1) // Max size of object in kB OPTION(rgw_multipart_min_part_size, OPT_INT, 5 * 1024 * 1024) // min size for each part (except for last one) in multipart upload diff --git a/ceph/src/common/hobject.cc b/ceph/src/common/hobject.cc index 866c9928..c3a1694a 100644 --- a/ceph/src/common/hobject.cc +++ b/ceph/src/common/hobject.cc @@ -195,11 +195,13 @@ ostream& operator<<(ostream& out, const hobject_t& o) { if (o.is_max()) return out << "MAX"; + out << o.pool << '/'; out << std::hex << o.get_hash() << std::dec; + if (o.nspace.length()) + out << ":" << o.nspace; if (o.get_key().length()) out << "." << o.get_key(); out << "/" << o.oid << "/" << o.snap; - out << "/" << o.nspace << "/" << o.pool; return out; } diff --git a/ceph/src/common/obj_bencher.cc b/ceph/src/common/obj_bencher.cc index db4fd8f8..0c246f33 100644 --- a/ceph/src/common/obj_bencher.cc +++ b/ceph/src/common/obj_bencher.cc @@ -61,7 +61,7 @@ ostream& ObjBencher::out(ostream& os, utime_t& t) if (show_time) return t.localtime(os) << " "; else - return os << " "; + return os; } ostream& ObjBencher::out(ostream& os) @@ -86,19 +86,19 @@ void *ObjBencher::status_printer(void *_bencher) { if (i % 20 == 0) { if (i > 0) - cur_time.localtime(cout) << "min lat: " << data.min_latency + cur_time.localtime(cout) << " min lat: " << data.min_latency << " max lat: " << data.max_latency << " avg lat: " << data.avg_latency << std::endl; //I'm naughty and don't reset the fill bencher->out(cout, cur_time) << setfill(' ') - << setw(5) << "sec" - << setw(8) << "Cur ops" - << setw(10) << "started" - << setw(10) << "finished" - << setw(10) << "avg MB/s" - << setw(10) << "cur MB/s" - << setw(10) << "last lat" - << setw(10) << "avg lat" << std::endl; + << setw(5) << "sec" + << setw(8) << "Cur ops" + << setw(10) << "started" + << setw(10) << "finished" + << setw(10) << "avg MB/s" + << setw(10) << "cur MB/s" + << setw(12) << "last lat(s)" + << setw(12) << 
"avg lat(s)" << std::endl; } if (cycleSinceChange) bandwidth = (double)(data.finished - previous_writes) @@ -122,26 +122,28 @@ void *ObjBencher::status_printer(void *_bencher) { if (previous_writes != data.finished) { previous_writes = data.finished; cycleSinceChange = 0; - bencher->out(cout, cur_time) << setfill(' ') - << setw(5) << i - << setw(8) << data.in_flight - << setw(10) << data.started - << setw(10) << data.finished - << setw(10) << avg_bandwidth - << setw(10) << bandwidth - << setw(10) << (double)data.cur_latency - << setw(10) << data.avg_latency << std::endl; + bencher->out(cout, cur_time) + << setfill(' ') + << setw(5) << i + << ' ' << setw(7) << data.in_flight + << ' ' << setw(9) << data.started + << ' ' << setw(9) << data.finished + << ' ' << setw(9) << avg_bandwidth + << ' ' << setw(9) << bandwidth + << ' ' << setw(11) << (double)data.cur_latency + << ' ' << setw(11) << data.avg_latency << std::endl; } else { - bencher->out(cout, cur_time) << setfill(' ') - << setw(5) << i - << setw(8) << data.in_flight - << setw(10) << data.started - << setw(10) << data.finished - << setw(10) << avg_bandwidth - << setw(10) << '0' - << setw(10) << '-' - << setw(10) << data.avg_latency << std::endl; + bencher->out(cout, cur_time) + << setfill(' ') + << setw(5) << i + << ' ' << setw(7) << data.in_flight + << ' ' << setw(9) << data.started + << ' ' << setw(9) << data.finished + << ' ' << setw(9) << avg_bandwidth + << ' ' << setw(9) << '0' + << ' ' << setw(11) << '-' + << ' '<< setw(11) << data.avg_latency << std::endl; } ++i; ++cycleSinceChange; @@ -453,20 +455,19 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate, double bandwidth; bandwidth = ((double)data.finished)*((double)data.object_size)/(double)timePassed; bandwidth = bandwidth/(1024*1024); // we want it in MB/sec - char bw[20]; - snprintf(bw, sizeof(bw), "%.3lf \n", bandwidth); out(cout) << "Total time run: " << timePassed << std::endl << "Total writes made: " << data.finished << std::endl 
<< "Write size: " << data.object_size << std::endl - << "Bandwidth (MB/sec): " << bw << std::endl + << "Bandwidth (MB/sec): " << setprecision(6) << bandwidth << std::endl << "Stddev Bandwidth: " << vec_stddev(data.history.bandwidth) << std::endl << "Max bandwidth (MB/sec): " << data.idata.max_bandwidth << std::endl << "Min bandwidth (MB/sec): " << data.idata.min_bandwidth << std::endl - << "Average Latency: " << data.avg_latency << std::endl - << "Stddev Latency: " << vec_stddev(data.history.latency) << std::endl - << "Max latency: " << data.max_latency << std::endl - << "Min latency: " << data.min_latency << std::endl; + << "Average IOPS: " << (int)(data.finished/timePassed) << std::endl + << "Average Latency(s): " << data.avg_latency << std::endl + << "Stddev Latency(s): " << vec_stddev(data.history.latency) << std::endl + << "Max latency(s): " << data.max_latency << std::endl + << "Min latency(s): " << data.min_latency << std::endl; //write object size/number data for read benchmarks ::encode(data.object_size, b_write); @@ -598,13 +599,13 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre index[slot] = data.started; lock.Unlock(); completion_wait(slot); + lock.Lock(); r = completion_ret(slot); if (r < 0) { cerr << "read got " << r << std::endl; lock.Unlock(); goto ERR; } - lock.Lock(); total_latency += data.cur_latency; if (data.cur_latency > data.max_latency) data.max_latency = data.cur_latency; if (data.cur_latency < data.min_latency) data.min_latency = data.cur_latency; @@ -624,14 +625,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre lock.Lock(); ++data.started; ++data.in_flight; - lock.Unlock(); - if (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0) { - cerr << name[slot] << " is not correct!" 
<< std::endl; - ++errors; - } else { - lock.Unlock(); - } - + lock.Unlock(); name[slot] = newName; } @@ -677,20 +671,19 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre double bandwidth; bandwidth = ((double)data.finished)*((double)data.object_size)/(double)runtime; bandwidth = bandwidth/(1024*1024); // we want it in MB/sec - char bw[20]; - snprintf(bw, sizeof(bw), "%.3lf \n", bandwidth); - out(cout) << "Total time run: " << runtime << std::endl + out(cout) << "Total time run: " << runtime << std::endl << "Total reads made: " << data.finished << std::endl << "Read size: " << data.object_size << std::endl - << "Bandwidth (MB/sec): " << bw << std::endl - << "Average Latency: " << data.avg_latency << std::endl - << "Max latency: " << data.max_latency << std::endl - << "Min latency: " << data.min_latency << std::endl; + << "Bandwidth (MB/sec): " << setprecision(6) << bandwidth << std::endl + << "Average IOPS: " << (int)(data.finished/runtime) << std::endl + << "Average Latency(s): " << data.avg_latency << std::endl + << "Max latency(s): " << data.max_latency << std::endl + << "Min latency(s): " << data.min_latency << std::endl; completions_done(); - return 0; + return (errors > 0 ? 
-EIO : 0); ERR: lock.Lock(); @@ -885,20 +878,19 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr double bandwidth; bandwidth = ((double)data.finished)*((double)data.object_size)/(double)runtime; bandwidth = bandwidth/(1024*1024); // we want it in MB/sec - char bw[20]; - snprintf(bw, sizeof(bw), "%.3lf \n", bandwidth); - out(cout) << "Total time run: " << runtime << std::endl + out(cout) << "Total time run: " << runtime << std::endl << "Total reads made: " << data.finished << std::endl << "Read size: " << data.object_size << std::endl - << "Bandwidth (MB/sec): " << bw << std::endl - << "Average Latency: " << data.avg_latency << std::endl - << "Max latency: " << data.max_latency << std::endl - << "Min latency: " << data.min_latency << std::endl; + << "Bandwidth (MB/sec): " << setprecision(6) << bandwidth << std::endl + << "Average IOPS: " << (int)(data.finished/runtime) << std::endl + << "Average Latency(s): " << data.avg_latency << std::endl + << "Max latency(s): " << data.max_latency << std::endl + << "Min latency(s): " << data.min_latency << std::endl; completions_done(); - return 0; + return (errors > 0 ? 
-EIO : 0); ERR: lock.Lock(); diff --git a/ceph/src/common/strtol.cc b/ceph/src/common/strtol.cc index 8a43eb56..e3d2df7d 100644 --- a/ceph/src/common/strtol.cc +++ b/ceph/src/common/strtol.cc @@ -14,10 +14,10 @@ #include "strtol.h" -#include -#include +#include +#include +#include #include -#include using std::ostringstream; @@ -126,14 +126,15 @@ float strict_strtof(const char *str, std::string *err) return ret; } -uint64_t strict_sistrtoll(const char *str, std::string *err) +template +T strict_si_cast(const char *str, std::string *err) { std::string s(str); if (s.empty()) { *err = "strict_sistrtoll: value not specified"; return 0; } - const char &u = s.at(s.size()-1); //str[std::strlen(str)-1]; + const char &u = *s.rbegin(); int m = 0; if (u == 'B') m = 0; @@ -152,30 +153,35 @@ uint64_t strict_sistrtoll(const char *str, std::string *err) else m = -1; - const char *v = NULL; if (m >= 0) - s = std::string(str, s.size()-1); - v = s.c_str(); - - long long r_ll = strict_strtoll(v, 10, err); + s.erase(s.size()-1); + else + m = 0; - if (r_ll < 0) { + long long ll = strict_strtoll(s.c_str(), 10, err); + if (ll < 0 && !std::numeric_limits::is_signed) { *err = "strict_sistrtoll: value should not be negative"; return 0; } + if (ll < (long long)std::numeric_limits::min() >> m) { + *err = "strict_sistrtoll: value seems to be too small"; + return 0; + } + if (ll > std::numeric_limits::max() >> m) { + *err = "strict_sistrtoll: value seems to be too large"; + return 0; - uint64_t r = r_ll; - if (err->empty() && m > 0) { - if (r > (std::numeric_limits::max() >> m)) { - *err = "strict_sistrtoll: value seems to be too large"; - return 0; - } - r <<= m; } - return r; + return (ll << m); } -template <> -uint64_t strict_si_cast(const char *str, std::string *err) { - return strict_sistrtoll(str, err); +template int strict_si_cast(const char *str, std::string *err); + +template long long strict_si_cast(const char *str, std::string *err); + +template uint64_t strict_si_cast(const char 
*str, std::string *err); + +uint64_t strict_sistrtoll(const char *str, std::string *err) +{ + return strict_si_cast(str, err); } diff --git a/ceph/src/common/strtol.h b/ceph/src/common/strtol.h index 5575ed7b..ed865684 100644 --- a/ceph/src/common/strtol.h +++ b/ceph/src/common/strtol.h @@ -31,21 +31,7 @@ float strict_strtof(const char *str, std::string *err); uint64_t strict_sistrtoll(const char *str, std::string *err); -template -Target strict_si_cast(const char *str, std::string *err) { - uint64_t ret = strict_sistrtoll(str, err); - if (!err->empty()) - return ret; - if (ret > (uint64_t)std::numeric_limits::max()) { - err->append("The option value '"); - err->append(str); - err->append("' seems to be too large"); - return 0; - } - return ret; -} - -template <> -uint64_t strict_si_cast(const char *str, std::string *err); +template +T strict_si_cast(const char *str, std::string *err); #endif diff --git a/ceph/src/global/global_init.cc b/ceph/src/global/global_init.cc index 3464b0af..1c34d447 100644 --- a/ceph/src/global/global_init.cc +++ b/ceph/src/global/global_init.cc @@ -151,23 +151,16 @@ void global_print_banner(void) output_ceph_version(); } -static void pidfile_remove_void(void) -{ - pidfile_remove(); -} - -int global_init_prefork(CephContext *cct, int flags) +int global_init_prefork(CephContext *cct, int) { if (g_code_env != CODE_ENVIRONMENT_DAEMON) return -1; + const md_config_t *conf = cct->_conf; if (!conf->daemonize) { - if (atexit(pidfile_remove_void)) { - derr << "global_init_daemonize: failed to set pidfile_remove function " - << "to run at exit." 
<< dendl; - } - pidfile_write(g_conf); + if (pidfile_write(g_conf) < 0) + exit(1); return -1; } @@ -190,7 +183,7 @@ void global_init_daemonize(CephContext *cct, int flags) << cpp_strerror(ret) << dendl; exit(1); } - + global_init_postfork_start(cct); global_init_postfork_finish(cct, flags); } @@ -200,11 +193,6 @@ void global_init_postfork_start(CephContext *cct) // restart log thread g_ceph_context->_log->start(); - if (atexit(pidfile_remove_void)) { - derr << "global_init_daemonize: failed to set pidfile_remove function " - << "to run at exit." << dendl; - } - /* This is the old trick where we make file descriptors 0, 1, and possibly 2 * point to /dev/null. * @@ -228,7 +216,8 @@ void global_init_postfork_start(CephContext *cct) exit(1); } - pidfile_write(g_conf); + if (pidfile_write(g_conf) < 0) + exit(1); } void global_init_postfork_finish(CephContext *cct, int flags) diff --git a/ceph/src/global/pidfile.cc b/ceph/src/global/pidfile.cc index 3b8962a0..f97999f4 100644 --- a/ceph/src/global/pidfile.cc +++ b/ceph/src/global/pidfile.cc @@ -29,70 +29,203 @@ #include "include/compat.h" +// +// derr can be used for functions exclusively called from pidfile_write +// +// cerr must be used for functions called by pidfile_remove because +// logging is not functional when it is called. 
cerr output is lost +// when the caller is daemonized but it will show if not (-f) +// #define dout_prefix *_dout -static char pid_file[PATH_MAX] = ""; +struct pidfh { + int pf_fd; + char pf_path[PATH_MAX + 1]; + dev_t pf_dev; + ino_t pf_ino; -int pidfile_write(const md_config_t *conf) -{ - int ret, fd; - - if (conf->pid_file.empty()) { - return pidfile_remove(); + pidfh() { + reset(); } - snprintf(pid_file, PATH_MAX, "%s", conf->pid_file.c_str()); - - fd = TEMP_FAILURE_RETRY(::open(pid_file, - O_CREAT|O_TRUNC|O_WRONLY, 0644)); - if (fd < 0) { - int err = errno; - derr << "write_pid_file: failed to open pid file '" - << pid_file << "': " << cpp_strerror(err) << dendl; - return err; + ~pidfh() { + remove(); } - char buf[20]; - int len = snprintf(buf, sizeof(buf), "%d\n", getpid()); - ret = safe_write(fd, buf, len); - if (ret < 0) { - derr << "write_pid_file: failed to write to pid file '" - << pid_file << "': " << cpp_strerror(ret) << dendl; - VOID_TEMP_FAILURE_RETRY(::close(fd)); - return ret; + bool is_open() { + return pf_path[0] != '\0' && pf_fd != -1; } - if (TEMP_FAILURE_RETRY(::close(fd))) { - ret = errno; - derr << "SimpleMessenger::write_pid_file: failed to close to pid file '" - << pid_file << "': " << cpp_strerror(ret) << dendl; - return -ret; + void reset() { + pf_fd = -1; + memset(pf_path, 0, sizeof(pf_path)); + pf_dev = 0; + pf_ino = 0; } + int verify(); + int remove(); + int open(const md_config_t *conf); + int write(); +}; + +static pidfh *pfh = NULL; +int pidfh::verify() { + // check that the file we opened still is the same + if (pf_fd == -1) + return -EINVAL; + struct stat st; + if (stat(pf_path, &st) == -1) + return -errno; + if (st.st_dev != pf_dev || st.st_ino != pf_ino) + return -ESTALE; return 0; } -int pidfile_remove(void) +int pidfh::remove() { - if (!pid_file[0]) + if (!pf_path[0]) return 0; - // only remove it if it has OUR pid in it! 
- int fd = TEMP_FAILURE_RETRY(::open(pid_file, O_RDONLY)); - if (fd < 0) + int ret; + if ((ret = verify()) < 0) { + if (pf_fd != -1) { + ::close(pf_fd); + reset(); + } + return ret; + } + + // seek to the beginning of the file before reading + ret = ::lseek(pf_fd, 0, SEEK_SET); + if (ret < 0) { + std::cerr << __func__ << " lseek failed " + << cpp_strerror(errno) << std::endl; return -errno; + } + + // check that the pid file still has our pid in it char buf[32]; memset(buf, 0, sizeof(buf)); - ssize_t res = safe_read(fd, buf, sizeof(buf)); - VOID_TEMP_FAILURE_RETRY(::close(fd)); - if (res < 0) + ssize_t res = safe_read(pf_fd, buf, sizeof(buf)); + ::close(pf_fd); + if (res < 0) { + std::cerr << __func__ << " safe_read failed " + << cpp_strerror(-res) << std::endl; return res; + } + int a = atoi(buf); - if (a != getpid()) + if (a != getpid()) { + std::cerr << __func__ << " the pid found in the file is " + << a << " which is different from getpid() " + << getpid() << std::endl; return -EDOM; + } + ret = ::unlink(pf_path); + if (ret < 0) { + std::cerr << __func__ << " unlink " << pf_path << " failed " + << cpp_strerror(errno) << std::endl; + return -errno; + } + reset(); + return 0; +} + +int pidfh::open(const md_config_t *conf) +{ + int len = snprintf(pf_path, sizeof(pf_path), + "%s", conf->pid_file.c_str()); + + if (len >= (int)sizeof(pf_path)) + return -ENAMETOOLONG; + + int fd; + fd = ::open(pf_path, O_CREAT|O_RDWR, 0644); + if (fd < 0) { + int err = errno; + derr << __func__ << ": failed to open pid file '" + << pf_path << "': " << cpp_strerror(err) << dendl; + reset(); + return -err; + } + struct stat st; + if (fstat(fd, &st) == -1) { + int err = errno; + derr << __func__ << ": failed to fstat pid file '" + << pf_path << "': " << cpp_strerror(err) << dendl; + ::close(fd); + reset(); + return -err; + } + + pf_fd = fd; + pf_dev = st.st_dev; + pf_ino = st.st_ino; - res = ::unlink(pid_file); - if (res) + struct flock l = { F_WRLCK, SEEK_SET, 0, 0, 0 }; + int r = 
::fcntl(pf_fd, F_SETLK, &l); + if (r < 0) { + derr << __func__ << ": failed to lock pidfile " + << pf_path << " because another process locked it." << dendl; + ::close(pf_fd); + reset(); + return -errno; + } + return 0; +} + +int pidfh::write() +{ + if (!is_open()) + return 0; + + char buf[32]; + int len = snprintf(buf, sizeof(buf), "%d\n", getpid()); + if (::ftruncate(pf_fd, 0) < 0) { + int err = errno; + derr << __func__ << ": failed to ftruncate the pid file '" + << pf_path << "': " << cpp_strerror(err) << dendl; + return err; + } + ssize_t res = safe_write(pf_fd, buf, len); + if (res < 0) { + derr << __func__ << ": failed to write to pid file '" + << pf_path << "': " << cpp_strerror(-res) << dendl; return res; + } + return 0; +} + +void pidfile_remove() +{ + delete pfh; + pfh = NULL; +} + +int pidfile_write(const md_config_t *conf) +{ + if (conf->pid_file.empty()) + return 0; + + assert(!pfh); + + pfh = new pidfh(); + if (atexit(pidfile_remove)) { + derr << __func__ << ": failed to set pidfile_remove function " + << "to run at exit." << dendl; + return -EINVAL; + } + + int r = pfh->open(conf); + if (r != 0) { + pidfile_remove(); + return r; + } + + r = pfh->write(); + if (r != 0) { + pidfile_remove(); + return r; + } - pid_file[0] = '\0'; return 0; } diff --git a/ceph/src/global/pidfile.h b/ceph/src/global/pidfile.h index 6b60a5a1..e7e2b0d4 100644 --- a/ceph/src/global/pidfile.h +++ b/ceph/src/global/pidfile.h @@ -23,6 +23,6 @@ int pidfile_write(const md_config_t *conf); // Remove the pid file that was previously written by pidfile_write. // This is safe to call in a signal handler context. 
-int pidfile_remove(void); +void pidfile_remove(); #endif diff --git a/ceph/src/include/CompatSet.h b/ceph/src/include/CompatSet.h index 03bf54d5..80abdfbf 100644 --- a/ceph/src/include/CompatSet.h +++ b/ceph/src/include/CompatSet.h @@ -103,7 +103,7 @@ struct CompatSet { for (map::const_iterator p = names.begin(); p != names.end(); ++p) { - char s[10]; + char s[18]; snprintf(s, sizeof(s), "feature_%lld", (unsigned long long)p->first); f->dump_string(s, p->second); } diff --git a/ceph/src/include/ceph_features.h b/ceph/src/include/ceph_features.h index 781df1b3..205e18fb 100644 --- a/ceph/src/include/ceph_features.h +++ b/ceph/src/include/ceph_features.h @@ -64,6 +64,7 @@ // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ #define CEPH_FEATURE_MON_METADATA (1ULL<<50) +#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54) /* ... */ #define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55) @@ -151,6 +152,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) { CEPH_FEATURE_MDS_QUOTA | \ CEPH_FEATURE_CRUSH_V4 | \ CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY | \ + CEPH_FEATURE_OSD_HITSET_GMT | \ CEPH_FEATURE_HAMMER_0_94_4 | \ 0ULL) diff --git a/ceph/src/init-ceph.in b/ceph/src/init-ceph.in index f7acccaf..e4cea7ef 100644 --- a/ceph/src/init-ceph.in +++ b/ceph/src/init-ceph.in @@ -12,7 +12,10 @@ # Description: Enable Ceph distributed file system services. ### END INIT INFO -. /lib/lsb/init-functions +# TODO: on FreeBSD/OSX, use equivalent script file +if [ -e /lib/lsb/init-functions ]; then + . 
/lib/lsb/init-functions +fi # detect systemd, also check whether the systemd-run binary exists SYSTEMD_RUN=$(which systemd-run 2>/dev/null) diff --git a/ceph/src/librbd/AioCompletion.cc b/ceph/src/librbd/AioCompletion.cc index 6222531b..efbc8494 100644 --- a/ceph/src/librbd/AioCompletion.cc +++ b/ceph/src/librbd/AioCompletion.cc @@ -177,9 +177,10 @@ namespace librbd { void C_CacheRead::complete(int r) { if (!m_enqueued) { // cache_lock creates a lock ordering issue -- so re-execute this context - // outside the cache_lock + // outside the cache_lock. use the writeback handler's dedicated thread + // to avoid blocking a dependent operation m_enqueued = true; - m_image_ctx.op_work_queue->queue(this, r); + m_image_ctx.writeback_handler->queue(this, r); return; } Context::complete(r); diff --git a/ceph/src/librbd/AsyncOperation.cc b/ceph/src/librbd/AsyncOperation.cc index dfb1e61a..2402b487 100644 --- a/ceph/src/librbd/AsyncOperation.cc +++ b/ceph/src/librbd/AsyncOperation.cc @@ -3,6 +3,7 @@ #include "librbd/AsyncOperation.h" #include "librbd/ImageCtx.h" #include "common/dout.h" +#include "common/WorkQueue.h" #include "include/assert.h" #define dout_subsys ceph_subsys_rbd diff --git a/ceph/src/librbd/ImageCtx.cc b/ceph/src/librbd/ImageCtx.cc index 1574c568..3a47565e 100644 --- a/ceph/src/librbd/ImageCtx.cc +++ b/ceph/src/librbd/ImageCtx.cc @@ -48,6 +48,89 @@ public: } }; +struct C_FlushCache : public Context { + ImageCtx *image_ctx; + Context *on_safe; + + C_FlushCache(ImageCtx *_image_ctx, Context *_on_safe) + : image_ctx(_image_ctx), on_safe(_on_safe) { + } + virtual void finish(int r) { + // successful cache flush indicates all IO is now safe + RWLock::RLocker owner_locker(image_ctx->owner_lock); + image_ctx->flush_cache(on_safe); + } +}; + +struct C_InvalidateCache : public Context { + ImageCtx *image_ctx; + bool purge_on_error; + bool reentrant_safe; + Context *on_finish; + + C_InvalidateCache(ImageCtx *_image_ctx, bool _purge_on_error, + bool _reentrant_safe, 
Context *_on_finish) + : image_ctx(_image_ctx), purge_on_error(_purge_on_error), + reentrant_safe(_reentrant_safe), on_finish(_on_finish) { + } + virtual void finish(int r) { + assert(image_ctx->cache_lock.is_locked()); + CephContext *cct = image_ctx->cct; + + if (r == -EBLACKLISTED) { + lderr(cct) << "Blacklisted during flush! Purging cache..." << dendl; + image_ctx->object_cacher->purge_set(image_ctx->object_set); + } else if (r != 0 && purge_on_error) { + lderr(cct) << "invalidate cache encountered error " + << cpp_strerror(r) << " !Purging cache..." << dendl; + image_ctx->object_cacher->purge_set(image_ctx->object_set); + } else if (r != 0) { + lderr(cct) << "flush_cache returned " << r << dendl; + } + + loff_t unclean = image_ctx->object_cacher->release_set( + image_ctx->object_set); + if (unclean == 0) { + r = 0; + } else { + lderr(cct) << "could not release all objects from cache: " + << unclean << " bytes remain" << dendl; + r = -EBUSY; + } + + if (reentrant_safe) { + on_finish->complete(r); + } else { + image_ctx->op_work_queue->queue(on_finish, r); + } + } + +}; + +struct C_AsyncCallback : public Context { + ImageCtx *image_ctx; + Context *on_finish; + C_AsyncCallback(ImageCtx *image_ctx, Context *on_finish) + : image_ctx(image_ctx), on_finish(on_finish) { + } + virtual void finish(int r) { + image_ctx->op_work_queue->queue(on_finish, r); + } +}; + +void _flush_async_operations(ImageCtx *ictx, Context *on_finish) { + { + Mutex::Locker async_ops_locker(ictx->async_ops_lock); + if (!ictx->async_ops.empty()) { + ldout(ictx->cct, 20) << "flush async operations: " << on_finish << " " + << "count=" << ictx->async_ops.size() << dendl; + ictx->async_ops.front()->add_flush_context(on_finish); + return; + } + } + on_finish->complete(0); +} + } // anonymous namespace ImageCtx::ImageCtx(const string &image_name, const string &image_id, @@ -163,6 +246,11 @@ public: } delete[] format_string; + md_ctx.aio_flush(); + data_ctx.aio_flush(); + op_work_queue->drain(); + 
aio_work_queue->drain(); + delete op_work_queue; delete aio_work_queue; } @@ -655,53 +743,47 @@ public: } } - void ImageCtx::flush_cache_aio(Context *onfinish) { + int ImageCtx::flush_cache() { + C_SaferCond cond_ctx; + flush_cache(&cond_ctx); + + ldout(cct, 20) << "waiting for cache to be flushed" << dendl; + int r = cond_ctx.wait(); + ldout(cct, 20) << "finished flushing cache" << dendl; + + return r; + } + + void ImageCtx::flush_cache(Context *onfinish) { assert(owner_lock.is_locked()); cache_lock.Lock(); object_cacher->flush_set(object_set, onfinish); cache_lock.Unlock(); } - int ImageCtx::flush_cache() { - int r = 0; - Mutex mylock("librbd::ImageCtx::flush_cache"); - Cond cond; - bool done; - Context *onfinish = new C_SafeCond(&mylock, &cond, &done, &r); - flush_cache_aio(onfinish); - mylock.Lock(); - while (!done) { - ldout(cct, 20) << "waiting for cache to be flushed" << dendl; - cond.Wait(mylock); - } - mylock.Unlock(); - ldout(cct, 20) << "finished flushing cache" << dendl; - return r; - } - - void ImageCtx::shutdown_cache() { + int ImageCtx::shutdown_cache() { flush_async_operations(); RWLock::RLocker owner_locker(owner_lock); - invalidate_cache(true); + int r = invalidate_cache(true); object_cacher->stop(); + return r; } int ImageCtx::invalidate_cache(bool purge_on_error) { - int result; - C_SaferCond ctx; - invalidate_cache(&ctx); - result = ctx.wait(); - - if (result && purge_on_error) { - cache_lock.Lock(); - if (object_cacher != NULL) { - lderr(cct) << "invalidate cache met error " << cpp_strerror(result) << " !Purging cache..." 
<< dendl; - object_cacher->purge_set(object_set); - } - cache_lock.Unlock(); + flush_async_operations(); + if (object_cacher == NULL) { + return 0; } + cache_lock.Lock(); + object_cacher->release_set(object_set); + cache_lock.Unlock(); + + C_SaferCond ctx; + flush_cache(new C_InvalidateCache(this, purge_on_error, true, &ctx)); + + int result = ctx.wait(); return result; } @@ -715,29 +797,7 @@ public: object_cacher->release_set(object_set); cache_lock.Unlock(); - flush_cache_aio(new FunctionContext(boost::bind( - &ImageCtx::invalidate_cache_completion, this, _1, on_finish))); - } - - void ImageCtx::invalidate_cache_completion(int r, Context *on_finish) { - assert(cache_lock.is_locked()); - if (r == -EBLACKLISTED) { - lderr(cct) << "Blacklisted during flush! Purging cache..." << dendl; - object_cacher->purge_set(object_set); - } else if (r != 0) { - lderr(cct) << "flush_cache returned " << r << dendl; - } - - loff_t unclean = object_cacher->release_set(object_set); - if (unclean == 0) { - r = 0; - } else { - lderr(cct) << "could not release all objects from cache: " - << unclean << " bytes remain" << dendl; - r = -EBUSY; - } - - op_work_queue->queue(on_finish, r); + flush_cache(new C_InvalidateCache(this, false, false, on_finish)); } void ImageCtx::clear_nonexistence_cache() { @@ -800,20 +860,35 @@ public: void ImageCtx::flush_async_operations() { C_SaferCond ctx; - flush_async_operations(&ctx); + _flush_async_operations(this, &ctx); ctx.wait(); } void ImageCtx::flush_async_operations(Context *on_finish) { - Mutex::Locker l(async_ops_lock); - if (async_ops.empty()) { - on_finish->complete(0); - return; + // complete context in clean thread context + _flush_async_operations(this, new C_AsyncCallback(this, on_finish)); + } + + int ImageCtx::flush() { + assert(owner_lock.is_locked()); + + flush_async_operations(); + if (object_cacher != NULL) { + int r = flush_cache(); + if (r < 0) { + return r; + } } + return 0; + } - ldout(cct, 20) << "flush async operations: " << 
on_finish << " " - << "count=" << async_ops.size() << dendl; - async_ops.front()->add_flush_context(on_finish); + void ImageCtx::flush(Context *on_safe) { + assert(owner_lock.is_locked()); + if (object_cacher != NULL) { + // flush cache after completing all in-flight AIO ops + on_safe = new C_FlushCache(this, on_safe); + } + flush_async_operations(on_safe); } void ImageCtx::cancel_async_requests() { diff --git a/ceph/src/librbd/ImageCtx.h b/ceph/src/librbd/ImageCtx.h index 238b0ab6..5fa0ee6c 100644 --- a/ceph/src/librbd/ImageCtx.h +++ b/ceph/src/librbd/ImageCtx.h @@ -192,12 +192,11 @@ namespace librbd { void write_to_cache(object_t o, const bufferlist& bl, size_t len, uint64_t off, Context *onfinish, int fadvise_flags); void user_flushed(); - void flush_cache_aio(Context *onfinish); int flush_cache(); - void shutdown_cache(); + void flush_cache(Context *onfinish); + int shutdown_cache(); int invalidate_cache(bool purge_on_error=false); void invalidate_cache(Context *on_finish); - void invalidate_cache_completion(int r, Context *on_finish); void clear_nonexistence_cache(); int register_watch(); void unregister_watch(); @@ -209,6 +208,9 @@ namespace librbd { void flush_async_operations(); void flush_async_operations(Context *on_finish); + int flush(); + void flush(Context *on_safe); + void cancel_async_requests(); }; } diff --git a/ceph/src/librbd/ImageWatcher.cc b/ceph/src/librbd/ImageWatcher.cc index 790a0366..4c76dda8 100644 --- a/ceph/src/librbd/ImageWatcher.cc +++ b/ceph/src/librbd/ImageWatcher.cc @@ -391,7 +391,7 @@ bool ImageWatcher::release_lock() { RWLock::RLocker owner_locker(m_image_ctx.owner_lock); RWLock::WLocker md_locker(m_image_ctx.md_lock); - librbd::_flush(&m_image_ctx); + m_image_ctx.flush(); } m_image_ctx.owner_lock.get_write(); diff --git a/ceph/src/librbd/LibrbdWriteback.cc b/ceph/src/librbd/LibrbdWriteback.cc index ac778eec..ee14f001 100644 --- a/ceph/src/librbd/LibrbdWriteback.cc +++ b/ceph/src/librbd/LibrbdWriteback.cc @@ -102,6 +102,10 @@ 
namespace librbd { delete m_finisher; } + void LibrbdWriteback::queue(Context *ctx, int r) { + m_finisher->queue(ctx, r); + } + void LibrbdWriteback::read(const object_t& oid, uint64_t object_no, const object_locator_t& oloc, uint64_t off, uint64_t len, snapid_t snapid, @@ -114,7 +118,7 @@ namespace librbd { { if (!m_ictx->object_map.object_may_exist(object_no)) { - m_finisher->queue(req, -ENOENT); + queue(req, -ENOENT); return; } } diff --git a/ceph/src/librbd/LibrbdWriteback.h b/ceph/src/librbd/LibrbdWriteback.h index b5578ae6..5b65504c 100644 --- a/ceph/src/librbd/LibrbdWriteback.h +++ b/ceph/src/librbd/LibrbdWriteback.h @@ -23,6 +23,8 @@ namespace librbd { LibrbdWriteback(ImageCtx *ictx, Mutex& lock); virtual ~LibrbdWriteback(); + void queue(Context *ctx, int r); + // Note that oloc, trunc_size, and trunc_seq are ignored virtual void read(const object_t& oid, uint64_t object_no, const object_locator_t& oloc, uint64_t off, uint64_t len, diff --git a/ceph/src/librbd/internal.cc b/ceph/src/librbd/internal.cc index b4b4d9e4..f4b110ee 100644 --- a/ceph/src/librbd/internal.cc +++ b/ceph/src/librbd/internal.cc @@ -413,6 +413,48 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) { rados_completion->release(); } + int rollback_parent(ImageCtx *ictx, uint64_t snap_id) + { + assert(ictx); + assert(ictx->parent_lock.is_locked()); + assert(ictx->snap_lock.is_locked()); + + CephContext *cct = ictx->cct; + int r = 0; + std::map::const_iterator it = ictx->snap_info.find(snap_id); + if (it == ictx->snap_info.end()) { + ldout(cct, 10) << __func__ << ": no such snapshot: " << snap_id << dendl; + return -ENOENT; + } + const SnapInfo& snap_info(it->second); + if (ictx->parent_md == snap_info.parent) { + ldout(cct, 20) << __func__ << ": nop: head and snapshot have the same parent" << dendl; + return 0; + } + if (ictx->parent_md.spec.pool_id != -1) { + // remove the old parent link first, otherwise cls_client::set_parent + // will fail with -EEXISTS + ldout(cct, 20) << __func__ << 
": removing the old parent link" << dendl; + r = cls_client::remove_parent(&ictx->md_ctx, ictx->header_oid); + if (r < 0) { + ldout(cct, 10) << __func__ << ": failed to remove parent link: " + << cpp_strerror(r) << dendl; + return r; + } + } + if (snap_info.parent.spec.pool_id != -1) { + ldout(cct, 20) << __func__ << ": updating the parent link" << dendl; + r = cls_client::set_parent(&ictx->md_ctx, ictx->header_oid, + snap_info.parent.spec, snap_info.parent.overlap); + if (r < 0) { + ldout(cct, 10) << __func__ << ": failed to set parent link: " + << cpp_strerror(r) << dendl; + return r; + } + } + return 0; + } + int rollback_image(ImageCtx *ictx, uint64_t snap_id, ProgressContext& prog_ctx) { @@ -444,6 +486,17 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) { RWLock::WLocker l(ictx->snap_lock); ictx->object_map.rollback(snap_id); } + + { + RWLock::WLocker snap_locker(ictx->snap_lock); + RWLock::WLocker parent_locker(ictx->parent_lock); + r = rollback_parent(ictx, snap_id); + if (r < 0) { + ldout(cct, 10) << __func__ << ": failed to rollback the parent link: " + << cpp_strerror(r) << dendl; + return r; + } + } return 0; } @@ -612,7 +665,7 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) { } RWLock::WLocker md_locker(ictx->md_lock); - r = _flush(ictx); + r = ictx->flush(); if (r < 0) { return r; } @@ -2164,7 +2217,7 @@ reprotect_and_return_err: } // release snap_lock and cache_lock if (new_snap) { - _flush(ictx); + ictx->flush(); } ictx->refresh_lock.Lock(); @@ -2224,7 +2277,6 @@ reprotect_and_return_err: // writes might create new snapshots. Rolling back will replace // the current version, so we have to invalidate that too. 
RWLock::WLocker md_locker(ictx->md_lock); - ictx->flush_async_operations(); r = ictx->invalidate_cache(); if (r < 0) { return r; @@ -2446,7 +2498,7 @@ reprotect_and_return_err: // get -EROFS for writes RWLock::RLocker owner_locker(ictx->owner_lock); RWLock::WLocker md_locker(ictx->md_lock); - ictx->flush_cache(); + ictx->flush(); } int r = _snap_set(ictx, snap_name); if (r < 0) { @@ -2893,7 +2945,7 @@ reprotect_and_return_err: // ensure previous writes are visible to listsnaps { RWLock::RLocker owner_locker(ictx->owner_lock); - _flush(ictx); + ictx->flush(); } int r = ictx_check(ictx); @@ -3275,19 +3327,9 @@ reprotect_and_return_err: C_AioWrite *flush_ctx = new C_AioWrite(cct, c); c->add_request(); - ictx->flush_async_operations(flush_ctx); + ictx->flush(flush_ctx); c->init_time(ictx, AIO_TYPE_FLUSH); - C_AioWrite *req_comp = new C_AioWrite(cct, c); - c->add_request(); - if (ictx->object_cacher) { - ictx->flush_cache_aio(req_comp); - } else { - librados::AioCompletion *rados_completion = - librados::Rados::aio_create_completion(req_comp, NULL, rados_ctx_cb); - ictx->data_ctx.aio_flush_async(rados_completion); - rados_completion->release(); - } c->finish_adding_requests(cct); c->put(); ictx->perfcounter->inc(l_librbd_aio_flush); @@ -3306,31 +3348,12 @@ reprotect_and_return_err: ictx->user_flushed(); { RWLock::RLocker owner_locker(ictx->owner_lock); - r = _flush(ictx); + r = ictx->flush(); } ictx->perfcounter->inc(l_librbd_flush); return r; } - int _flush(ImageCtx *ictx) - { - assert(ictx->owner_lock.is_locked()); - CephContext *cct = ictx->cct; - int r; - // flush any outstanding writes - if (ictx->object_cacher) { - r = ictx->flush_cache(); - } else { - r = ictx->data_ctx.aio_flush(); - ictx->flush_async_operations(); - } - - if (r) - lderr(cct) << "_flush " << ictx << " r = " << r << dendl; - - return r; - } - int invalidate_cache(ImageCtx *ictx) { CephContext *cct = ictx->cct; @@ -3341,8 +3364,6 @@ reprotect_and_return_err: return r; } - 
ictx->flush_async_operations(); - RWLock::RLocker owner_locker(ictx->owner_lock); RWLock::WLocker md_locker(ictx->md_lock); r = ictx->invalidate_cache(); diff --git a/ceph/src/librbd/internal.h b/ceph/src/librbd/internal.h index a633c9d2..b0b882b4 100644 --- a/ceph/src/librbd/internal.h +++ b/ceph/src/librbd/internal.h @@ -207,7 +207,6 @@ namespace librbd { char *buf, bufferlist *pbl, AioCompletion *c, int op_flags); void aio_flush(ImageCtx *ictx, AioCompletion *c); int flush(ImageCtx *ictx); - int _flush(ImageCtx *ictx); int invalidate_cache(ImageCtx *ictx); ssize_t handle_sparse_read(CephContext *cct, diff --git a/ceph/src/librbd/parent_types.h b/ceph/src/librbd/parent_types.h index 4dcc4529..de7e6129 100644 --- a/ceph/src/librbd/parent_types.h +++ b/ceph/src/librbd/parent_types.h @@ -14,12 +14,12 @@ namespace librbd { parent_spec() : pool_id(-1), snap_id(CEPH_NOSNAP) {} parent_spec(uint64_t pool_id, string image_id, snapid_t snap_id) : pool_id(pool_id), image_id(image_id), snap_id(snap_id) {} - bool operator==(const parent_spec &other) { + bool operator==(const parent_spec &other) const { return ((this->pool_id == other.pool_id) && (this->image_id == other.image_id) && (this->snap_id == other.snap_id)); } - bool operator!=(const parent_spec &other) { + bool operator!=(const parent_spec &other) const { return !(*this == other); } }; @@ -28,6 +28,12 @@ namespace librbd { parent_spec spec; uint64_t overlap; parent_info() : overlap(0) {} + bool operator==(const parent_info &other) const { + return (spec == other.spec) && (overlap == other.overlap); + } + bool operator!=(const parent_info &other) const { + return (spec != other.spec) || (overlap != other.overlap); + } }; } diff --git a/ceph/src/log/Log.cc b/ceph/src/log/Log.cc index a3e54dfa..075f9173 100644 --- a/ceph/src/log/Log.cc +++ b/ceph/src/log/Log.cc @@ -41,6 +41,7 @@ Log::Log(SubsystemMap *s) m_flush_mutex_holder(0), m_new(), m_recent(), m_fd(-1), + m_fd_last_error(0), m_syslog_log(-2), m_syslog_crash(-2), 
m_stderr_log(1), m_stderr_crash(-1), m_stop(false), @@ -236,8 +237,13 @@ void Log::_flush(EntryQueue *t, EntryQueue *requeue, bool crash) r = safe_write(m_fd, s.data(), s.size()); if (r >= 0) r = write(m_fd, "\n", 1); - if (r < 0) - cerr << "problem writing to " << m_log_file << ": " << cpp_strerror(r) << std::endl; + if (r != m_fd_last_error) { + if (r < 0) + cerr << "problem writing to " << m_log_file + << ": " << cpp_strerror(r) + << std::endl; + m_fd_last_error = r; + } } if (do_syslog) { diff --git a/ceph/src/log/Log.h b/ceph/src/log/Log.h index 04cadd72..efa520f8 100644 --- a/ceph/src/log/Log.h +++ b/ceph/src/log/Log.h @@ -35,6 +35,8 @@ class Log : private Thread std::string m_log_file; int m_fd; + int m_fd_last_error; ///< last error we say writing to fd (if any) + int m_syslog_log, m_syslog_crash; int m_stderr_log, m_stderr_crash; diff --git a/ceph/src/logrotate.conf b/ceph/src/logrotate.conf index 9ae27bae..df31e1d7 100644 --- a/ceph/src/logrotate.conf +++ b/ceph/src/logrotate.conf @@ -23,6 +23,7 @@ done done fi + killall -q -1 ceph-fuse || true endscript missingok notifempty diff --git a/ceph/src/mds/MDCache.cc b/ceph/src/mds/MDCache.cc index f62afae4..65847ba2 100644 --- a/ceph/src/mds/MDCache.cc +++ b/ceph/src/mds/MDCache.cc @@ -9113,27 +9113,26 @@ void MDCache::truncate_stray(CDentry *dn) dout(10) << " realm " << *realm << dendl; const SnapContext *snapc = &realm->get_snap_context(); - uint64_t period = (uint64_t)in->inode.layout.fl_object_size * - (uint64_t)in->inode.layout.fl_stripe_count; uint64_t to = in->inode.get_max_size(); to = MAX(in->inode.size, to); // when truncating a file, the filer does not delete stripe objects that are // truncated to zero. so we need to purge stripe objects up to the max size // the file has ever been. 
to = MAX(in->inode.max_size_ever, to); - if (period && to > period) { - uint64_t num = (to - 1) / period; + if (to > 0) { + uint64_t num = Striper::get_num_objects(in->inode.layout, to); dout(10) << "purge_stray 0~" << to << " objects 0~" << num - << " snapc " << snapc << " on " << *in << dendl; - mds->filer->purge_range(in->ino(), &in->inode.layout, *snapc, - 1, num, ceph_clock_now(g_ceph_context), - 0, gather.new_sub()); - } + << " snapc " << snapc << " on " << *in << dendl; - // keep backtrace object - if (period && to > 0) { + // keep backtrace object + if (num > 1) { + mds->filer->purge_range(in->ino(), &in->inode.layout, *snapc, + 1, num - 1, ceph_clock_now(g_ceph_context), + 0, gather.new_sub()); + } mds->filer->zero(in->ino(), &in->inode.layout, *snapc, - 0, period, ceph_clock_now(g_ceph_context), + 0, (uint64_t)in->inode.layout.fl_object_size, + ceph_clock_now(g_ceph_context), 0, true, NULL, gather.new_sub()); } @@ -9205,16 +9204,14 @@ void MDCache::purge_stray(CDentry *dn) } if (in->is_file()) { - uint64_t period = (uint64_t)in->inode.layout.fl_object_size * - (uint64_t)in->inode.layout.fl_stripe_count; uint64_t to = in->inode.get_max_size(); to = MAX(in->inode.size, to); // when truncating a file, the filer does not delete stripe objects that are // truncated to zero. so we need to purge stripe objects up to the max size // the file has ever been. 
to = MAX(in->inode.max_size_ever, to); - if (to && period) { - uint64_t num = (to + period - 1) / period; + if (to > 0) { + uint64_t num = Striper::get_num_objects(in->inode.layout, to); dout(10) << "purge_stray 0~" << to << " objects 0~" << num << " snapc " << snapc << " on " << *in << dendl; mds->filer->purge_range(in->inode.ino, &in->inode.layout, *snapc, diff --git a/ceph/src/mds/MDSMap.cc b/ceph/src/mds/MDSMap.cc index 831e236b..7c7d1eda 100644 --- a/ceph/src/mds/MDSMap.cc +++ b/ceph/src/mds/MDSMap.cc @@ -122,7 +122,7 @@ void MDSMap::dump(Formatter *f) const f->close_section(); f->open_object_section("up"); for (map::const_iterator p = up.begin(); p != up.end(); ++p) { - char s[10]; + char s[14]; sprintf(s, "mds_%d", int(p->first)); f->dump_int(s, p->second); } diff --git a/ceph/src/mds/Server.cc b/ceph/src/mds/Server.cc index d4ed0723..526cfdb8 100644 --- a/ceph/src/mds/Server.cc +++ b/ceph/src/mds/Server.cc @@ -2732,9 +2732,13 @@ void Server::handle_client_open(MDRequestRef& mdr) return; } - // can only open a dir with mode FILE_MODE_PIN, at least for now. - if (cur->inode.is_dir()) + if (!cur->inode.is_file()) { + // can only open non-regular inode with mode FILE_MODE_PIN, at least for now. cmode = CEPH_FILE_MODE_PIN; + // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag. + if (cur->inode.is_symlink() && !(flags & O_NOFOLLOW)) + flags &= ~O_TRUNC; + } dout(10) << "open flags = " << flags << ", filemode = " << cmode @@ -2747,12 +2751,19 @@ void Server::handle_client_open(MDRequestRef& mdr) respond_to_request(mdr, -ENXIO); // FIXME what error do we want? 
return; }*/ - if ((req->head.args.open.flags & O_DIRECTORY) && !cur->inode.is_dir()) { + if ((req->head.args.open.flags & O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) { dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl; respond_to_request(mdr, -EINVAL); return; } + if ((flags & O_TRUNC) && !cur->inode.is_file()) { + dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl; + // we should return -EISDIR for directory, return -EINVAL for other non-regular + respond_to_request(mdr, cur->inode.is_dir() ? EISDIR : -EINVAL); + return; + } + if (cur->inode.inline_version != CEPH_INLINE_NONE && !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) { dout(7) << "old client cannot open inline data file " << *cur << dendl; diff --git a/ceph/src/mon/ConfigKeyService.cc b/ceph/src/mon/ConfigKeyService.cc index 97126ed0..210ff7c2 100644 --- a/ceph/src/mon/ConfigKeyService.cc +++ b/ceph/src/mon/ConfigKeyService.cc @@ -45,6 +45,11 @@ int ConfigKeyService::store_get(string key, bufferlist &bl) return mon->store->get(STORE_PREFIX, key, bl); } +void ConfigKeyService::get_store_prefixes(set& s) +{ + s.insert(STORE_PREFIX); +} + void ConfigKeyService::store_put(string key, bufferlist &bl, Context *cb) { bufferlist proposal_bl; diff --git a/ceph/src/mon/ConfigKeyService.h b/ceph/src/mon/ConfigKeyService.h index e33070b6..80b313ef 100644 --- a/ceph/src/mon/ConfigKeyService.h +++ b/ceph/src/mon/ConfigKeyService.h @@ -69,7 +69,7 @@ public: virtual string get_name() const { return "config_key"; } - + virtual void get_store_prefixes(set& s); /** * @} // ConfigKeyService_Inherited_h */ diff --git a/ceph/src/mon/LogMonitor.cc b/ceph/src/mon/LogMonitor.cc index aa1f674e..a9f2ceb5 100644 --- a/ceph/src/mon/LogMonitor.cc +++ b/ceph/src/mon/LogMonitor.cc @@ -138,7 +138,7 @@ void LogMonitor::update_from_paxos(bool *need_bootstrap) if (channels.do_log_to_syslog(channel)) { string level = channels.get_level(channel); - string 
facility = channels.get_facility(facility); + string facility = channels.get_facility(channel); if (level.empty() || facility.empty()) { derr << __func__ << " unable to log to syslog -- level or facility" << " not defined (level: " << level << ", facility: " diff --git a/ceph/src/mon/MDSMonitor.cc b/ceph/src/mon/MDSMonitor.cc index 95be55eb..5aa8866f 100644 --- a/ceph/src/mon/MDSMonitor.cc +++ b/ceph/src/mon/MDSMonitor.cc @@ -754,7 +754,7 @@ bool MDSMonitor::preprocess_command(MMonCommand *m) if (err == -ENOENT) { r = -ENOENT; } else { - assert(r == 0); + assert(err == 0); assert(b.length()); MDSMap mm; mm.decode(b); diff --git a/ceph/src/mon/MonClient.cc b/ceph/src/mon/MonClient.cc index 9c9a3e7a..07d97810 100644 --- a/ceph/src/mon/MonClient.cc +++ b/ceph/src/mon/MonClient.cc @@ -410,6 +410,11 @@ void MonClient::shutdown() waiting_for_session.pop_front(); } + if (cur_con) + cur_con->mark_down(); + cur_con.reset(NULL); + cur_mon.clear(); + monc_lock.Unlock(); if (initialized) { @@ -418,11 +423,6 @@ void MonClient::shutdown() monc_lock.Lock(); timer.shutdown(); - if (cur_con) - cur_con->mark_down(); - cur_con.reset(NULL); - cur_mon.clear(); - monc_lock.Unlock(); } @@ -518,6 +518,7 @@ void MonClient::handle_auth(MAuthReply *m) if (ret == 0) { if (state != MC_STATE_HAVE_SESSION) { state = MC_STATE_HAVE_SESSION; + last_rotating_renew_sent = utime_t(); while (!waiting_for_session.empty()) { _send_mon_message(waiting_for_session.front()); waiting_for_session.pop_front(); @@ -802,8 +803,11 @@ int MonClient::_check_auth_rotating() return 0; } - utime_t cutoff = ceph_clock_now(cct); + utime_t now = ceph_clock_now(cct); + utime_t cutoff = now; cutoff -= MIN(30.0, cct->_conf->auth_service_ticket_ttl / 4.0); + utime_t issued_at_lower_bound = now; + issued_at_lower_bound -= cct->_conf->auth_service_ticket_ttl; if (!rotating_secrets->need_new_secrets(cutoff)) { ldout(cct, 10) << "_check_auth_rotating have uptodate secrets (they expire after " << cutoff << ")" << dendl; 
rotating_secrets->dump_rotating(); @@ -811,9 +815,22 @@ int MonClient::_check_auth_rotating() } ldout(cct, 10) << "_check_auth_rotating renewing rotating keys (they expired before " << cutoff << ")" << dendl; + if (!rotating_secrets->need_new_secrets() && + rotating_secrets->need_new_secrets(issued_at_lower_bound)) { + // the key has expired before it has been issued? + lderr(cct) << __func__ << " possible clock skew, rotating keys expired way too early" + << " (before " << issued_at_lower_bound << ")" << dendl; + } + if ((now > last_rotating_renew_sent) && + double(now - last_rotating_renew_sent) < 1) { + ldout(cct, 10) << __func__ << " called too often (last: " + << last_rotating_renew_sent << "), skipping refresh" << dendl; + return 0; + } MAuth *m = new MAuth; m->protocol = auth->get_protocol(); if (auth->build_rotating_request(m->auth_payload)) { + last_rotating_renew_sent = now; _send_mon_message(m); } else { m->put(); @@ -824,7 +841,8 @@ int MonClient::_check_auth_rotating() int MonClient::wait_auth_rotating(double timeout) { Mutex::Locker l(monc_lock); - utime_t until = ceph_clock_now(cct); + utime_t now = ceph_clock_now(cct); + utime_t until = now; until += timeout; if (auth->get_protocol() == CEPH_AUTH_NONE) @@ -834,14 +852,14 @@ int MonClient::wait_auth_rotating(double timeout) return 0; while (auth_principal_needs_rotating_keys(entity_name) && - rotating_secrets->need_new_secrets()) { - utime_t now = ceph_clock_now(cct); + rotating_secrets->need_new_secrets(now)) { if (now >= until) { ldout(cct, 0) << "wait_auth_rotating timed out after " << timeout << dendl; return -ETIMEDOUT; } ldout(cct, 10) << "wait_auth_rotating waiting (until " << until << ")" << dendl; auth_cond.WaitUntil(monc_lock, until); + now = ceph_clock_now(cct); } ldout(cct, 10) << "wait_auth_rotating done" << dendl; return 0; diff --git a/ceph/src/mon/MonClient.h b/ceph/src/mon/MonClient.h index 239d91b4..ced77e02 100644 --- a/ceph/src/mon/MonClient.h +++ b/ceph/src/mon/MonClient.h @@ 
-179,6 +179,7 @@ private: int authenticate_err; list waiting_for_session; + utime_t last_rotating_renew_sent; Context *session_established_context; bool had_a_connection; double reopen_interval_multiplier; diff --git a/ceph/src/mon/MonCommands.h b/ceph/src/mon/MonCommands.h index a75b0673..3f0dae85 100644 --- a/ceph/src/mon/MonCommands.h +++ b/ceph/src/mon/MonCommands.h @@ -634,7 +634,7 @@ COMMAND("osd pool get " \ "get pool parameter ", "osd", "r", "cli,rest") COMMAND("osd pool set " \ "name=pool,type=CephPoolname " \ - "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \ + "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \ "name=val,type=CephString " \ "name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \ "set pool parameter to ", "osd", "rw", "cli,rest") @@ -654,15 +654,37 @@ COMMAND("osd pool stats " \ "name=name,type=CephString,req=false", "obtain stats from all pools, or from specified pool", "osd", "r", "cli,rest") +COMMAND("osd utilization", + "get basic pg distribution stats", + "osd", "r", "cli,rest") COMMAND("osd reweight-by-utilization " \ - "name=oload,type=CephInt,range=100,req=false", \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + 
"name=no_increasing,type=CephChoices,strings=--no-increasing,req=false",\ "reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \ "osd", "rw", "cli,rest") +COMMAND("osd test-reweight-by-utilization " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=no_increasing,type=CephChoices,strings=--no-increasing,req=false",\ + "dry run of reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \ + "osd", "rw", "cli,rest") COMMAND("osd reweight-by-pg " \ - "name=oload,type=CephInt,range=100 " \ - "name=pools,type=CephPoolname,n=N,req=false", \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=pools,type=CephPoolname,n=N,req=false", \ "reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \ "osd", "rw", "cli,rest") +COMMAND("osd test-reweight-by-pg " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=pools,type=CephPoolname,n=N,req=false", \ + "dry run of reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \ + "osd", "rw", "cli,rest") COMMAND("osd thrash " \ "name=num_epochs,type=CephInt,range=0", \ "thrash OSDs for ", "osd", "rw", "cli,rest") diff --git a/ceph/src/mon/Monitor.cc b/ceph/src/mon/Monitor.cc index 4a34283e..d499f0c2 100644 --- a/ceph/src/mon/Monitor.cc +++ b/ceph/src/mon/Monitor.cc @@ -178,6 +178,7 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s, timecheck_round(0), timecheck_acks(0), + timecheck_rounds_since_clean(0), timecheck_event(NULL), probe_timeout_event(NULL), @@ -1021,7 +1022,9 @@ set Monitor::get_sync_targets_names() targets.insert(paxos->get_name()); for (int i = 0; i < PAXOS_NUM; ++i) paxos_service[i]->get_store_prefixes(targets); - 
+ ConfigKeyService *config_key_service_ptr = dynamic_cast(config_key_service); + assert(config_key_service_ptr); + config_key_service_ptr->get_store_prefixes(targets); return targets; } @@ -1692,7 +1695,7 @@ void Monitor::handle_probe_reply(MMonProbe *m) } else { if (paxos->get_version() < m->paxos_first_version && m->paxos_first_version > 1) { // no need to sync if we're 0 and they start at 1. - dout(10) << " peer paxos versions [" << m->paxos_first_version + dout(10) << " peer paxos first versions [" << m->paxos_first_version << "," << m->paxos_last_version << "]" << " vs my version " << paxos->get_version() << " (too far ahead)" @@ -1703,7 +1706,7 @@ void Monitor::handle_probe_reply(MMonProbe *m) return; } if (paxos->get_version() + g_conf->paxos_max_join_drift < m->paxos_last_version) { - dout(10) << " peer paxos version " << m->paxos_last_version + dout(10) << " peer paxos last version " << m->paxos_last_version << " vs my version " << paxos->get_version() << " (too far ahead)" << dendl; @@ -2562,7 +2565,19 @@ void Monitor::handle_command(MMonCommand *m) return; } - cmd_getval(g_ceph_context, cmdmap, "prefix", prefix); + // check return value. If no prefix parameter provided, + // return value will be false, then return error info. + if(!cmd_getval(g_ceph_context, cmdmap, "prefix", prefix)) { + reply_command(m, -EINVAL, "command prefix not found", 0); + return; + } + + // check prefix is empty + if (prefix.empty()) { + reply_command(m, -EINVAL, "command prefix must not be empty", 0); + return; + } + if (prefix == "get_command_descriptions") { bufferlist rdata; Formatter *f = Formatter::create("json"); @@ -2583,6 +2598,15 @@ void Monitor::handle_command(MMonCommand *m) boost::scoped_ptr f(Formatter::create(format)); get_str_vec(prefix, fullcmd); + + // make sure fullcmd is not empty. + // invalid prefix will cause empty vector fullcmd. 
+ // such as, prefix=";,,;" + if (fullcmd.empty()) { + reply_command(m, -EINVAL, "command requires a prefix to be valid", 0); + return; + } + module = fullcmd[0]; // validate command is in leader map @@ -3683,8 +3707,7 @@ void Monitor::timecheck_start_round() timecheck(); out: dout(10) << __func__ << " setting up next event" << dendl; - timecheck_event = new C_TimeCheck(this); - timer.add_event_after(g_conf->mon_timecheck_interval, timecheck_event); + timecheck_reset_event(); } void Monitor::timecheck_finish_round(bool success) @@ -3698,6 +3721,7 @@ void Monitor::timecheck_finish_round(bool success) assert(timecheck_waiting.empty()); assert(timecheck_acks == quorum.size()); timecheck_report(); + timecheck_check_skews(); return; } @@ -3731,6 +3755,69 @@ void Monitor::timecheck_cleanup() timecheck_waiting.clear(); timecheck_skews.clear(); timecheck_latencies.clear(); + + timecheck_rounds_since_clean = 0; +} + +void Monitor::timecheck_reset_event() +{ + if (timecheck_event) { + timer.cancel_event(timecheck_event); + timecheck_event = NULL; + } + + double delay = + cct->_conf->mon_timecheck_skew_interval * timecheck_rounds_since_clean; + + if (delay <= 0 || delay > cct->_conf->mon_timecheck_interval) { + delay = cct->_conf->mon_timecheck_interval; + } + + dout(10) << __func__ << " delay " << delay + << " rounds_since_clean " << timecheck_rounds_since_clean + << dendl; + + timecheck_event = new C_TimeCheck(this); + timer.add_event_after(delay, timecheck_event); +} + +void Monitor::timecheck_check_skews() +{ + dout(10) << __func__ << dendl; + assert(is_leader()); + assert((timecheck_round % 2) == 0); + if (monmap->size() == 1) { + assert(0 == "We are alone; we shouldn't have gotten here!"); + return; + } + assert(timecheck_latencies.size() == timecheck_skews.size()); + + bool found_skew = false; + for (map::iterator p = timecheck_skews.begin(); + p != timecheck_skews.end(); ++p) { + + double abs_skew; + if (timecheck_has_skew(p->second, &abs_skew)) { + dout(10) << 
__func__ + << " " << p->first << " skew " << abs_skew << dendl; + found_skew = true; + } + } + + if (found_skew) { + ++timecheck_rounds_since_clean; + timecheck_reset_event(); + } else if (timecheck_rounds_since_clean > 0) { + dout(1) << __func__ + << " no clock skews found after " << timecheck_rounds_since_clean + << " rounds" << dendl; + // make sure the skews are really gone and not just a transient success + // this will run just once if not in the presence of skews again. + timecheck_rounds_since_clean = 1; + timecheck_reset_event(); + timecheck_rounds_since_clean = 0; + } + } void Monitor::timecheck_report() @@ -3753,7 +3840,8 @@ void Monitor::timecheck_report() m->epoch = get_epoch(); m->round = timecheck_round; - for (map::iterator it = timecheck_skews.begin(); it != timecheck_skews.end(); ++it) { + for (map::iterator it = timecheck_skews.begin(); + it != timecheck_skews.end(); ++it) { double skew = it->second; double latency = timecheck_latencies[it->first]; @@ -3812,10 +3900,10 @@ health_status_t Monitor::timecheck_status(ostringstream &ss, const double latency) { health_status_t status = HEALTH_OK; - double abs_skew = (skew_bound > 0 ? 
skew_bound : -skew_bound); assert(latency >= 0); - if (abs_skew > g_conf->mon_clock_drift_allowed) { + double abs_skew; + if (timecheck_has_skew(skew_bound, &abs_skew)) { status = HEALTH_WARN; ss << "clock skew " << abs_skew << "s" << " > max " << g_conf->mon_clock_drift_allowed << "s"; @@ -3929,11 +4017,7 @@ void Monitor::handle_timecheck_leader(MTimeCheck *m) << " delta " << delta << " skew_bound " << skew_bound << " latency " << latency << dendl; - if (timecheck_skews.count(other) == 0) { - timecheck_skews[other] = skew_bound; - } else { - timecheck_skews[other] = (timecheck_skews[other]*0.8)+(skew_bound*0.2); - } + timecheck_skews[other] = skew_bound; timecheck_acks++; if (timecheck_acks == quorum.size()) { @@ -4544,7 +4628,7 @@ int Monitor::write_default_keyring(bufferlist& bl) os << g_conf->mon_data << "/keyring"; int err = 0; - int fd = ::open(os.str().c_str(), O_WRONLY|O_CREAT, 0644); + int fd = ::open(os.str().c_str(), O_WRONLY|O_CREAT, 0600); if (fd < 0) { err = -errno; dout(0) << __func__ << " failed to open " << os.str() diff --git a/ceph/src/mon/Monitor.h b/ceph/src/mon/Monitor.h index 0d3978a5..89af554d 100644 --- a/ceph/src/mon/Monitor.h +++ b/ceph/src/mon/Monitor.h @@ -54,6 +54,7 @@ #include "include/memory.h" #include "include/str_map.h" #include +#include #define CEPH_MON_PROTOCOL 13 /* cluster internal */ @@ -463,6 +464,15 @@ private: version_t timecheck_round; unsigned int timecheck_acks; utime_t timecheck_round_start; + /* When we hit a skew we will start a new round based off of + * 'mon_timecheck_skew_interval'. Each new round will be backed off + * until we hit 'mon_timecheck_interval' -- which is the typical + * interval when not in the presence of a skew. + * + * This variable tracks the number of rounds with skews since last clean + * so that we can report to the user and properly adjust the backoff. + */ + uint64_t timecheck_rounds_since_clean; /** * Time Check event. 
*/ @@ -482,6 +492,8 @@ private: void timecheck_finish_round(bool success = true); void timecheck_cancel_round(); void timecheck_cleanup(); + void timecheck_reset_event(); + void timecheck_check_skews(); void timecheck_report(); void timecheck(); health_status_t timecheck_status(ostringstream &ss, @@ -490,6 +502,16 @@ private: void handle_timecheck_leader(MTimeCheck *m); void handle_timecheck_peon(MTimeCheck *m); void handle_timecheck(MTimeCheck *m); + + /** + * Returns 'true' if this is considered to be a skew; 'false' otherwise. + */ + bool timecheck_has_skew(const double skew_bound, double *abs) const { + double abs_skew = std::fabs(skew_bound); + if (abs) + *abs = abs_skew; + return (abs_skew > g_conf->mon_clock_drift_allowed); + } /** * @} */ diff --git a/ceph/src/mon/OSDMonitor.cc b/ceph/src/mon/OSDMonitor.cc index 968efc43..a006dbde 100644 --- a/ceph/src/mon/OSDMonitor.cc +++ b/ceph/src/mon/OSDMonitor.cc @@ -16,6 +16,7 @@ * */ +#include #include #include "OSDMonitor.h" @@ -461,21 +462,39 @@ void OSDMonitor::update_logger() mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch()); } +struct Sorter { + + double average_util; + + Sorter(const double average_util_) + : average_util(average_util_) + {} + + bool operator()(std::pair l, std::pair r) { + return abs(l.second - average_util) > abs(r.second - average_util); + } +}; + /* Assign a lower weight to overloaded OSDs. * * The osds that will get a lower weight are those with with a utilization * percentage 'oload' percent greater than the average utilization. */ -int OSDMonitor::reweight_by_utilization(int oload, std::string& out_str, - bool by_pg, const set *pools) +int OSDMonitor::reweight_by_utilization(int oload, + double max_changef, + int max_osds, + bool by_pg, const set *pools, + bool no_increasing, + bool dry_run, + std::stringstream *ss, + std::string *out_str, + Formatter *f) { if (oload <= 100) { - ostringstream oss; - oss << "You must give a percentage higher than 100. 
" + *ss << "You must give a percentage higher than 100. " "The reweighting threshold will be calculated as " "times . For example, an argument of 200 would " "reweight OSDs which are twice as utilized as the average OSD.\n"; - out_str = oss.str(); return -EINVAL; } @@ -511,10 +530,8 @@ int OSDMonitor::reweight_by_utilization(int oload, std::string& out_str, } if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) { - ostringstream oss; - oss << "Refusing to reweight: we only have " << num_pg_copies + *ss << "Refusing to reweight: we only have " << num_pg_copies << " PGs across " << num_osds << " osds!\n"; - out_str = oss.str(); return -EDOM; } @@ -525,17 +542,15 @@ int OSDMonitor::reweight_by_utilization(int oload, std::string& out_str, if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd < g_conf->mon_reweight_min_bytes_per_osd) { ostringstream oss; - oss << "Refusing to reweight: we only have " << pgm.osd_sum.kb + *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb << " kb across all osds!\n"; - out_str = oss.str(); return -EDOM; } if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd < g_conf->mon_reweight_min_bytes_per_osd) { ostringstream oss; - oss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used + *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used << " kb used across all osds!\n"; - out_str = oss.str(); return -EDOM; } @@ -548,61 +563,129 @@ int OSDMonitor::reweight_by_utilization(int oload, std::string& out_str, // but aggressively adjust weights up whenever possible. double underload_util = average_util; + unsigned max_change = (unsigned)(max_changef * (double)0x10000); + ostringstream oss; - char buf[128]; - snprintf(buf, sizeof(buf), "average %04f, overload %04f. 
", - average_util, overload_util); - oss << buf; - std::string sep; - oss << "reweighted: "; + if (f) { + f->open_object_section("reweight_by_utilization"); + f->dump_unsigned("overload_min", oload); + f->dump_float("max_change", max_changef); + f->dump_float("max_change_osds", max_osds); + f->dump_float("average_utilization", average_util); + f->dump_float("overload_utilization", overload_util); + } else { + oss << "oload " << oload << "\n"; + oss << "max_change " << max_changef << "\n"; + oss << "max_change_osds " << max_osds << "\n"; + char buf[128]; + snprintf(buf, sizeof(buf), "average %04f\noverload %04f\n", + average_util, overload_util); + oss << buf; + } bool changed = false; + int num_changed = 0; + + // precompute util for each OSD + std::vector > util_by_osd; for (ceph::unordered_map::const_iterator p = - pgm.osd_stat.begin(); + pgm.osd_stat.begin(); p != pgm.osd_stat.end(); ++p) { - float util; + std::pair osd_util; + osd_util.first = p->first; if (by_pg) { - util = pgs_by_osd[p->first] / osdmap.crush->get_item_weightf(p->first); + osd_util.second = pgs_by_osd[p->first] / osdmap.crush->get_item_weightf(p->first); } else { - util = (double)p->second.kb_used / (double)p->second.kb; + osd_util.second = (double)p->second.kb_used / (double)p->second.kb; } + util_by_osd.push_back(osd_util); + } + + // sort by absolute deviation from the mean utilization, + // in descending order. + std::sort(util_by_osd.begin(), util_by_osd.end(), Sorter(average_util)); + + OSDMap::Incremental newinc; + + if (f) + f->open_array_section("reweights"); + + for (std::vector >::const_iterator p = + util_by_osd.begin(); + p != util_by_osd.end(); + ++p) { + float util = p->second; + if (util >= overload_util) { - sep = ", "; // Assign a lower weight to overloaded OSDs. The current weight // is a factor to take into account the original weights, // to represent e.g. 
differing storage capacities unsigned weight = osdmap.get_weight(p->first); unsigned new_weight = (unsigned)((average_util / util) * (float)weight); - pending_inc.new_weight[p->first] = new_weight; - char buf[128]; - snprintf(buf, sizeof(buf), "osd.%d [%04f -> %04f]", p->first, - (float)weight / (float)0x10000, - (float)new_weight / (float)0x10000); - oss << buf << sep; - changed = true; + if (weight > max_change) + new_weight = MAX(new_weight, weight - max_change); + newinc.new_weight[p->first] = new_weight; + if (!dry_run) { + pending_inc.new_weight[p->first] = new_weight; + changed = true; + } + if (f) { + f->open_object_section("osd"); + f->dump_unsigned("osd", p->first); + f->dump_float("weight", (float)weight / (float)0x10000); + f->dump_float("new_weight", (float)new_weight / (float)0x10000); + f->close_section(); + } else { + char buf[128]; + snprintf(buf, sizeof(buf), "osd.%d weight %04f -> %04f\n", p->first, + (float)weight / (float)0x10000, + (float)new_weight / (float)0x10000); + oss << buf; + } + if (++num_changed >= max_osds) + break; } - if (util <= underload_util) { + if (!no_increasing && util <= underload_util) { // assign a higher weight.. if we can. 
unsigned weight = osdmap.get_weight(p->first); unsigned new_weight = (unsigned)((average_util / util) * (float)weight); + new_weight = MIN(new_weight, weight + max_change); if (new_weight > 0x10000) new_weight = 0x10000; if (new_weight > weight) { - sep = ", "; - pending_inc.new_weight[p->first] = new_weight; + newinc.new_weight[p->first] = new_weight; + if (!dry_run) { + pending_inc.new_weight[p->first] = new_weight; + changed = true; + } char buf[128]; - snprintf(buf, sizeof(buf), "osd.%d [%04f -> %04f]", p->first, + snprintf(buf, sizeof(buf), "osd.%d weight %04f -> %04f\n", p->first, (float)weight / (float)0x10000, (float)new_weight / (float)0x10000); - oss << buf << sep; - changed = true; + oss << buf; + if (++num_changed >= max_osds) + break; } } } - if (sep.empty()) { - oss << "(none)"; + if (f) { + f->close_section(); + } + + OSDMap newmap; + newmap.deepish_copy_from(osdmap); + newinc.fsid = newmap.fsid; + newinc.epoch = newmap.get_epoch() + 1; + newmap.apply_incremental(newinc); + + osdmap.summarize_mapping_stats(&newmap, pools, out_str, f); + + if (f) { + f->close_section(); + } else { + *out_str += "\n"; + *out_str += oss.str(); } - out_str = oss.str(); dout(10) << "reweight_by_utilization: finished with " << out_str << dendl; return changed; } @@ -1411,6 +1494,13 @@ void OSDMonitor::check_failures(utime_t now) bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) { + // already pending failure? 
+ if (pending_inc.new_state.count(target_osd) && + pending_inc.new_state[target_osd] & CEPH_OSD_UP) { + dout(10) << " already pending failure" << dendl; + return true; + } + utime_t orig_grace(g_conf->osd_heartbeat_grace, 0); utime_t max_failed_since = fi.get_failed_since(); utime_t failed_for = now - max_failed_since; @@ -1454,13 +1544,6 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) << grace << " grace (" << orig_grace << " + " << my_grace << " + " << peer_grace << "), max_failed_since " << max_failed_since << dendl; - // already pending failure? - if (pending_inc.new_state.count(target_osd) && - pending_inc.new_state[target_osd] & CEPH_OSD_UP) { - dout(10) << " already pending failure" << dendl; - return true; - } - if (failed_for >= grace && ((int)fi.reporters.size() >= g_conf->mon_osd_min_down_reporters) && (fi.num_reports >= g_conf->mon_osd_min_down_reports)) { @@ -1565,6 +1648,9 @@ void OSDMonitor::take_all_failures(list& ls) failure_info.clear(); } +static bool uses_gmt_hitset(const std::pair& pool) { + return pool.second.use_gmt_hitset; +} // boot -- @@ -1634,6 +1720,19 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m) } } + if (std::find_if(osdmap.get_pools().begin(), + osdmap.get_pools().end(), + uses_gmt_hitset) != osdmap.get_pools().end()) { + assert(osdmap.get_num_up_osds() == 0 || + osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT); + if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) { + dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at " + << m->get_orig_source_inst() + << " doesn't announce support -- ignore" << dendl; + goto ignore; + } + } + // already booted? 
if (osdmap.is_up(from) && osdmap.get_inst(from) == m->get_orig_source_inst()) { @@ -1936,6 +2035,21 @@ bool OSDMonitor::preprocess_pgtemp(MOSDPGTemp *m) continue; } + int acting_primary = -1; + osdmap.pg_to_up_acting_osds( + p->first, NULL, NULL, NULL, &acting_primary); + if (acting_primary != from) { + /* If the source isn't the primary based on the current osdmap, we know + * that the interval changed and that we can discard this message. + * Indeed, we must do so to avoid 16127 since we can't otherwise determine + * which of two pg temp mappings on the same pg is more recent. + */ + dout(10) << __func__ << " ignore " << p->first << " -> " << p->second + << ": primary has changed" << dendl; + ignore_cnt++; + continue; + } + // removal? if (p->second.empty() && (osdmap.pg_temp->count(p->first) || osdmap.primary_temp->count(p->first))) @@ -2790,6 +2904,15 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch(); rdata.append(ds); } + } else if (prefix == "osd utilization") { + string out; + osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get()); + if (f) + f->flush(rdata); + else + rdata.append(out); + r = 0; + goto reply; } else if (prefix == "osd find") { int64_t osd; if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) { @@ -3051,6 +3174,7 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) if (!p->is_tier() && (var == "hit_set_type" || var == "hit_set_period" || var == "hit_set_count" || var == "hit_set_fpp" || + var == "use_gmt_hitset" || var == "target_max_objects" || var == "target_max_bytes" || var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" || @@ -3103,6 +3227,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) BloomHitSet::Params *bloomp = static_cast(p->hit_set_params.impl.get()); f->dump_float("hit_set_fpp", bloomp->get_fpp()); } + } else if (var == "use_gmt_hitset") { + f->dump_bool("use_gmt_hitset", p->use_gmt_hitset); } else if (var == 
"target_max_objects") { f->dump_unsigned("target_max_objects", p->target_max_objects); } else if (var == "target_max_bytes") { @@ -3160,6 +3286,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) } BloomHitSet::Params *bloomp = static_cast(p->hit_set_params.impl.get()); ss << "hit_set_fpp: " << bloomp->get_fpp(); + } else if (var == "use_gmt_hitset") { + ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n"; } else if (var == "target_max_objects") { ss << "target_max_objects: " << p->target_max_objects; } else if (var == "target_max_bytes") { @@ -3533,8 +3661,8 @@ void OSDMonitor::get_pools_health( detail->push_back(make_pair(HEALTH_WARN, ss.str())); } - float warn_threshold = g_conf->mon_pool_quota_warn_threshold/100; - float crit_threshold = g_conf->mon_pool_quota_crit_threshold/100; + float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100; + float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100; if (pool.quota_max_objects > 0) { stringstream ss; @@ -4042,6 +4170,11 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE); if (g_conf->osd_pool_default_flag_nosizechange) pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE); + if (g_conf->osd_pool_use_gmt_hitset && + (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) + pi->use_gmt_hitset = true; + else + pi->use_gmt_hitset = false; pi->size = size; pi->min_size = min_size; @@ -4385,6 +4518,17 @@ int OSDMonitor::prepare_command_pool_set(map &cmdmap, } BloomHitSet::Params *bloomp = static_cast(p.hit_set_params.impl.get()); bloomp->set_fpp(f); + } else if (var == "use_gmt_hitset") { + if (val == "true" || (interr.empty() && n == 1)) { + if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) { + ss << "not all OSDs support GMT hit set."; + return -EINVAL; + } + p.use_gmt_hitset = true; + } else { + ss << "expecting value 'true' or '1'"; + return -EINVAL; + } } else if (var == "debug_fake_ec_pool") { if (val == "true" 
|| (interr.empty() && n == 1)) { p.flags |= pg_pool_t::FLAG_DEBUG_FAKE_EC_POOL; @@ -6563,23 +6707,15 @@ done: get_last_committed() + 1)); return true; - } else if (prefix == "osd reweight-by-utilization") { - int64_t oload; - cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120)); - string out_str; - err = reweight_by_utilization(oload, out_str, false, NULL); - if (err < 0) { - ss << "FAILED reweight-by-utilization: " << out_str; - } else if (err == 0) { - ss << "no change: " << out_str; - } else { - ss << "SUCCESSFUL reweight-by-utilization: " << out_str; - getline(ss, rs); - wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, - get_last_committed() + 1)); - return true; - } - } else if (prefix == "osd reweight-by-pg") { + } else if (prefix == "osd reweight-by-pg" || + prefix == "osd reweight-by-utilization" || + prefix == "osd test-reweight-by-pg" || + prefix == "osd test-reweight-by-utilization") { + bool by_pg = + prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg"; + bool dry_run = + prefix == "osd test-reweight-by-pg" || + prefix == "osd test-reweight-by-utilization"; int64_t oload; cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120)); set pools; @@ -6594,18 +6730,38 @@ done: } pools.insert(pool); } + double max_change = g_conf->mon_reweight_max_change; + cmd_getval(g_ceph_context, cmdmap, "max_change", max_change); + if (max_change <= 0.0) { + ss << "max_change " << max_change << " must be positive"; + err = -EINVAL; + goto reply; + } + int64_t max_osds = g_conf->mon_reweight_max_osds; + cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds); + string no_increasing; + cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing); string out_str; - err = reweight_by_utilization(oload, out_str, true, - pools.empty() ? NULL : &pools); + err = reweight_by_utilization(oload, + max_change, + max_osds, + by_pg, + pools.empty() ? 
NULL : &pools, + no_increasing == "--no-increasing", + dry_run, + &ss, &out_str, f.get()); + if (f) + f->flush(rdata); + else + rdata.append(out_str); if (err < 0) { - ss << "FAILED reweight-by-pg: " << out_str; + ss << "FAILED reweight-by-pg"; } else if (err == 0) { - ss << "no change: " << out_str; + ss << "no change"; } else { - ss << "SUCCESSFUL reweight-by-pg: " << out_str; - getline(ss, rs); - wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, - get_last_committed() + 1)); + ss << "SUCCESSFUL reweight-by-pg"; + wait_for_finished_proposal( + new Monitor::C_Command(mon, m, 0, rs, rdata, get_last_committed() + 1)); return true; } } else if (prefix == "osd thrash") { diff --git a/ceph/src/mon/OSDMonitor.h b/ceph/src/mon/OSDMonitor.h index 3c70cf24..d8a3e1fa 100644 --- a/ceph/src/mon/OSDMonitor.h +++ b/ceph/src/mon/OSDMonitor.h @@ -217,9 +217,16 @@ private: void send_incremental(PaxosServiceMessage *m, epoch_t first); void send_incremental(epoch_t first, MonSession *session, bool onetime); - int reweight_by_utilization(int oload, std::string& out_str, bool by_pg, - const set *pools); - + int reweight_by_utilization(int oload, + double max_change, + int max_osds, + bool by_pg, + const set *pools, + bool no_increasing, + bool dry_run, + std::stringstream *ss, + std::string *out_str, + Formatter *f); void print_utilization(ostream &out, Formatter *f, bool tree) const; bool check_source(PaxosServiceMessage *m, uuid_d fsid); diff --git a/ceph/src/mon/PGMonitor.cc b/ceph/src/mon/PGMonitor.cc index 3c2b756a..8ab181a3 100644 --- a/ceph/src/mon/PGMonitor.cc +++ b/ceph/src/mon/PGMonitor.cc @@ -1252,8 +1252,12 @@ inline string percentify(const float& a) { //void PGMonitor::dump_object_stat_sum(stringstream& ss, Formatter *f, void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f, object_stat_sum_t &sum, uint64_t avail, - bool verbose) + float raw_used_rate, bool verbose) { + float curr_object_copies_rate = 0.0; + if (sum.num_object_copies > 0) + 
curr_object_copies_rate = (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies; + if (f) { f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10)); f->dump_int("bytes_used", sum.num_bytes); @@ -1265,20 +1269,22 @@ void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f, f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull); f->dump_int("wr", sum.num_wr); f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull); + f->dump_int("raw_bytes_used", sum.num_bytes * raw_used_rate * curr_object_copies_rate); } } else { tbl << stringify(si_t(sum.num_bytes)); int64_t kb_used = SHIFT_ROUND_UP(sum.num_bytes, 10); float used = 0.0; if (pg_map.osd_sum.kb > 0) - used = (float)kb_used / pg_map.osd_sum.kb; + used = (float)kb_used * raw_used_rate * curr_object_copies_rate / pg_map.osd_sum.kb; tbl << percentify(used*100); tbl << si_t(avail); tbl << sum.num_objects; if (verbose) { tbl << stringify(si_t(sum.num_objects_dirty)) - << stringify(si_t(sum.num_rd)) - << stringify(si_t(sum.num_wr)); + << stringify(si_t(sum.num_rd)) + << stringify(si_t(sum.num_wr)) + << stringify(si_t(sum.num_bytes * raw_used_rate * curr_object_copies_rate)); } } } @@ -1333,6 +1339,7 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose) tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT); tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT); tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT); } } @@ -1351,6 +1358,7 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose) pool->get_type(), pool->get_size()); int64_t avail; + float raw_used_rate; if (avail_by_rule.count(ruleno) == 0) { avail = get_rule_avail(osdmap, ruleno); if (avail < 0) @@ -1362,20 +1370,24 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose) switch (pool->get_type()) { case pg_pool_t::TYPE_REPLICATED: avail /= pool->get_size(); + 
raw_used_rate = pool->get_size(); break; case pg_pool_t::TYPE_ERASURE: - { - const map& ecp = - osdmap.get_erasure_code_profile(pool->erasure_code_profile); - map::const_iterator pm = ecp.find("m"); - map::const_iterator pk = ecp.find("k"); - if (pm != ecp.end() && pk != ecp.end()) { - int k = atoi(pk->second.c_str()); - int m = atoi(pm->second.c_str()); - avail = avail * k / (m + k); - } + { + const map& ecp = + osdmap.get_erasure_code_profile(pool->erasure_code_profile); + map::const_iterator pm = ecp.find("m"); + map::const_iterator pk = ecp.find("k"); + if (pm != ecp.end() && pk != ecp.end()) { + int k = atoi(pk->second.c_str()); + int m = atoi(pm->second.c_str()); + avail = avail * k / (m + k); + raw_used_rate = (float)(m + k) / k; + } else { + raw_used_rate = 0.0; } break; + } default: assert(0 == "unrecognized pool type"); } @@ -1391,7 +1403,7 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose) if (verbose) tbl << "-"; } - dump_object_stat_sum(tbl, f, stat.stats.sum, avail, verbose); + dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose); if (f) f->close_section(); // stats else diff --git a/ceph/src/mon/PGMonitor.h b/ceph/src/mon/PGMonitor.h index 97a9ac11..4161a029 100644 --- a/ceph/src/mon/PGMonitor.h +++ b/ceph/src/mon/PGMonitor.h @@ -146,8 +146,9 @@ private: vector& args) const; void dump_object_stat_sum(TextTable &tbl, Formatter *f, - object_stat_sum_t &sum, + object_stat_sum_t &sum, uint64_t avail, + float raw_used_rate, bool verbose); int64_t get_rule_avail(OSDMap& osdmap, int ruleno); diff --git a/ceph/src/msg/simple/Pipe.cc b/ceph/src/msg/simple/Pipe.cc index ab277e08..1bc819a2 100644 --- a/ceph/src/msg/simple/Pipe.cc +++ b/ceph/src/msg/simple/Pipe.cc @@ -479,13 +479,21 @@ int Pipe::accept() * held by somebody trying to make use of the SimpleMessenger lock. * So drop locks, wait, and retry. It just looks like a slow network * to everybody else. 
+ * + * We take a ref to existing here since it might get reaped before we + * wake up (see bug #15870). We can be confident that it lived until + * locked it since we held the msgr lock from _lookup_pipe through to + * locking existing->lock and checking reader_dispatching. */ + existing->get(); pipe_lock.Unlock(); msgr->lock.Unlock(); existing->notify_on_dispatch_done = true; while (existing->reader_dispatching) existing->cond.Wait(existing->pipe_lock); existing->pipe_lock.Unlock(); + existing->put(); + existing = 0; goto retry_existing_lookup; } diff --git a/ceph/src/os/FileJournal.cc b/ceph/src/os/FileJournal.cc index c6bb6f2c..8c2635e2 100644 --- a/ceph/src/os/FileJournal.cc +++ b/ceph/src/os/FileJournal.cc @@ -45,17 +45,6 @@ int FileJournal::_open(bool forwrite, bool create) { int flags, ret; - if (aio && !directio) { - derr << "FileJournal::_open: aio not supported without directio; disabling aio" << dendl; - aio = false; - } -#ifndef HAVE_LIBAIO - if (aio) { - derr << "FileJournal::_open: libaio not compiled in; disabling aio" << dendl; - aio = false; - } -#endif - if (forwrite) { flags = O_RDWR; if (directio) @@ -331,15 +320,17 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize, return 0; } +// This can not be used on an active journal int FileJournal::check() { int ret; + assert(fd == -1); ret = _open(false, false); if (ret) - goto done; + return ret; - ret = read_header(); + ret = read_header(&header); if (ret < 0) goto done; @@ -354,8 +345,7 @@ int FileJournal::check() ret = 0; done: - VOID_TEMP_FAILURE_RETRY(::close(fd)); - fd = -1; + close(); return ret; } @@ -386,7 +376,7 @@ int FileJournal::create() header.start = get_top(); header.start_seq = 0; - print_header(); + print_header(header); // static zeroed buffer for alignment padding delete [] zero_buf; @@ -443,16 +433,20 @@ done: return ret; } +// This can not be used on an active journal int FileJournal::peek_fsid(uuid_d& fsid) { + assert(fd == -1); int r = _open(false, false); if 
(r) return r; - r = read_header(); + r = read_header(&header); if (r < 0) - return r; + goto out; fsid = header.fsid; - return 0; +out: + close(); + return r; } int FileJournal::open(uint64_t fs_op_seq) @@ -470,7 +464,7 @@ int FileJournal::open(uint64_t fs_op_seq) write_pos = get_top(); // read header? - err = read_header(); + err = read_header(&header); if (err < 0) return err; @@ -556,6 +550,11 @@ int FileJournal::open(uint64_t fs_op_seq) return 0; } +void FileJournal::_close(int fd) const +{ + VOID_TEMP_FAILURE_RETRY(::close(fd)); +} + void FileJournal::close() { dout(1) << "close " << fn << dendl; @@ -567,61 +566,120 @@ void FileJournal::close() assert(writeq_empty()); assert(!must_write_header); assert(fd >= 0); - VOID_TEMP_FAILURE_RETRY(::close(fd)); + _close(fd); fd = -1; } int FileJournal::dump(ostream& out) { - int err = 0; + return _dump(out, false); +} + +int FileJournal::simple_dump(ostream& out) +{ + return _dump(out, true); +} + +int FileJournal::_dump(ostream& out, bool simple) +{ + JSONFormatter f(true); + int ret = _fdump(f, simple); + f.flush(out); + return ret; +} + +int FileJournal::_fdump(Formatter &f, bool simple) +{ + dout(10) << "_fdump" << dendl; - dout(10) << "dump" << dendl; - err = _open(false, false); + assert(fd == -1); + int err = _open(false, false); if (err) return err; - err = read_header(); - if (err < 0) + err = read_header(&header); + if (err < 0) { + close(); return err; + } - read_pos = header.start; + off64_t next_pos = header.start; - JSONFormatter f(true); + f.open_object_section("journal"); - f.open_array_section("journal"); - uint64_t seq = 0; + f.open_object_section("header"); + f.dump_unsigned("flags", header.flags); + ostringstream os; + os << header.fsid; + f.dump_string("fsid", os.str()); + f.dump_unsigned("block_size", header.block_size); + f.dump_unsigned("alignment", header.alignment); + f.dump_int("max_size", header.max_size); + f.dump_int("start", header.start); + f.dump_unsigned("committed_up_to", 
header.committed_up_to); + f.dump_unsigned("start_seq", header.start_seq); + f.close_section(); + + f.open_array_section("entries"); + uint64_t seq = header.start_seq; while (1) { bufferlist bl; - uint64_t pos = read_pos; - if (!read_entry(bl, seq)) { - dout(3) << "journal_replay: end of journal, done." << dendl; + off64_t pos = next_pos; + + if (!pos) { + dout(2) << "_dump -- not readable" << dendl; + return false; + } + stringstream ss; + read_entry_result result = do_read_entry( + pos, + &next_pos, + &bl, + &seq, + &ss); + if (result != SUCCESS) { + if (seq < header.committed_up_to) { + dout(2) << "Unable to read past sequence " << seq + << " but header indicates the journal has committed up through " + << header.committed_up_to << ", journal is corrupt" << dendl; + err = EINVAL; + } + dout(25) << ss.str() << dendl; + dout(25) << "No further valid entries found, journal is most likely valid" + << dendl; break; } f.open_object_section("entry"); f.dump_unsigned("offset", pos); f.dump_unsigned("seq", seq); - f.open_array_section("transactions"); - bufferlist::iterator p = bl.begin(); - int trans_num = 0; - while (!p.end()) { - ObjectStore::Transaction *t = new ObjectStore::Transaction(p); - f.open_object_section("transaction"); - f.dump_unsigned("trans_num", trans_num); - t->dump(&f); + if (simple) { + f.dump_unsigned("bl.length", bl.length()); + } else { + f.open_array_section("transactions"); + bufferlist::iterator p = bl.begin(); + int trans_num = 0; + while (!p.end()) { + ObjectStore::Transaction *t = new ObjectStore::Transaction(p); + f.open_object_section("transaction"); + f.dump_unsigned("trans_num", trans_num); + t->dump(&f); + f.close_section(); + delete t; + trans_num++; + } f.close_section(); - delete t; - trans_num++; } f.close_section(); - f.close_section(); - f.flush(cout); } + f.close_section(); f.close_section(); dout(10) << "dump finish" << dendl; - return 0; + + close(); + return err; } @@ -638,21 +696,28 @@ void FileJournal::start_writer() void 
FileJournal::stop_writer() { + // Do nothing if writer already stopped or never started + if (!write_stop) { - Mutex::Locker l(write_lock); - Mutex::Locker p(writeq_lock); - write_stop = true; - writeq_cond.Signal(); - // Doesn't hurt to signal commit_cond in case thread is waiting there - // and caller didn't use committed_thru() first. - commit_cond.Signal(); + { + Mutex::Locker l(write_lock); + Mutex::Locker p(writeq_lock); + write_stop = true; + writeq_cond.Signal(); + // Doesn't hurt to signal commit_cond in case thread is waiting there + // and caller didn't use committed_thru() first. + commit_cond.Signal(); + } + write_thread.join(); + + // write journal header now so that we have less to replay on remount + write_header_sync(); } - write_thread.join(); #ifdef HAVE_LIBAIO // stop aio completeion thread *after* writer thread has stopped // and has submitted all of its io - if (aio) { + if (aio && !aio_stop) { aio_lock.Lock(); aio_stop = true; aio_cond.Signal(); @@ -665,7 +730,7 @@ void FileJournal::stop_writer() -void FileJournal::print_header() +void FileJournal::print_header(const header_t &header) const { dout(10) << "header: block_size " << header.block_size << " alignment " << header.alignment @@ -675,7 +740,7 @@ void FileJournal::print_header() dout(10) << " write_pos " << write_pos << dendl; } -int FileJournal::read_header() +int FileJournal::read_header(header_t *hdr) const { dout(10) << "read_header" << dendl; bufferlist bl; @@ -694,7 +759,7 @@ int FileJournal::read_header() try { bufferlist::iterator p = bl.begin(); - ::decode(header, p); + ::decode(*hdr, p); } catch (buffer::error& e) { derr << "read_header error decoding journal header" << dendl; @@ -709,12 +774,12 @@ int FileJournal::read_header() * remove this or else this (eventually old) code will clobber newer * code's flags. 
*/ - if (header.flags > 3) { + if (hdr->flags > 3) { derr << "read_header appears to have gibberish flags; assuming 0" << dendl; - header.flags = 0; + hdr->flags = 0; } - print_header(); + print_header(*hdr); return 0; } @@ -733,7 +798,14 @@ bufferptr FileJournal::prepare_header() return bp; } - +void FileJournal::write_header_sync() +{ + Mutex::Locker locker(write_lock); + must_write_header = true; + bufferlist bl; + do_write(bl); + dout(20) << __func__ << " finish" << dendl; +} int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size) { @@ -809,7 +881,7 @@ int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_ put_throttle(1, peek_write().bl.length()); pop_write(); } - print_header(); + print_header(header); } return -ENOSPC; // hrm, full on first op @@ -1216,7 +1288,7 @@ void FileJournal::write_thread_entry() put_throttle(1, peek_write().bl.length()); pop_write(); } - print_header(); + print_header(header); r = 0; } else { dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl; @@ -1641,7 +1713,7 @@ void FileJournal::committed_thru(uint64_t seq) } must_write_header = true; - print_header(); + print_header(header); // committed but unjournaled items while (!writeq_empty() && peek_write().seq <= seq) { @@ -1700,7 +1772,7 @@ void FileJournal::wrap_read_bl( int64_t olen, bufferlist* bl, off64_t *out_pos - ) + ) const { while (olen > 0) { while (pos >= header.max_size) @@ -1756,6 +1828,7 @@ bool FileJournal::read_entry( &seq, &ss); if (result == SUCCESS) { + journalq.push_back( pair(seq, pos)); if (next_seq > seq) { return false; } else { @@ -1768,7 +1841,7 @@ bool FileJournal::read_entry( } stringstream errss; - if (seq < header.committed_up_to) { + if (seq && seq < header.committed_up_to) { derr << "Unable to read past sequence " << seq << " but header indicates the journal has committed up through " << header.committed_up_to << ", journal is corrupt" << dendl; @@ -1793,7 +1866,7 @@ 
FileJournal::read_entry_result FileJournal::do_read_entry( bufferlist *bl, uint64_t *seq, ostream *ss, - entry_header_t *_h) + entry_header_t *_h) const { off64_t cur_pos = init_pos; bufferlist _bl; @@ -1863,11 +1936,6 @@ FileJournal::read_entry_result FileJournal::do_read_entry( if (seq) *seq = h->seq; - // works around an apparent GCC 4.8(?) compiler bug about unaligned - // bind by reference to (packed) h->seq - journalq.push_back( - pair(static_cast(h->seq), - static_cast(init_pos))); if (next_pos) *next_pos = cur_pos; diff --git a/ceph/src/os/FileJournal.h b/ceph/src/os/FileJournal.h index 574c902a..9e07b404 100644 --- a/ceph/src/os/FileJournal.h +++ b/ceph/src/os/FileJournal.h @@ -134,7 +134,7 @@ public: start = block_size; } - uint64_t get_fsid64() { + uint64_t get_fsid64() const { return *(uint64_t*)&fsid.uuid[0]; } @@ -214,6 +214,8 @@ public: } } __attribute__((__packed__, aligned(4))); + bool journalq_empty() { return journalq.empty(); } + private: string fn; @@ -294,10 +296,12 @@ private: int _open(bool wr, bool create=false); int _open_block_device(); + void _close(int fd) const; void _check_disk_write_cache() const; int _open_file(int64_t oldsize, blksize_t blksize, bool create); - void print_header(); - int read_header(); + int _dump(ostream& out, bool simple); + void print_header(const header_t &hdr) const; + int read_header(header_t *hdr) const; bufferptr prepare_header(); void start_writer(); void stop_writer(); @@ -325,7 +329,7 @@ private: int64_t len, ///< [in] length to read bufferlist* bl, ///< [out] result off64_t *out_pos ///< [out] next position to read, will be wrapped - ); + ) const; void do_discard(int64_t offset, int64_t end); @@ -349,7 +353,7 @@ private: } } write_finish_thread; - off64_t get_top() { + off64_t get_top() const { return ROUND_UP_TO(sizeof(header), block_size); } @@ -382,11 +386,24 @@ private: throttle_ops(g_ceph_context, "filestore_ops", g_conf->journal_queue_max_ops), throttle_bytes(g_ceph_context, "filestore_bytes", 
g_conf->journal_queue_max_bytes), write_lock("FileJournal::write_lock", false, true, false, g_ceph_context), - write_stop(false), - aio_stop(false), + write_stop(true), + aio_stop(true), write_thread(this), - write_finish_thread(this) { } + write_finish_thread(this) { + + if (aio && !directio) { + derr << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl; + aio = false; + } +#ifndef HAVE_LIBAIO + if (aio) { + derr << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl; + aio = false; + } +#endif + } ~FileJournal() { + assert(fd == -1); delete[] zero_buf; } @@ -397,6 +414,8 @@ private: int peek_fsid(uuid_d& fsid); int dump(ostream& out); + int simple_dump(ostream& out); + int _fdump(Formatter &f, bool simple); void flush(); @@ -414,6 +433,8 @@ private: return full_state != FULL_NOTFULL && !write_stop; } + void write_header_sync(); + void set_wait_on_full(bool b) { wait_on_full = b; } // reads @@ -446,7 +467,7 @@ private: uint64_t *seq, ///< [out] seq of successful read ostream *ss, ///< [out] error output entry_header_t *h = 0 ///< [out] header - ); ///< @return result code + ) const; ///< @return result code bool read_entry( bufferlist &bl, diff --git a/ceph/src/os/LFNIndex.cc b/ceph/src/os/LFNIndex.cc index 5d6bd7b9..5aa06283 100644 --- a/ceph/src/os/LFNIndex.cc +++ b/ceph/src/os/LFNIndex.cc @@ -89,8 +89,19 @@ int LFNIndex::created(const ghobject_t &oid, const char *path) if (r < 0) goto out; r = lfn_created(path_comp, oid, short_name); - if (r < 0) + if (r < 0) { + if (failed) { + /* This is hacky, but the only way we get ENOENT from lfn_created here is + * if we did a failure injection in _created below AND actually started the + * split or merge. In that case, lfn_created already suceeded, and + * WRAP_RETRY already cleaned it up and we are actually done. In a real + * failure, the filestore itself would have ended up calling this with + * the new path, not the old one, so we'd find it. 
+ */ + r = 0; + } goto out; + } r = _created(path_comp, oid, short_name); if (r < 0) goto out; @@ -939,10 +950,26 @@ int LFNIndex::lfn_translate(const vector &path, if (!lfn_is_hashed_filename(short_name)) { return lfn_parse_object_name(short_name, out); } - // Get lfn_attr string full_path = get_full_path(path, short_name); char attr[PATH_MAX]; - int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(), attr, sizeof(attr) - 1); + // First, check alt attr + int r = chain_getxattr( + full_path.c_str(), + get_alt_lfn_attr().c_str(), + attr, + sizeof(attr) - 1); + if (r >= 0) { + // There is an alt attr, does it match? + if (r < (int)sizeof(attr)) + attr[r] = '\0'; + if (short_name_matches(short_name.c_str(), attr)) { + string long_name(attr); + return lfn_parse_object_name(long_name, out); + } + } + + // Get lfn_attr + r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(), attr, sizeof(attr) - 1); if (r < 0) return -errno; if (r < (int)sizeof(attr)) @@ -1299,6 +1326,28 @@ void LFNIndex::build_filename(const char *old_filename, int i, char *filename, i } } +bool LFNIndex::short_name_matches(const char *short_name, const char *cand_long_name) +{ + const char *end = short_name; + while (*end) ++end; + const char *suffix = end; + if (suffix > short_name) --suffix; // last char + while (suffix > short_name && *suffix != '_') --suffix; // back to first _ + if (suffix > short_name) --suffix; // one behind that + while (suffix > short_name && *suffix != '_') --suffix; // back to second _ + + int index = -1; + char buf[FILENAME_SHORT_LEN + 4]; + assert((end - suffix) < (int)sizeof(buf)); + int r = sscanf(suffix, "_%d_%s", &index, buf); + if (r < 2) + return false; + if (strcmp(buf, FILENAME_COOKIE.c_str()) != 0) + return false; + build_filename(cand_long_name, index, buf, sizeof(buf)); + return strcmp(short_name, buf) == 0; +} + string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i) { string long_name = lfn_generate_object_name(oid); diff --git 
a/ceph/src/os/LFNIndex.h b/ceph/src/os/LFNIndex.h index 5cd35238..f3e5e437 100644 --- a/ceph/src/os/LFNIndex.h +++ b/ceph/src/os/LFNIndex.h @@ -576,6 +576,12 @@ private: const string &attr ///< [in] Attribute to mangle. ); ///< @return Mangled attribute name. + /// checks whether long_name could hash to short_name + bool short_name_matches( + const char *short_name, ///< [in] name to check against + const char *cand_long_name ///< [in] candidate long name + ); + /// Builds hashed filename void build_filename( const char *old_filename, ///< [in] Filename to convert. diff --git a/ceph/src/os/LevelDBStore.cc b/ceph/src/os/LevelDBStore.cc index 454fafb6..de458d16 100644 --- a/ceph/src/os/LevelDBStore.cc +++ b/ceph/src/os/LevelDBStore.cc @@ -74,12 +74,6 @@ int LevelDBStore::do_open(ostream &out, bool create_if_missing) return -EINVAL; } - if (g_conf->leveldb_compact_on_mount) { - derr << "Compacting leveldb store..." << dendl; - compact(); - derr << "Finished compacting leveldb store" << dendl; - } - PerfCountersBuilder plb(g_ceph_context, "leveldb", l_leveldb_first, l_leveldb_last); plb.add_u64_counter(l_leveldb_gets, "leveldb_get"); plb.add_u64_counter(l_leveldb_txns, "leveldb_transaction"); @@ -89,6 +83,12 @@ int LevelDBStore::do_open(ostream &out, bool create_if_missing) plb.add_u64(l_leveldb_compact_queue_len, "leveldb_compact_queue_len"); logger = plb.create_perf_counters(); cct->get_perfcounters_collection()->add(logger); + + if (g_conf->leveldb_compact_on_mount) { + derr << "Compacting leveldb store..." 
<< dendl; + compact(); + derr << "Finished compacting leveldb store" << dendl; + } return 0; } diff --git a/ceph/src/osd/ECBackend.cc b/ceph/src/osd/ECBackend.cc index 3b517402..8f2b571a 100644 --- a/ceph/src/osd/ECBackend.cc +++ b/ceph/src/osd/ECBackend.cc @@ -361,7 +361,17 @@ void ECBackend::handle_recovery_read_complete( from[i->first.shard].claim(i->second); } dout(10) << __func__ << ": " << from << dendl; - ECUtil::decode(sinfo, ec_impl, from, target); + if (ECUtil::decode(sinfo, ec_impl, from, target) != 0) { + derr << __func__ << ": inconsistent shard sizes " << hoid << " " + << " the offending shard must be manually removed " + << " after verifying there are enough shards to recover " + << "(" << to_read.get<0>() + << ", " << to_read.get<1>() + << ", " << to_read.get<2>() + << ")" + << dendl; + assert(0); + } if (attrs) { op.xattrs.swap(*attrs); @@ -1580,7 +1590,6 @@ void ECBackend::start_write(Op *op) { op->on_local_applied_sync = 0; } else { MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop); - r->set_priority(cct->_conf->osd_client_op_priority); r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard); r->map_epoch = get_parent()->get_epoch(); get_parent()->send_message_osd_cluster( @@ -1716,7 +1725,7 @@ void ECBackend::objects_read_async( c))); start_read_op( - cct->_conf->osd_client_op_priority, + CEPH_MSG_PRIO_DEFAULT, for_read_op, OpRequestRef()); return; diff --git a/ceph/src/osd/ECUtil.cc b/ceph/src/osd/ECUtil.cc index 1f3b4585..efc57b5e 100644 --- a/ceph/src/osd/ECUtil.cc +++ b/ceph/src/osd/ECUtil.cc @@ -56,7 +56,8 @@ int ECUtil::decode( for (map::iterator i = to_decode.begin(); i != to_decode.end(); ++i) { - assert(i->second.length() == total_chunk_size); + if (i->second.length() != total_chunk_size) + return -EINVAL; } if (total_chunk_size == 0) diff --git a/ceph/src/osd/OSD.cc b/ceph/src/osd/OSD.cc index f121a3df..fc9674d0 100644 --- a/ceph/src/osd/OSD.cc +++ b/ceph/src/osd/OSD.cc @@ -210,6 +210,7 @@ OSDService::OSDService(OSD *osd) : 
pg_epoch_lock("OSDService::pg_epoch_lock"), publish_lock("OSDService::publish_lock"), pre_publish_lock("OSDService::pre_publish_lock"), + max_oldest_map(0), peer_map_epoch_lock("OSDService::peer_map_epoch_lock"), sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0), scrubs_active(0), @@ -1031,7 +1032,7 @@ MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to, OSDSuperblock& sblock) { MOSDMap *m = new MOSDMap(monc->get_fsid()); - m->oldest_map = sblock.oldest_map; + m->oldest_map = max_oldest_map.read(); m->newest_map = sblock.newest_map; for (epoch_t e = to; e > since; e--) { @@ -1071,7 +1072,7 @@ void OSDService::send_incremental_map(epoch_t since, Connection *con, if (since < sblock.oldest_map) { // just send latest full map MOSDMap *m = new MOSDMap(monc->get_fsid()); - m->oldest_map = sblock.oldest_map; + m->oldest_map = max_oldest_map.read(); m->newest_map = sblock.newest_map; get_map_bl(to, m->maps[to]); send_map(m, con); @@ -1809,6 +1810,9 @@ int OSD::init() dout(2) << "boot" << dendl; + int rotating_auth_attempts = 0; + const int max_rotating_auth_attempts = 10; + // read superblock r = read_superblock(); if (r < 0) { @@ -1936,6 +1940,7 @@ int OSD::init() service.init(); service.publish_map(osdmap); service.publish_superblock(superblock); + service.max_oldest_map.set(superblock.oldest_map); osd_lock.Unlock(); @@ -1949,6 +1954,14 @@ int OSD::init() while (monc->wait_auth_rotating(30.0) < 0) { derr << "unable to obtain rotating service keys; retrying" << dendl; + ++rotating_auth_attempts; + if (rotating_auth_attempts > max_rotating_auth_attempts) { + osd_lock.Lock(); // make locker happy + if (!is_stopping()) { + r = - ETIMEDOUT; + } + goto monout; + } } osd_lock.Lock(); @@ -2090,6 +2103,13 @@ void OSD::final_init() test_ops_hook, "inject metadata error"); assert(r == 0); + r = admin_socket->register_command( + "set_recovery_delay", + "set_recovery_delay " \ + "name=utime,type=CephInt,req=false", + test_ops_hook, + "Delay osd 
recovery by specified seconds"); + assert(r == 0); } void OSD::create_logger() @@ -2323,6 +2343,7 @@ int OSD::shutdown() cct->get_admin_socket()->unregister_command("truncobj"); cct->get_admin_socket()->unregister_command("injectdataerr"); cct->get_admin_socket()->unregister_command("injectmdataerr"); + cct->get_admin_socket()->unregister_command("set_recovery_delay"); delete test_ops_hook; test_ops_hook = NULL; @@ -2840,7 +2861,7 @@ void OSD::load_pgs() derr << __func__ << ": could not find map for epoch " << map_epoch << " on pg " << pgid << ", but the pool is not present in the " << "current map, so this is probably a result of bug 10617. " - << "Skipping the pg for now, you can use ceph_objectstore_tool " + << "Skipping the pg for now, you can use ceph-objectstore-tool " << "to clean it up later." << dendl; continue; } else { @@ -2965,8 +2986,11 @@ void OSD::build_past_intervals_parallel() PG *pg = i->second; epoch_t start, end; - if (!pg->_calc_past_interval_range(&start, &end, superblock.oldest_map)) + if (!pg->_calc_past_interval_range(&start, &end, superblock.oldest_map)) { + if (pg->info.history.same_interval_since == 0) + pg->info.history.same_interval_since = end; continue; + } dout(10) << pg->info.pgid << " needs " << start << "-" << end << dendl; pistate& p = pis[pg]; @@ -3052,6 +3076,21 @@ void OSD::build_past_intervals_parallel() } } + // Now that past_intervals have been recomputed let's fix the same_interval_since + // if it was cleared by import. + for (map::iterator i = pis.begin(); i != pis.end(); ++i) { + PG *pg = i->first; + pistate& p = i->second; + + if (pg->info.history.same_interval_since == 0) { + assert(p.same_interval_since); + dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl; + dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl; + // Fix it + pg->info.history.same_interval_since = p.same_interval_since; + } + } + // write info only at the end. 
this is necessary because we check // whether the past_intervals go far enough back or forward in time, // but we don't check for holes. we could avoid it by discarding @@ -4011,6 +4050,8 @@ void OSD::check_ops_in_flight() // truncobj [namespace/] // injectmdataerr [namespace/] // injectdataerr [namespace/] +// +// set_recovery_delay [utime] void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, std::string command, cmdmap_t& cmdmap, ostream &ss) { @@ -4133,6 +4174,24 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, } return; } + if (command == "set_recovery_delay") { + int64_t delay; + cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0); + ostringstream oss; + oss << delay; + int r = service->cct->_conf->set_val("osd_recovery_delay_start", + oss.str().c_str()); + if (r != 0) { + ss << "set_recovery_delay: error setting " + << "osd_recovery_delay_start to '" << delay << "': error " + << r; + return; + } + service->cct->_conf->apply_changes(NULL); + ss << "set_recovery_delay: set osd_recovery_delay_start " + << "to " << service->cct->_conf->osd_recovery_delay_start; + return; + } ss << "Internal error - command=" << command; return; } @@ -5382,6 +5441,7 @@ void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap) } else { register_session_waiting_on_map(session); } + session->maybe_reset_osdmap(); } @@ -5460,6 +5520,7 @@ void OSD::session_notify_pg_cleared( assert(session->session_dispatch_lock.is_locked()); update_waiting_for_pg(session, osdmap); session->waiting_for_pg.erase(pgid); + session->maybe_reset_osdmap(); clear_session_waiting_on_pg(session, pgid); } @@ -6106,6 +6167,48 @@ void OSD::osdmap_subscribe(version_t epoch, bool force_request) } } +void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps) +{ + epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound()); + if (superblock.oldest_map >= min) + return; + + int num = 0; + ObjectStore::Transaction *t = NULL; + 
for (epoch_t e = superblock.oldest_map; e < min; ++e) { + dout(20) << " removing old osdmap epoch " << e << dendl; + if (!t) { + t = new ObjectStore::Transaction; + } + t->remove(META_COLL, get_osdmap_pobject_name(e)); + t->remove(META_COLL, get_inc_osdmap_pobject_name(e)); + superblock.oldest_map = e + 1; + num++; + if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) { + service.publish_superblock(superblock); + write_superblock(*t); + store->queue_transaction_and_cleanup(NULL, t); + t = NULL; + num = 0; + if (!skip_maps) { + // skip_maps leaves us with a range of old maps if we fail to remove all + // of them before moving superblock.oldest_map forward to the first map + // in the incoming MOSDMap msg. so we should continue removing them in + // this case, even we could do huge series of delete transactions all at + // once. + break; + } + } + } + if (num > 0) { + service.publish_superblock(superblock); + write_superblock(*t); + store->queue_transaction_and_cleanup(NULL, t); + } + // we should not remove the cached maps + assert(min <= service.map_cache.cached_key_lower_bound()); +} + void OSD::handle_osd_map(MOSDMap *m) { assert(osd_lock.is_locked()); @@ -6147,6 +6250,10 @@ void OSD::handle_osd_map(MOSDMap *m) logger->inc(l_osd_mape, last - first + 1); if (first <= osdmap->get_epoch()) logger->inc(l_osd_mape_dup, osdmap->get_epoch() - first + 1); + if (service.max_oldest_map.read() < m->oldest_map) { + service.max_oldest_map.set(m->oldest_map); + assert(service.max_oldest_map.read() >= superblock.oldest_map); + } // make sure there is something new, here, before we bother flushing the queues and such if (last <= osdmap->get_epoch()) { @@ -6243,6 +6350,9 @@ void OSD::handle_osd_map(MOSDMap *m) << " but failed to encode full with correct crc; requesting" << dendl; clog->warn() << "failed to encode map e" << e << " with expected crc\n"; + dout(20) << "my encoded map was:\n"; + fbl.hexdump(*_dout); + *_dout << dendl; delete o; MMonGetOSDMap *req 
= new MMonGetOSDMap; req->request_full(e, last); @@ -6273,20 +6383,8 @@ void OSD::handle_osd_map(MOSDMap *m) } if (superblock.oldest_map) { - int num = 0; - epoch_t min( - MIN(m->oldest_map, - service.map_cache.cached_key_lower_bound())); - for (epoch_t e = superblock.oldest_map; e < min; ++e) { - dout(20) << " removing old osdmap epoch " << e << dendl; - t.remove(META_COLL, get_osdmap_pobject_name(e)); - t.remove(META_COLL, get_inc_osdmap_pobject_name(e)); - superblock.oldest_map = e+1; - num++; - if (num >= cct->_conf->osd_target_transaction_size && - (uint64_t)num > (last - first)) // make sure we at least keep pace with incoming maps - break; - } + // make sure we at least keep pace with incoming maps + trim_maps(m->oldest_map, last - first + 1, skip_maps); } if (!superblock.oldest_map || skip_maps) @@ -6706,11 +6804,7 @@ void OSD::consume_map() for (set::iterator p = pgs_to_check.begin(); p != pgs_to_check.end(); ++p) { - vector acting; - int nrep = osdmap->pg_to_acting_osds(p->pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - if (role < 0) { + if (!(osdmap->is_acting_osd_shard(p->pgid, whoami, p->shard))) { set concerned_sessions; get_sessions_possibly_interested_in_pg(*p, &concerned_sessions); for (set::iterator i = concerned_sessions.begin(); diff --git a/ceph/src/osd/OSD.h b/ceph/src/osd/OSD.h index 3cc8df05..bfc6ff93 100644 --- a/ceph/src/osd/OSD.h +++ b/ceph/src/osd/OSD.h @@ -387,6 +387,7 @@ public: int get_nodeid() const { return whoami; } + atomic_t max_oldest_map; OSDMapRef osdmap; OSDMapRef get_osdmap() { Mutex::Locker l(publish_lock); @@ -1160,8 +1161,11 @@ public: sent_epoch_lock("Session::sent_epoch_lock"), last_sent_epoch(0), received_map_lock("Session::received_map_lock"), received_map_epoch(0) {} - - + void maybe_reset_osdmap() { + if (waiting_for_pg.empty()) { + osdmap.reset(); + } + } }; void update_waiting_for_pg(Session *session, OSDMapRef osdmap); void session_notify_pg_create(Session *session, OSDMapRef osdmap, 
spg_t pgid); @@ -1265,6 +1269,7 @@ public: */ session->waiting_on_map.clear(); session->waiting_for_pg.clear(); + session->osdmap.reset(); } void register_session_waiting_on_pg(Session *session, spg_t pgid) { Mutex::Locker l(session_waiting_lock); @@ -1658,6 +1663,7 @@ private: void wait_for_new_map(OpRequestRef op); void handle_osd_map(class MOSDMap *m); + void trim_maps(epoch_t oldest, int nreceived, bool skip_maps); void note_down_osd(int osd); void note_up_osd(int osd); diff --git a/ceph/src/osd/OSDMap.cc b/ceph/src/osd/OSDMap.cc index 173b468b..f7f37d74 100644 --- a/ceph/src/osd/OSDMap.cc +++ b/ceph/src/osd/OSDMap.cc @@ -2873,3 +2873,168 @@ int OSDMap::build_simple_crush_rulesets(CephContext *cct, // require the crush_v2 feature of clients return 0; } + +int OSDMap::summarize_mapping_stats( + OSDMap *newmap, + const set *pools, + std::string *out, + Formatter *f) const +{ + set ls; + if (pools) { + ls = *pools; + } else { + for (map::const_iterator i = get_pools().begin(); + i != get_pools().end(); + ++i) { + ls.insert(i->first); + } + } + + unsigned total_pg = 0; + unsigned moved_pg = 0; + vector base_by_osd(get_max_osd(), 0); + vector new_by_osd(get_max_osd(), 0); + for (set::iterator p = ls.begin(); p != ls.end(); ++p) { + int64_t pool_id = *p; + const pg_pool_t *pi = get_pg_pool(pool_id); + vector up, up2, acting; + int up_primary, acting_primary; + for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) { + pg_t pgid(ps, pool_id, -1); + total_pg += pi->get_size(); + pg_to_up_acting_osds(pgid, &up, &up_primary, + &acting, &acting_primary); + for (vector::iterator q = up.begin(); q != up.end(); ++q) { + int osd = *q; + if (osd >= 0 && osd < get_max_osd()) + ++base_by_osd[osd]; + } + if (newmap) { + newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, + &acting, &acting_primary); + for (vector::iterator q = up2.begin(); q != up2.end(); ++q) { + int osd = *q; + if (osd >= 0 && osd < get_max_osd()) + ++new_by_osd[osd]; + } + if (pi->type == 
pg_pool_t::TYPE_ERASURE) { + for (unsigned i=0; itype == pg_pool_t::TYPE_REPLICATED) { + for (vector::iterator q = up.begin(); q != up.end(); ++q) { + int osd = *q; + if (std::find(up2.begin(), up2.end(), osd) == up2.end()) { + ++moved_pg; + } + } + } else { + assert(0 == "unhandled pool type"); + } + } + } + } + + unsigned num_up_in = 0; + for (int osd = 0; osd < get_max_osd(); ++osd) { + if (is_up(osd) && is_in(osd)) + ++num_up_in; + } + if (!num_up_in) { + return -EINVAL; + } + + float avg_pg = (float)total_pg / (float)num_up_in; + float base_stddev = 0, new_stddev = 0; + int min = -1, max = -1; + unsigned min_base_pg = 0, max_base_pg = 0; + unsigned min_new_pg = 0, max_new_pg = 0; + for (int osd = 0; osd < get_max_osd(); ++osd) { + if (is_up(osd) && is_in(osd)) { + float base_diff = (float)base_by_osd[osd] - avg_pg; + base_stddev += base_diff * base_diff; + float new_diff = (float)new_by_osd[osd] - avg_pg; + new_stddev += new_diff * new_diff; + if (min < 0 || min_base_pg < base_by_osd[osd]) { + min = osd; + min_base_pg = base_by_osd[osd]; + min_new_pg = new_by_osd[osd]; + } + if (max < 0 || max_base_pg > base_by_osd[osd]) { + max = osd; + max_base_pg = base_by_osd[osd]; + max_new_pg = new_by_osd[osd]; + } + } + } + base_stddev = sqrt(base_stddev / num_up_in); + new_stddev = sqrt(new_stddev / num_up_in); + + float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in))); + + ostringstream ss; + if (f) + f->open_object_section("utilization"); + if (newmap) { + if (f) { + f->dump_unsigned("moved_pgs", moved_pg); + f->dump_unsigned("total_pgs", total_pg); + } else { + ss << "moved " << moved_pg << " / " << total_pg + << " (" << ((float)moved_pg * 100.0 / (float)total_pg) << "%)\n"; + } + } + if (f) { + f->dump_float("avg_pgs", avg_pg); + f->dump_float("std_dev", base_stddev); + f->dump_float("expected_baseline_std_dev", edev); + if (newmap) + f->dump_float("new_std_dev", new_stddev); + } else { + ss << "avg " << avg_pg << "\n"; + ss << "stddev " << base_stddev; + if 
(newmap) + ss << " -> " << new_stddev; + ss << " (expected baseline " << edev << ")\n"; + } + if (min >= 0) { + if (f) { + f->dump_unsigned("min_osd", min); + f->dump_unsigned("min_osd_pgs", min_base_pg); + if (newmap) + f->dump_unsigned("new_min_osd_pgs", min_new_pg); + } else { + ss << "min osd." << min << " with " << min_base_pg; + if (newmap) + ss << " -> " << min_new_pg; + ss << " pgs (" << (float)min_base_pg / avg_pg; + if (newmap) + ss << " -> " << (float)min_new_pg / avg_pg; + ss << " * mean)\n"; + } + } + if (max >= 0) { + if (f) { + f->dump_unsigned("max_osd", max); + f->dump_unsigned("max_osd_pgs", max_base_pg); + if (newmap) + f->dump_unsigned("new_max_osd_pgs", max_new_pg); + } else { + ss << "max osd." << max << " with " << max_base_pg; + if (newmap) + ss << " -> " << max_new_pg; + ss << " pgs (" << (float)max_base_pg / avg_pg; + if (newmap) + ss << " -> " << (float)max_new_pg / avg_pg; + ss << " * mean)\n"; + } + } + if (f) + f->close_section(); + if (out) + *out = ss.str(); + return 0; +} diff --git a/ceph/src/osd/OSDMap.h b/ceph/src/osd/OSDMap.h index acdc3af4..272bd7d0 100644 --- a/ceph/src/osd/OSDMap.h +++ b/ceph/src/osd/OSDMap.h @@ -293,6 +293,9 @@ public: pg_temp.reset(new map >(*o.pg_temp)); osd_uuid.reset(new vector(*o.osd_uuid)); + if (o.osd_primary_affinity) + osd_primary_affinity.reset(new vector<__u32>(*o.osd_primary_affinity)); + // NOTE: this still references shared entity_addr_t's. osd_addrs.reset(new addrs_s(*o.osd_addrs)); @@ -768,6 +771,15 @@ public: return group[group.size()-1]; return -1; // we fail! } + bool is_acting_osd_shard(pg_t pg, int osd, shard_id_t shard) const { + vector acting; + int nrep = pg_to_acting_osds(pg, acting); + if (shard == shard_id_t::NO_SHARD) + return calc_pg_role(osd, acting, nrep) >= 0; + if (shard >= (int)acting.size()) + return false; + return acting[shard] == osd; + } /* what replica # is a given osd? 0 primary, -1 for none. 
*/ @@ -848,6 +860,12 @@ public: void print_oneline_summary(ostream& out) const; void print_tree(ostream *out, Formatter *f) const; + int summarize_mapping_stats( + OSDMap *newmap, + const set *pools, + std::string *out, + Formatter *f) const; + string get_flag_string() const; static string get_flag_string(unsigned flags); static void dump_erasure_code_profiles(const map > &profiles, diff --git a/ceph/src/osd/PG.cc b/ceph/src/osd/PG.cc index 634dc565..b8e7f274 100644 --- a/ceph/src/osd/PG.cc +++ b/ceph/src/osd/PG.cc @@ -158,8 +158,18 @@ void PGPool::update(OSDMapRef map) name = map->get_pool_name(id); if (pi->get_snap_epoch() == map->get_epoch()) { pi->build_removed_snaps(newly_removed_snaps); - newly_removed_snaps.subtract(cached_removed_snaps); - cached_removed_snaps.union_of(newly_removed_snaps); + interval_set intersection; + intersection.intersection_of(newly_removed_snaps, cached_removed_snaps); + if (intersection == cached_removed_snaps) { + newly_removed_snaps.subtract(cached_removed_snaps); + cached_removed_snaps.union_of(newly_removed_snaps); + } else { + lgeneric_subdout(g_ceph_context, osd, 0) << __func__ + << " cached_removed_snaps shrank from " << cached_removed_snaps + << " to " << newly_removed_snaps << dendl; + cached_removed_snaps = newly_removed_snaps; + newly_removed_snaps.clear(); + } snapc = pi->get_snap_context(); } else { newly_removed_snaps.clear(); @@ -647,7 +657,12 @@ bool PG::needs_backfill() const bool PG::_calc_past_interval_range(epoch_t *start, epoch_t *end, epoch_t oldest_map) { - *end = info.history.same_interval_since; + if (info.history.same_interval_since) { + *end = info.history.same_interval_since; + } else { + // PG must be imported, so let's calculate the whole range. + *end = osdmap_ref->get_epoch(); + } // Do we already have the intervals we want? 
map::const_iterator pif = past_intervals.begin(); @@ -678,6 +693,8 @@ void PG::generate_past_intervals() epoch_t cur_epoch, end_epoch; if (!_calc_past_interval_range(&cur_epoch, &end_epoch, osd->get_superblock().oldest_map)) { + if (info.history.same_interval_since == 0) + info.history.same_interval_since = end_epoch; return; } @@ -732,6 +749,15 @@ void PG::generate_past_intervals() } } + // PG import needs recalculated same_interval_since + if (info.history.same_interval_since == 0) { + assert(same_interval_since); + dout(10) << __func__ << " fix same_interval_since " << same_interval_since << " pg " << *this << dendl; + dout(10) << __func__ << " past_intervals " << past_intervals << dendl; + // Fix it + info.history.same_interval_since = same_interval_since; + } + // record our work. dirty_info = true; dirty_big_info = true; @@ -1550,7 +1576,16 @@ void PG::activate(ObjectStore::Transaction& t, dout(20) << "activate - purged_snaps " << info.purged_snaps << " cached_removed_snaps " << pool.cached_removed_snaps << dendl; snap_trimq = pool.cached_removed_snaps; - snap_trimq.subtract(info.purged_snaps); + interval_set intersection; + intersection.intersection_of(snap_trimq, info.purged_snaps); + if (intersection == info.purged_snaps) { + snap_trimq.subtract(info.purged_snaps); + } else { + dout(0) << "warning: info.purged_snaps (" << info.purged_snaps + << ") is not a subset of pool.cached_removed_snaps (" + << pool.cached_removed_snaps << ")" << dendl; + snap_trimq.subtract(intersection); + } dout(10) << "activate - snap_trimq " << snap_trimq << dendl; if (!snap_trimq.empty() && is_clean()) queue_snap_trim(); @@ -2153,10 +2188,12 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) // Info child->info.history = info.history; + child->info.history.epoch_created = get_osdmap()->get_epoch(); child->info.purged_snaps = info.purged_snaps; child->info.last_backfill = info.last_backfill; child->info.stats = info.stats; + 
child->info.stats.parent_split_bits = split_bits; info.stats.stats_invalid = true; child->info.stats.stats_invalid = true; child->info.last_epoch_started = info.last_epoch_started; @@ -2179,6 +2216,12 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) if (get_primary() != child->get_primary()) child->info.history.same_primary_since = get_osdmap()->get_epoch(); + child->info.stats.up = up; + child->info.stats.up_primary = up_primary; + child->info.stats.acting = acting; + child->info.stats.acting_primary = primary; + child->info.stats.mapping_epoch = get_osdmap()->get_epoch(); + // History child->past_intervals = past_intervals; @@ -3600,8 +3643,18 @@ void PG::_scan_snaps(ScrubMap &smap) if (hoid.snap < CEPH_MAXSNAP) { // fake nlinks for old primaries bufferlist bl; + if (o.attrs.find(OI_ATTR) == o.attrs.end()) { + o.nlinks = 0; + continue; + } bl.push_back(o.attrs[OI_ATTR]); - object_info_t oi(bl); + object_info_t oi; + try { + oi = bl; + } catch(...) { + o.nlinks = 0; + continue; + } if (oi.snaps.empty()) { // Just head o.nlinks = 1; @@ -3715,15 +3768,20 @@ void PG::repair_object( assert(waiting_for_unreadable_object.empty()); pg_log.missing_add(soid, oi.version, eversion_t()); + + pg_log.set_last_requested(0); + dout(10) << __func__ << ": primary = " << primary << dendl; + } + + if (is_ec_pg() || bad_peer == primary) { + // we'd better collect all shard for EC pg, and prepare good peers as the + // source of pull in the case of replicated pg. 
missing_loc.add_missing(soid, oi.version, eversion_t()); list >::iterator i; for (i = ok_peers->begin(); - i != ok_peers->end(); - ++i) + i != ok_peers->end(); + ++i) missing_loc.add_location(soid, i->second); - - pg_log.set_last_requested(0); - dout(10) << __func__ << ": primary = " << primary << dendl; } } @@ -4759,6 +4817,7 @@ void PG::start_peering_interval( info.history.same_interval_since = osdmap->get_epoch(); } else { std::stringstream debug; + assert(info.history.same_interval_since != 0); boost::scoped_ptr recoverable( get_is_recoverable_predicate()); bool new_interval = pg_interval_t::check_new_interval( diff --git a/ceph/src/osd/PG.h b/ceph/src/osd/PG.h index e06b9102..935e7444 100644 --- a/ceph/src/osd/PG.h +++ b/ceph/src/osd/PG.h @@ -361,10 +361,6 @@ public: return ret; } - const map &get_all_missing() const { - return needs_recovery_map; - } - void clear() { needs_recovery_map.clear(); missing_loc.clear(); diff --git a/ceph/src/osd/PGBackend.cc b/ceph/src/osd/PGBackend.cc index 2b897d7f..ef8012df 100644 --- a/ceph/src/osd/PGBackend.cc +++ b/ceph/src/osd/PGBackend.cc @@ -664,7 +664,7 @@ void PGBackend::be_compare_scrubmaps( } if (auth_object.digest_present && auth_object.omap_digest_present && - (!auth_oi.is_data_digest() || !auth_oi.is_omap_digest())) { + (!auth_oi.is_data_digest() || (!auth_oi.is_omap_digest() && auth_oi.is_omap()))) { dout(20) << __func__ << " missing digest on " << *k << dendl; update = MAYBE; } diff --git a/ceph/src/osd/PGLog.cc b/ceph/src/osd/PGLog.cc index b619bcd9..8521af9e 100644 --- a/ceph/src/osd/PGLog.cc +++ b/ceph/src/osd/PGLog.cc @@ -526,6 +526,9 @@ void PGLog::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead if (info.last_complete > newhead) info.last_complete = newhead; + if (log.rollback_info_trimmed_to > newhead) + log.rollback_info_trimmed_to = newhead; + log.index(); map new_priors; @@ -742,10 +745,12 @@ void PGLog::write_log( ObjectStore::Transaction& t, const coll_t& coll, const ghobject_t 
&log_oid) { if (is_dirty()) { - dout(10) << "write_log with: " + dout(5) << "write_log with: " << "dirty_to: " << dirty_to << ", dirty_from: " << dirty_from - << ", dirty_divergent_priors: " << dirty_divergent_priors + << ", dirty_divergent_priors: " + << (dirty_divergent_priors ? "true" : "false") + << ", divergent_priors: " << divergent_priors.size() << ", writeout_from: " << writeout_from << ", trimmed: " << trimmed << dendl; diff --git a/ceph/src/osd/ReplicatedBackend.cc b/ceph/src/osd/ReplicatedBackend.cc index 64815739..5136fa58 100644 --- a/ceph/src/osd/ReplicatedBackend.cc +++ b/ceph/src/osd/ReplicatedBackend.cc @@ -1193,8 +1193,6 @@ void ReplicatedBackend::sub_op_modify_impl(OpRequestRef op) rm->bytes_written = rm->opt.get_encoded_bytes(); - op->mark_started(); - rm->localt.append(rm->opt); rm->localt.register_on_commit( parent->bless_context( diff --git a/ceph/src/osd/ReplicatedPG.cc b/ceph/src/osd/ReplicatedPG.cc index 2c3f12dd..1676a3e3 100644 --- a/ceph/src/osd/ReplicatedPG.cc +++ b/ceph/src/osd/ReplicatedPG.cc @@ -1135,7 +1135,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op) p != info.hit_set.history.end(); ++p) { if (stamp >= p->begin && stamp <= p->end) { - oid = get_hit_set_archive_object(p->begin, p->end); + oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); break; } } @@ -1805,6 +1805,7 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op, bool can_proxy_read = get_osdmap()->get_up_osd_features() & CEPH_FEATURE_OSD_PROXY_FEATURES; OpRequestRef promote_op; + bool did_proxy_read = false; switch (pool.info.cache_mode) { case pg_pool_t::CACHEMODE_WRITEBACK: @@ -1832,10 +1833,12 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op, return true; } - if (can_proxy_read) + if (can_proxy_read) { do_proxy_read(op); - else + did_proxy_read = true; + } else { promote_op = op; // for non-proxy case promote_object needs this + } // Avoid duplicate promotion if (obc.get() && obc->is_blocked()) { @@ -1877,7 +1880,7 @@ bool 
ReplicatedPG::maybe_handle_cache(OpRequestRef op, promote_object(obc, missing_oid, oloc, promote_op); } else { // not promoting - return false; + return did_proxy_read; } break; } @@ -2677,13 +2680,19 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid) object_info_t &coi = obc->obs.oi; set old_snaps(coi.snaps.begin(), coi.snaps.end()); - assert(old_snaps.size()); + if (old_snaps.empty()) { + osd->clog->error() << __func__ << " No object info snaps for " << coid << "\n"; + return NULL; + } SnapSet& snapset = obc->ssc->snapset; dout(10) << coid << " old_snaps " << old_snaps << " old snapset " << snapset << dendl; - assert(snapset.seq); + if (snapset.seq == 0) { + osd->clog->error() << __func__ << " No snapset.seq for " << coid << "\n"; + return NULL; + } RepGather *repop = simple_repop_create(obc); OpContext *ctx = repop->ctx; @@ -2712,7 +2721,10 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid) for (p = snapset.clones.begin(); p != snapset.clones.end(); ++p) if (*p == last) break; - assert(p != snapset.clones.end()); + if (p == snapset.clones.end()) { + osd->clog->error() << __func__ << " Snap " << coid.snap << " not in clones" << "\n"; + return NULL; + } object_stat_sum_t delta; delta.num_bytes -= snapset.get_clone_bytes(last); @@ -4209,8 +4221,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges, op.extent.offset, op.extent.length, true); maybe_create_new_object(ctx); - if (op.extent.offset == 0 && op.extent.length == oi.size) + if (op.extent.offset == 0 && op.extent.length >= oi.size) obs.oi.set_data_digest(osd_op.indata.crc32c(-1)); + else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) + obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest)); else obs.oi.clear_data_digest(); } @@ -6272,10 +6286,11 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r) return; } - if 
(cop->omap_data.length()) + if (cop->omap_data.length() || cop->omap_header.length()) cop->results.has_omap = true; - if (r >= 0 && pool.info.require_rollback() && cop->omap_data.length()) { + if (r >= 0 && pool.info.require_rollback() && + (cop->omap_data.length() || cop->omap_header.length())) { r = -EOPNOTSUPP; } cop->objecter_tid = 0; @@ -7692,6 +7707,10 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch) ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref dout(10) << "handle_watch_timeout obc " << obc << dendl; + if (!is_active()) { + dout(10) << "handle_watch_timeout not active, no-op" << dendl; + return; + } if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) { callbacks_for_degraded_object[obc->obs.oi.soid].push_back( watch->get_delayed_cb() @@ -8556,9 +8575,9 @@ void ReplicatedPG::mark_all_unfound_lost(int what) info.last_update.epoch = get_osdmap()->get_epoch(); const pg_missing_t &missing = pg_log.get_missing(); map::const_iterator m = - missing_loc.get_all_missing().begin(); + missing_loc.get_needs_recovery().begin(); map::const_iterator mend = - missing_loc.get_all_missing().end(); + missing_loc.get_needs_recovery().end(); while (m != mend) { const hobject_t &oid(m->first); if (!missing_loc.is_unfound(oid)) { @@ -10158,10 +10177,19 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp) return hoid; } -hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end) +hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, + utime_t end, + bool using_gmt) { ostringstream ss; - ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end; + ss << "hit_set_" << info.pgid.pgid << "_archive_"; + if (using_gmt) { + start.gmtime(ss) << "_"; + end.gmtime(ss); + } else { + start.localtime(ss) << "_"; + end.localtime(ss); + } hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "", info.pgid.ps(), info.pgid.pool(), cct->_conf->osd_hit_set_namespace); @@ -10298,7 +10326,7 @@ void 
ReplicatedPG::hit_set_persist() for (list::iterator p = info.hit_set.history.begin(); p != info.hit_set.history.end(); ++p) { - hobject_t aoid = get_hit_set_archive_object(p->begin, p->end); + hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); // Once we hit a degraded object just skip further trim if (is_degraded_or_backfilling_object(aoid)) @@ -10307,10 +10335,8 @@ void ReplicatedPG::hit_set_persist() return; } - oid = get_hit_set_archive_object(start, now); + oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset); // If the current object is degraded we skip this persist request - if (is_degraded_or_backfilling_object(oid)) - return; if (scrubber.write_blocked_by_scrub(oid)) return; @@ -10401,7 +10427,7 @@ void ReplicatedPG::hit_set_persist() updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info); hit_set_create(); - updated_hit_set_hist.current_info = pg_hit_set_info_t(); + updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset); updated_hit_set_hist.current_last_stamp = utime_t(); // fabricate an object_info_t and SnapSet @@ -10464,7 +10490,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max) for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) { list::iterator p = updated_hit_set_hist.history.begin(); assert(p != updated_hit_set_hist.history.end()); - hobject_t oid = get_hit_set_archive_object(p->begin, p->end); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); assert(!is_degraded_or_backfilling_object(oid)); @@ -10749,7 +10775,7 @@ void ReplicatedPG::agent_load_hit_sets() continue; } - hobject_t oid = get_hit_set_archive_object(p->begin, p->end); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); if (is_unreadable_object(oid)) { dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl; break; @@ -11233,6 +11259,85 @@ void ReplicatedPG::_scrub_digest_updated() } } +static 
bool doing_clones(const boost::optional &snapset, + const vector::reverse_iterator &curclone) { + return snapset && curclone != snapset.get().clones.rend(); +} + +void ReplicatedPG::log_missing(unsigned missing, + const boost::optional &head, + LogChannelRef clog, + const spg_t &pgid, + const char *func, + const char *mode, + bool allow_incomplete_clones) +{ + assert(head); + if (allow_incomplete_clones) { + dout(20) << func << " " << mode << " " << pgid << " " << head.get() + << " skipped " << missing << " clone(s) in cache tier" << dendl; + } else { + clog->info() << mode << " " << pgid << " " << head.get() + << " " << missing << " missing clone(s)"; + } +} + +unsigned ReplicatedPG::process_clones_to(const boost::optional &head, + const boost::optional &snapset, + LogChannelRef clog, + const spg_t &pgid, + const char *mode, + bool allow_incomplete_clones, + boost::optional target, + vector::reverse_iterator *curclone) +{ + assert(head); + assert(snapset); + unsigned missing = 0; + + // NOTE: clones are in descending order, thus **curclone > target test here + hobject_t next_clone(head.get()); + while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) { + ++missing; + // it is okay to be missing one or more clones in a cache tier. + // skip higher-numbered clones in the list. + if (!allow_incomplete_clones) { + next_clone.snap = **curclone; + clog->error() << mode << " " << pgid << " " << head.get() + << " expected clone " << next_clone; + ++scrubber.shallow_errors; + } + // Clones are descending + ++(*curclone); + } + return missing; +} + +/* + * Validate consistency of the object info and snap sets. + * + * We are sort of comparing 2 lists. The main loop is on objmap.objects. But + * the comparison of the objects is against multiple snapset.clones. There are + * multiple clone lists and in between lists we expect head or snapdir. 
+ * + * Example + * + * objects expected + * ======= ======= + * obj1 snap 1 head/snapdir, unexpected obj1 snap 1 + * obj2 head head/snapdir, head ok + * [SnapSet clones 6 4 2 1] + * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7 + * obj2 snap 6 obj2 snap 6, match + * obj2 snap 4 obj2 snap 4, match + * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok + * [Snapset clones 3 1] + * obj3 snap 3 obj3 snap 3 match + * obj3 snap 1 obj3 snap 1 match + * obj4 snapdir head/snapdir, snapdir ok + * [Snapset clones 4] + * EOL obj4 snap 4, (expected) + */ void ReplicatedPG::_scrub( ScrubMap &scrubmap, const map > &missing_digest) @@ -11243,191 +11348,260 @@ void ReplicatedPG::_scrub( bool repair = state_test(PG_STATE_REPAIR); bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub")); + boost::optional all_clones; // Unspecified snapid_t or boost::none // traverse in reverse order. - hobject_t head; - SnapSet snapset; - vector::reverse_iterator curclone; - hobject_t next_clone; + boost::optional head; + boost::optional snapset; // If initialized so will head (above) + vector::reverse_iterator curclone; // Defined only if snapset initialized + bool missing = false; bufferlist last_data; - for (map::reverse_iterator p = scrubmap.objects.rbegin(); - p != scrubmap.objects.rend(); - ++p) { + for (map::reverse_iterator + p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) { const hobject_t& soid = p->first; object_stat_sum_t stat; - if (soid.snap != CEPH_SNAPDIR) + boost::optional oi; + + if (!soid.is_snapdir()) stat.num_objects++; if (soid.nspace == cct->_conf->osd_hit_set_namespace) stat.num_objects_hit_set_archive++; - // new snapset? 
- if (soid.snap == CEPH_SNAPDIR || - soid.snap == CEPH_NOSNAP) { - if (p->second.attrs.count(SS_ATTR) == 0) { - osd->clog->error() << mode << " " << info.pgid << " " << soid - << " no '" << SS_ATTR << "' attr"; - ++scrubber.shallow_errors; - continue; - } - bufferlist bl; - bl.push_back(p->second.attrs[SS_ATTR]); - bufferlist::iterator blp = bl.begin(); - ::decode(snapset, blp); - - // did we finish the last oid? - if (head != hobject_t() && - !pool.info.allow_incomplete_clones()) { - osd->clog->error() << mode << " " << info.pgid << " " << head - << " missing clones"; - ++scrubber.shallow_errors; - } - - // what will be next? - if (snapset.clones.empty()) - head = hobject_t(); // no clones. - else { - curclone = snapset.clones.rbegin(); - head = p->first; - next_clone = hobject_t(); - dout(20) << " snapset " << snapset << dendl; - } + if (soid.is_snap()) { + // it's a clone + stat.num_object_clones++; } // basic checks. if (p->second.attrs.count(OI_ATTR) == 0) { + oi = boost::none; osd->clog->error() << mode << " " << info.pgid << " " << soid << " no '" << OI_ATTR << "' attr"; ++scrubber.shallow_errors; - continue; + } else { + bufferlist bv; + bv.push_back(p->second.attrs[OI_ATTR]); + try { + oi = object_info_t(); // Initialize optional<> before decode into it + oi.get().decode(bv); + } catch (buffer::error& e) { + oi = boost::none; + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " can't decode '" << OI_ATTR << "' attr " << e.what(); + ++scrubber.shallow_errors; + } } - bufferlist bv; - bv.push_back(p->second.attrs[OI_ATTR]); - object_info_t oi(bv); - if (pgbackend->be_get_ondisk_size(oi.size) != p->second.size) { - osd->clog->error() << mode << " " << info.pgid << " " << soid - << " on disk size (" << p->second.size - << ") does not match object info size (" - << oi.size << ") adjusted for ondisk to (" - << pgbackend->be_get_ondisk_size(oi.size) - << ")"; - ++scrubber.shallow_errors; - } + if (oi) { + if 
(pgbackend->be_get_ondisk_size(oi->size) != p->second.size) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " on disk size (" << p->second.size + << ") does not match object info size (" + << oi->size << ") adjusted for ondisk to (" + << pgbackend->be_get_ondisk_size(oi->size) + << ")"; + ++scrubber.shallow_errors; + } - dout(20) << mode << " " << soid << " " << oi << dendl; + dout(20) << mode << " " << soid << " " << oi.get() << dendl; - if (soid.is_snap()) { - stat.num_bytes += snapset.get_clone_bytes(soid.snap); - } else { - stat.num_bytes += oi.size; + // A clone num_bytes will be added later when we have snapset + if (!soid.is_snap()) { + stat.num_bytes += oi->size; + } + if (soid.nspace == cct->_conf->osd_hit_set_namespace) + stat.num_bytes_hit_set_archive += oi->size; + + if (!soid.is_snapdir()) { + if (oi->is_dirty()) + ++stat.num_objects_dirty; + if (oi->is_whiteout()) + ++stat.num_whiteouts; + if (oi->is_omap()) + ++stat.num_objects_omap; + } } - if (soid.nspace == cct->_conf->osd_hit_set_namespace) - stat.num_bytes_hit_set_archive += oi.size; - - if (!soid.is_snapdir()) { - if (oi.is_dirty()) - ++stat.num_objects_dirty; - if (oi.is_whiteout()) - ++stat.num_whiteouts; - if (oi.is_omap()) - ++stat.num_objects_omap; - } - - if (!next_clone.is_min() && next_clone != soid && - pool.info.allow_incomplete_clones()) { - // it is okay to be missing one or more clones in a cache tier. - // skip higher-numbered clones in the list. 
- while (curclone != snapset.clones.rend() && - soid.snap < *curclone) - ++curclone; - if (curclone != snapset.clones.rend() && - soid.snap == *curclone) { - dout(20) << __func__ << " skipped some clones in cache tier" << dendl; - next_clone.snap = *curclone; - } - if (curclone == snapset.clones.rend() || - soid.snap == CEPH_NOSNAP) { - dout(20) << __func__ << " skipped remaining clones in cache tier" - << dendl; - next_clone = hobject_t(); - head = hobject_t(); + + // Check for any problems while processing clones + if (doing_clones(snapset, curclone)) { + boost::optional target; + // Expecting an object with snap for current head + if (soid.has_snapset() || soid.get_head() != head->get_head()) { + + dout(10) << __func__ << " " << mode << " " << info.pgid << " new object " + << soid << " while processing " << head.get() << dendl; + + target = all_clones; + } else { + assert(soid.is_snap()); + target = soid.snap; } + + // Log any clones we were expecting to be there up to target + // This will set missing, but will be a no-op if snap.soid == *curclone. + missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode, + pool.info.allow_incomplete_clones(), target, &curclone); } - if (!next_clone.is_min() && next_clone != soid) { + bool expected; + // Check doing_clones() again in case we ran process_clones_to() + if (doing_clones(snapset, curclone)) { + // A head/snapdir would have processed all clones above + // or all greater than *curclone. + assert(soid.is_snap() && *curclone <= soid.snap); + + // After processing above clone snap should match the expected curclone + expected = (*curclone == soid.snap); + } else { + // If we aren't doing clones any longer, then expecting head/snapdir + expected = soid.has_snapset(); + } + if (!expected) { + // If we couldn't read the head's snapset, then just ignore clones and + // don't count as an error. 
+ if (head && !snapset) { + osd->clog->info() << mode << " " << info.pgid << " " << soid + << " clone ignored due to missing snapset"; + continue; + } osd->clog->error() << mode << " " << info.pgid << " " << soid - << " expected clone " << next_clone; + << " is an unexpected clone"; ++scrubber.shallow_errors; + continue; } - if (soid.snap == CEPH_NOSNAP || soid.snap == CEPH_SNAPDIR) { - if (soid.snap == CEPH_NOSNAP && !snapset.head_exists) { - osd->clog->error() << mode << " " << info.pgid << " " << soid - << " snapset.head_exists=false, but head exists"; - ++scrubber.shallow_errors; + // new snapset? + if (soid.has_snapset()) { + + if (missing) { + log_missing(missing, head, osd->clog, info.pgid, __func__, mode, + pool.info.allow_incomplete_clones()); } - if (soid.snap == CEPH_SNAPDIR && snapset.head_exists) { + + // Set this as a new head object + head = soid; + missing = false; + + dout(20) << __func__ << " " << mode << " new head " << head << dendl; + + if (p->second.attrs.count(SS_ATTR) == 0) { osd->clog->error() << mode << " " << info.pgid << " " << soid - << " snapset.head_exists=true, but snapdir exists"; + << " no '" << SS_ATTR << "' attr"; ++scrubber.shallow_errors; - } - if (curclone == snapset.clones.rend()) { - next_clone = hobject_t(); + snapset = boost::none; } else { - next_clone = soid; - next_clone.snap = *curclone; - } - } else if (soid.snap) { - // it's a clone - stat.num_object_clones++; - - if (head == hobject_t()) { - osd->clog->error() << mode << " " << info.pgid << " " << soid - << " found clone without head"; - ++scrubber.shallow_errors; - continue; + bufferlist bl; + bl.push_back(p->second.attrs[SS_ATTR]); + bufferlist::iterator blp = bl.begin(); + try { + snapset = SnapSet(); // Initialize optional<> before decoding into it + ::decode(snapset.get(), blp); + } catch (buffer::error& e) { + snapset = boost::none; + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " can't decode '" << SS_ATTR << "' attr " << e.what(); + 
++scrubber.shallow_errors; + } } - if (soid.snap != *curclone) { - continue; // we warn above. we could do better here... + if (snapset) { + // what will be next? + curclone = snapset->clones.rbegin(); + + if (!snapset->clones.empty()) { + dout(20) << " snapset " << snapset.get() << dendl; + if (snapset->seq == 0) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " snaps.seq not set"; + ++scrubber.shallow_errors; + } + } + + if (soid.is_head() && !snapset->head_exists) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " snapset.head_exists=false, but head exists"; + ++scrubber.shallow_errors; + } + if (soid.is_snapdir() && snapset->head_exists) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " snapset.head_exists=true, but snapdir exists"; + ++scrubber.shallow_errors; + } } + } else { + assert(soid.is_snap()); + assert(head); + assert(snapset); + assert(soid.snap == *curclone); - if (oi.size != snapset.clone_size[*curclone]) { + dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl; + + if (snapset->clone_size.count(soid.snap) == 0) { osd->clog->error() << mode << " " << info.pgid << " " << soid - << " size " << oi.size << " != clone_size " - << snapset.clone_size[*curclone]; + << " is missing in clone_size"; ++scrubber.shallow_errors; - } + } else { + if (oi && oi->size != snapset->clone_size[soid.snap]) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " size " << oi->size << " != clone_size " + << snapset->clone_size[*curclone]; + ++scrubber.shallow_errors; + } - // verify overlap? - // ... + if (snapset->clone_overlap.count(soid.snap) == 0) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " is missing in clone_overlap"; + ++scrubber.shallow_errors; + } else { + // This checking is based on get_clone_bytes(). The first 2 asserts + // can't happen because we know we have a clone_size and + // a clone_overlap. 
Now we check that the interval_set won't + // cause the last assert. + uint64_t size = snapset->clone_size.find(soid.snap)->second; + const interval_set &overlap = + snapset->clone_overlap.find(soid.snap)->second; + bool bad_interval_set = false; + for (interval_set::const_iterator i = overlap.begin(); + i != overlap.end(); ++i) { + if (size < i.get_len()) { + bad_interval_set = true; + break; + } + size -= i.get_len(); + } - // what's next? - if (curclone != snapset.clones.rend()) { - ++curclone; - } - if (curclone == snapset.clones.rend()) { - head = hobject_t(); - next_clone = hobject_t(); - } else { - next_clone.snap = *curclone; + if (bad_interval_set) { + osd->clog->error() << mode << " " << info.pgid << " " << soid + << " bad interval_set in clone_overlap"; + ++scrubber.shallow_errors; + } else { + stat.num_bytes += snapset->get_clone_bytes(soid.snap); + } + } } - } else { - // it's unversioned. - next_clone = hobject_t(); + // what's next? + ++curclone; } scrub_cstat.add(stat); } - if (!next_clone.is_min() && - !pool.info.allow_incomplete_clones()) { - osd->clog->error() << mode << " " << info.pgid - << " expected clone " << next_clone; - ++scrubber.shallow_errors; + if (doing_clones(snapset, curclone)) { + dout(10) << __func__ << " " << mode << " " << info.pgid + << " No more objects while processing " << head.get() << dendl; + + missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode, + pool.info.allow_incomplete_clones(), all_clones, &curclone); + + } + // There could be missing found by the test above or even + // before dropping out of the loop for the last head. 
+ if (missing) { + log_missing(missing, head, osd->clog, info.pgid, __func__, + mode, pool.info.allow_incomplete_clones()); } for (map >::const_iterator p = @@ -11453,7 +11627,7 @@ void ReplicatedPG::_scrub( simple_repop_submit(repop); ++scrubber.num_digest_updates_pending; } - + dout(10) << "_scrub (" << mode << ") finish" << dendl; } diff --git a/ceph/src/osd/ReplicatedPG.h b/ceph/src/osd/ReplicatedPG.h index 48e0def3..0894be69 100644 --- a/ceph/src/osd/ReplicatedPG.h +++ b/ceph/src/osd/ReplicatedPG.h @@ -903,7 +903,9 @@ protected: void hit_set_in_memory_trim(); ///< discard old in memory HitSets hobject_t get_hit_set_current_object(utime_t stamp); - hobject_t get_hit_set_archive_object(utime_t start, utime_t end); + hobject_t get_hit_set_archive_object(utime_t start, + utime_t end, + bool using_gmt); // agent boost::scoped_ptr agent_state; @@ -1421,6 +1423,22 @@ private: uint64_t temp_seq; ///< last id for naming temp objects coll_t get_temp_coll(ObjectStore::Transaction *t); hobject_t generate_temp_object(); ///< generate a new temp object name + void log_missing(unsigned missing, + const boost::optional &head, + LogChannelRef clog, + const spg_t &pgid, + const char *func, + const char *mode, + bool allow_incomplete_clones); + unsigned process_clones_to(const boost::optional &head, + const boost::optional &snapset, + LogChannelRef clog, + const spg_t &pgid, + const char *mode, + bool allow_incomplete_clones, + boost::optional target, + vector::reverse_iterator *curclone); + public: void get_colls(list *out) { out->push_back(coll); diff --git a/ceph/src/osd/osd_types.cc b/ceph/src/osd/osd_types.cc index 29f4fc4c..b13925c6 100644 --- a/ceph/src/osd/osd_types.cc +++ b/ceph/src/osd/osd_types.cc @@ -926,6 +926,7 @@ void pg_pool_t::dump(Formatter *f) const f->close_section(); // hit_set_params f->dump_unsigned("hit_set_period", hit_set_period); f->dump_unsigned("hit_set_count", hit_set_count); + f->dump_bool("use_gmt_hitset", use_gmt_hitset); 
f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote); f->dump_unsigned("stripe_width", get_stripe_width()); f->dump_unsigned("expected_num_objects", expected_num_objects); @@ -1238,7 +1239,60 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const return; } - ENCODE_START(17, 5, bl); + if ((features & CEPH_FEATURE_OSD_HITSET_GMT) == 0) { + // CEPH_FEATURE_OSD_HITSET_GMT requires pg_pool_t v21 which has + // use_gmt_hitset, and two fields added before v21. it's backward + // compatible, but re-encoding the same osdmap with different ceph + // versions causes CRC mismatch at the OSD side, the tracker#12410 + // prevents the monitor from sending the single full map requested + // by OSD. so we need a way to encode pg_pool_t the same old way. + ENCODE_START(17, 5, bl); + ::encode(type, bl); + ::encode(size, bl); + ::encode(crush_ruleset, bl); + ::encode(object_hash, bl); + ::encode(pg_num, bl); + ::encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. 
+ ::encode(lpg_num, bl); + ::encode(lpgp_num, bl); + ::encode(last_change, bl); + ::encode(snap_seq, bl); + ::encode(snap_epoch, bl); + ::encode(snaps, bl, features); + ::encode(removed_snaps, bl); + ::encode(auid, bl); + ::encode(flags, bl); + ::encode(crash_replay_interval, bl); + ::encode(min_size, bl); + ::encode(quota_max_bytes, bl); + ::encode(quota_max_objects, bl); + ::encode(tiers, bl); + ::encode(tier_of, bl); + __u8 c = cache_mode; + ::encode(c, bl); + ::encode(read_tier, bl); + ::encode(write_tier, bl); + ::encode(properties, bl); + ::encode(hit_set_params, bl); + ::encode(hit_set_period, bl); + ::encode(hit_set_count, bl); + ::encode(stripe_width, bl); + ::encode(target_max_bytes, bl); + ::encode(target_max_objects, bl); + ::encode(cache_target_dirty_ratio_micro, bl); + ::encode(cache_target_full_ratio_micro, bl); + ::encode(cache_min_flush_age, bl); + ::encode(cache_min_evict_age, bl); + ::encode(erasure_code_profile, bl); + ::encode(last_force_op_resend, bl); + ::encode(min_read_recency_for_promote, bl); + ::encode(expected_num_objects, bl); + ENCODE_FINISH(bl); + return; + } + + ENCODE_START(21, 5, bl); ::encode(type, bl); ::encode(size, bl); ::encode(crush_ruleset, bl); @@ -1280,12 +1334,15 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const ::encode(last_force_op_resend, bl); ::encode(min_read_recency_for_promote, bl); ::encode(expected_num_objects, bl); + ::encode(uint32_t(.6 * 1e6), bl); + ::encode(uint32_t(1), bl); + ::encode(use_gmt_hitset, bl); ENCODE_FINISH(bl); } void pg_pool_t::decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(17, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(21, 5, 5, bl); ::decode(type, bl); ::decode(size, bl); ::decode(crush_ruleset, bl); @@ -1397,6 +1454,19 @@ void pg_pool_t::decode(bufferlist::iterator& bl) } else { expected_num_objects = 0; } + if (struct_v >= 19) { + uint32_t dummy; + ::decode(dummy, bl); + } + if (struct_v >= 20) { + uint32_t dummy; + ::decode(dummy, bl); + } + if 
(struct_v >= 21) { + ::decode(use_gmt_hitset, bl); + } else { + use_gmt_hitset = false; + } DECODE_FINISH(bl); calc_pg_masks(); } @@ -2640,6 +2710,7 @@ bool pg_interval_t::check_new_interval( pg_interval_t& i = (*past_intervals)[same_interval_since]; i.first = same_interval_since; i.last = osdmap->get_epoch() - 1; + assert(i.first <= i.last); i.acting = old_acting; i.up = old_up; i.primary = old_acting_primary; @@ -3127,6 +3198,12 @@ void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap, for (list::const_iterator i = in.log.begin(); i != in.log.end(); ++i) { + // Reject pg log entries for temporary objects + if (i->soid.is_temp()) { + reject.log.push_back(*i); + continue; + } + if (i->soid.nspace != hit_set_namespace) { object_t oid = i->soid.oid; object_locator_t loc(i->soid); @@ -3789,19 +3866,25 @@ void pg_create_t::generate_test_instances(list& o) void pg_hit_set_info_t::encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(begin, bl); ::encode(end, bl); ::encode(version, bl); + ::encode(using_gmt, bl); ENCODE_FINISH(bl); } void pg_hit_set_info_t::decode(bufferlist::iterator& p) { - DECODE_START(1, p); + DECODE_START(2, p); ::decode(begin, p); ::decode(end, p); ::decode(version, p); + if (struct_v >= 2) { + ::decode(using_gmt, p); + } else { + using_gmt = false; + } DECODE_FINISH(p); } @@ -3810,6 +3893,7 @@ void pg_hit_set_info_t::dump(Formatter *f) const f->dump_stream("begin") << begin; f->dump_stream("end") << end; f->dump_stream("version") << version; + f->dump_stream("using_gmt") << using_gmt; } void pg_hit_set_info_t::generate_test_instances(list& ls) diff --git a/ceph/src/osd/osd_types.h b/ceph/src/osd/osd_types.h index b058c135..92f61632 100644 --- a/ceph/src/osd/osd_types.h +++ b/ceph/src/osd/osd_types.h @@ -1035,6 +1035,7 @@ public: HitSet::Params hit_set_params; ///< The HitSet params to use on this pool uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds) uint32_t hit_set_count; 
///< number of periods to retain + bool use_gmt_hitset; ///< use gmt to name the hitset archive object uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote uint32_t stripe_width; ///< erasure coded stripe size in bytes @@ -1063,6 +1064,7 @@ public: hit_set_params(), hit_set_period(0), hit_set_count(0), + use_gmt_hitset(true), min_read_recency_for_promote(0), stripe_width(0), expected_num_objects(0) @@ -1600,10 +1602,11 @@ WRITE_CLASS_ENCODER_FEATURES(pool_stat_t) struct pg_hit_set_info_t { utime_t begin, end; ///< time interval eversion_t version; ///< version this HitSet object was written - - pg_hit_set_info_t() {} - pg_hit_set_info_t(utime_t b) - : begin(b) {} + bool using_gmt; ///< use gmt for creating the hit_set archive object name + pg_hit_set_info_t(bool using_gmt = true) + : using_gmt(using_gmt) {} + pg_hit_set_info_t(utime_t b, bool using_gmt) + : begin(b), using_gmt(using_gmt) {} void encode(bufferlist &bl) const; void decode(bufferlist::iterator &bl); @@ -2987,6 +2990,10 @@ struct object_info_t { object_info_t(bufferlist& bl) { decode(bl); } + object_info_t operator=(bufferlist& bl) { + decode(bl); + return *this; + } }; WRITE_CLASS_ENCODER(object_info_t) diff --git a/ceph/src/osdc/Objecter.cc b/ceph/src/osdc/Objecter.cc index 2b5a979f..95700eb1 100644 --- a/ceph/src/osdc/Objecter.cc +++ b/ceph/src/osdc/Objecter.cc @@ -629,7 +629,8 @@ int Objecter::linger_check(LingerOp *info) << " age " << age << dendl; if (info->last_error) return info->last_error; - return age.to_msec(); + // return a safe upper bound (we are truncating to ms) + return 1 + age.to_msec(); } void Objecter::linger_cancel(LingerOp *info) @@ -801,10 +802,13 @@ void Objecter::handle_watch_notify(MWatchNotify *m) info->notify_id != m->notify_id) { ldout(cct, 10) << __func__ << " reply notify " << m->notify_id << " != " << info->notify_id << ", ignoring" << dendl; - } else { - assert(info->on_notify_finish); + } else if (info->on_notify_finish) { 
info->notify_result_bl->claim(m->get_data()); info->on_notify_finish->complete(m->return_code); + + // if we race with reconnect we might get a second notify; only + // notify the caller once! + info->on_notify_finish = NULL; } } else { finisher->queue(new C_DoWatchNotify(this, info, m)); diff --git a/ceph/src/rgw/rgw_admin.cc b/ceph/src/rgw/rgw_admin.cc index 2ba77e33..60051482 100644 --- a/ceph/src/rgw/rgw_admin.cc +++ b/ceph/src/rgw/rgw_admin.cc @@ -112,8 +112,8 @@ void _usage() cerr << " replicalog get get replica metadata log entry\n"; cerr << " replicalog update update replica metadata log entry\n"; cerr << " replicalog delete delete replica metadata log entry\n"; - cout << " orphans find init and run search for leaked rados objects\n"; - cout << " orphans finish clean up search for leaked rados objects\n"; + cerr << " orphans find init and run search for leaked rados objects\n"; + cerr << " orphans finish clean up search for leaked rados objects\n"; cerr << "options:\n"; cerr << " --uid= user id\n"; cerr << " --subuser= subuser name\n"; @@ -168,7 +168,11 @@ void _usage() cerr << " --categories= comma separated list of categories, used in usage show\n"; cerr << " --caps= list of caps (e.g., \"usage=read, write; user=read\"\n"; cerr << " --yes-i-really-mean-it required for certain operations\n"; - cerr << " --reset-regions reset regionmap when regionmap update"; + cerr << " --reset-regions reset regionmap when regionmap update\n"; + cerr << " --bypass-gc when specified with bucket deletion, triggers\n"; + cerr << " object deletions by not involving GC\n"; + cerr << " --inconsistent-index when specified with bucket deletion and bypass-gc set to true,\n"; + cerr << " ignores bucket index consistency\n"; cerr << "\n"; cerr << " := \"YYYY-MM-DD[ hh:mm:ss]\"\n"; cerr << "\nQuota options:\n"; @@ -176,9 +180,9 @@ void _usage() cerr << " --max-objects specify max objects (negative value to disable)\n"; cerr << " --max-size specify max size (in bytes, negative value to 
disable)\n"; cerr << " --quota-scope scope of quota (bucket, user)\n"; - cout << "\nOrphans search options:\n"; - cout << " --pool data pool to scan for leaked rados objects in\n"; - cout << " --num-shards num of shards to use for keeping the temporary scan info\n"; + cerr << "\nOrphans search options:\n"; + cerr << " --pool data pool to scan for leaked rados objects in\n"; + cerr << " --num-shards num of shards to use for keeping the temporary scan info\n"; cerr << "\n"; generic_client_usage(); } @@ -1163,6 +1167,9 @@ int main(int argc, char **argv) int max_concurrent_ios = 32; uint64_t orphan_stale_secs = (24 * 3600); + int bypass_gc = false; + int inconsistent_index = false; + std::string val; std::ostringstream errs; string err; @@ -1325,6 +1332,10 @@ int main(int argc, char **argv) // do nothing } else if (ceph_argparse_binary_flag(args, i, &reset_regions, NULL, "--reset-regions", (char*)NULL)) { // do nothing + } else if (ceph_argparse_binary_flag(args, i, &bypass_gc, NULL, "--bypass-gc", (char*)NULL)) { + // do nothing + } else if (ceph_argparse_binary_flag(args, i, &inconsistent_index, NULL, "--inconsistent-index", (char*)NULL)) { + // do nothing } else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) { caps = val; } else if (ceph_argparse_witharg(args, i, &val, "-i", "--infile", (char*)NULL)) { @@ -1726,6 +1737,8 @@ int main(int argc, char **argv) bucket_op.set_object(object); bucket_op.set_check_objects(check_objects); bucket_op.set_delete_children(delete_child_objects); + bucket_op.set_fix_index(fix); + bucket_op.set_max_aio(max_concurrent_ios); // required to gather errors from operations std::string err_msg; @@ -2547,7 +2560,11 @@ next: } if (opt_cmd == OPT_BUCKET_RM) { - RGWBucketAdminOp::remove_bucket(store, bucket_op); + if (inconsistent_index == false) { + RGWBucketAdminOp::remove_bucket(store, bucket_op, bypass_gc, true); + } else { + RGWBucketAdminOp::remove_bucket(store, bucket_op, bypass_gc, false); + } } if (opt_cmd == 
OPT_GC_LIST) { diff --git a/ceph/src/rgw/rgw_bucket.cc b/ceph/src/rgw/rgw_bucket.cc index 5d2af38e..6384c7fa 100644 --- a/ceph/src/rgw/rgw_bucket.cc +++ b/ceph/src/rgw/rgw_bucket.cc @@ -17,6 +17,7 @@ #include "rgw_user.h" #include "rgw_string.h" +#include "include/rados/librados.hpp" // until everything is moved from rgw_common #include "rgw_common.h" @@ -138,9 +139,6 @@ int rgw_link_bucket(RGWRados *store, string user_id, rgw_bucket& bucket, time_t ret = store->get_bucket_entrypoint_info(obj_ctx, bucket_name, ep, &ot, NULL, &attrs); if (ret < 0 && ret != -ENOENT) { ldout(store->ctx(), 0) << "ERROR: store->get_bucket_entrypoint_info() returned " << ret << dendl; - } else if (ret >= 0 && ep.linked && ep.owner != user_id) { - ldout(store->ctx(), 0) << "can't link bucket, already linked to a different user: " << ep.owner << dendl; - return -EINVAL; } } @@ -389,9 +387,7 @@ int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& b map stats; std::vector objs; map common_prefixes; - rgw_obj obj; RGWBucketInfo info; - bufferlist bl; RGWObjectCtx obj_ctx(store); string bucket_ver, master_ver; @@ -400,8 +396,6 @@ int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& b if (ret < 0) return ret; - obj.bucket = bucket; - ret = store->get_bucket_info(obj_ctx, bucket.name, info, NULL); if (ret < 0) return ret; @@ -420,7 +414,7 @@ int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& b while (!objs.empty()) { std::vector::iterator it = objs.begin(); - for (it = objs.begin(); it != objs.end(); ++it) { + for (; it != objs.end(); ++it) { ret = rgw_remove_object(store, info, bucket, (*it).key); if (ret < 0) return ret; @@ -433,6 +427,11 @@ int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& b } } + ret = rgw_bucket_sync_user_stats(store, bucket.name); + if ( ret < 0) { + dout(1) << "WARNING: failed sync user stats before bucket delete. 
ret=" << ret << dendl; + } + RGWObjVersionTracker objv_tracker; ret = store->delete_bucket(bucket, objv_tracker); @@ -449,6 +448,173 @@ int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& b return ret; } +static int aio_wait(librados::AioCompletion *handle) +{ + librados::AioCompletion *c = (librados::AioCompletion *)handle; + c->wait_for_complete(); + int ret = c->get_return_value(); + c->release(); + return ret; +} + +static int drain_handles(list& pending) +{ + int ret = 0; + while (!pending.empty()) { + librados::AioCompletion *handle = pending.front(); + pending.pop_front(); + int r = aio_wait(handle); + if (r < 0) { + ret = r; + } + } + return ret; +} + +int rgw_remove_bucket_bypass_gc(RGWRados *store, rgw_bucket& bucket, + int concurrent_max, bool keep_index_consistent) +{ + int ret; + map stats; + std::vector objs; + map common_prefixes; + RGWBucketInfo info; + RGWObjectCtx obj_ctx(store); + + string bucket_ver, master_ver; + + ret = store->get_bucket_stats(bucket, &bucket_ver, &master_ver, stats, NULL); + if (ret < 0) + return ret; + + ret = store->get_bucket_info(obj_ctx, bucket.name, info, NULL); + if (ret < 0) + return ret; + + + RGWRados::Bucket target(store, info.bucket); + RGWRados::Bucket::List list_op(&target); + + list_op.params.list_versions = true; + + std::list handles; + + int max = 1000; + int max_aio = concurrent_max; + ret = list_op.list_objects(max, &objs, &common_prefixes, NULL); + if (ret < 0) + return ret; + + while (!objs.empty()) { + std::vector::iterator it = objs.begin(); + for (; it != objs.end(); ++it) { + RGWObjState *astate = NULL; + rgw_obj obj(bucket, (*it).key.name); + obj.set_instance((*it).key.instance); + + ret = store->get_obj_state(&obj_ctx, obj, &astate, NULL); + if (ret == -ENOENT) { + dout(1) << "WARNING: cannot find obj state for obj " << obj.get_object() << dendl; + continue; + } + if (ret < 0) { + lderr(store->ctx()) << "ERROR: get obj state returned with error " << ret << dendl; + return 
ret; + } + + if (astate->has_manifest) { + rgw_obj head_obj; + RGWObjManifest& manifest = astate->manifest; + RGWObjManifest::obj_iterator miter = manifest.obj_begin(); + + if (miter.get_location().ns.empty()) { + head_obj = miter.get_location(); + } + + for (; miter != manifest.obj_end() && max_aio--; ++miter) { + if (!max_aio) { + ret = drain_handles(handles); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl; + return ret; + } + max_aio = concurrent_max; + } + + rgw_obj last_obj = miter.get_location(); + if (last_obj == head_obj) { + // have the head obj deleted at the end + continue; + } + + ret = store->delete_obj_aio(last_obj, bucket, info, astate, handles, keep_index_consistent); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: delete obj aio failed with " << ret << dendl; + return ret; + } + } // for all shadow objs + + ret = store->delete_obj_aio(head_obj, bucket, info, astate, handles, keep_index_consistent); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: delete obj aio failed with " << ret << dendl; + return ret; + } + } + + if (!max_aio) { + ret = drain_handles(handles); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl; + return ret; + } + max_aio = concurrent_max; + } + } // for all RGW objects + objs.clear(); + + ret = list_op.list_objects(max, &objs, &common_prefixes, NULL); + if (ret < 0) + return ret; + } + + ret = drain_handles(handles); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl; + return ret; + } + + ret = rgw_bucket_sync_user_stats(store, bucket.name); + if (ret < 0) { + dout(1) << "WARNING: failed sync user stats before bucket delete. 
ret=" << ret << dendl; + } + + RGWObjVersionTracker objv_tracker; + + ret = rgw_bucket_delete_bucket_obj(store, bucket.name, objv_tracker); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: could not remove bucket " << bucket.name << "with ret as " << ret << dendl; + return ret; + } + + if (!store->is_syncing_bucket_meta(bucket)) { + RGWObjVersionTracker objv_tracker; + string entry; + store->get_bucket_instance_entry(bucket, entry); + ret = rgw_bucket_instance_remove_entry(store, entry, &objv_tracker); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: could not remove bucket instance entry" << bucket.name << "with ret as " << ret << dendl; + return ret; + } + } + + ret = rgw_unlink_bucket(store, info.owner, bucket.name, false); + if (ret < 0) { + lderr(store->ctx()) << "ERROR: unable to remove user bucket information" << dendl; + } + + return ret; +} + int rgw_bucket_delete_bucket_obj(RGWRados *store, string& bucket_name, RGWObjVersionTracker& objv_tracker) { return store->meta_mgr->remove_entry(bucket_meta_handler, bucket_name, &objv_tracker); @@ -542,7 +708,7 @@ int RGWBucket::link(RGWBucketAdminOpState& op_state, std::string *err_msg) return -EIO; } - r = rgw_unlink_bucket(store, owner.get_id(), bucket.name); + r = rgw_unlink_bucket(store, owner.get_id(), bucket.name, false); if (r < 0) { set_err_msg(err_msg, "could not unlink policy from user " + owner.get_id()); return r; @@ -569,6 +735,17 @@ int RGWBucket::link(RGWBucketAdminOpState& op_state, std::string *err_msg) if (r < 0) return r; + RGWAccessControlPolicy policy_instance; + policy_instance.create_default(user_info.user_id, display_name); + aclbl.clear(); + policy_instance.encode(aclbl); + + string oid_bucket_instance = RGW_BUCKET_INSTANCE_MD_PREFIX + key; + rgw_bucket bucket_instance; + bucket_instance.name = oid_bucket_instance; + rgw_obj obj_bucket_instance(bucket_instance, no_oid); + r = store->set_attr(NULL, obj_bucket_instance, RGW_ATTR_ACL, aclbl, &objv_tracker); + r = rgw_link_bucket(store, 
user_info.user_id, bucket, 0); if (r < 0) return r; @@ -594,12 +771,24 @@ int RGWBucket::unlink(RGWBucketAdminOpState& op_state, std::string *err_msg) return r; } -int RGWBucket::remove(RGWBucketAdminOpState& op_state, std::string *err_msg) +int RGWBucket::remove(RGWBucketAdminOpState& op_state, bool bypass_gc, + bool keep_index_consistent, std::string *err_msg) { bool delete_children = op_state.will_delete_children(); rgw_bucket bucket = op_state.get_bucket(); + int ret; + + if (bypass_gc) { + if (delete_children) { + ret = rgw_remove_bucket_bypass_gc(store, bucket, op_state.get_max_aio(), keep_index_consistent); + } else { + set_err_msg(err_msg, "purge objects should be set for gc to be bypassed"); + return -EINVAL; + } + } else { + ret = rgw_remove_bucket(store, bucket_info.owner, bucket, delete_children); + } - int ret = rgw_remove_bucket(store, bucket_info.owner, bucket, delete_children); if (ret < 0) { set_err_msg(err_msg, "unable to remove bucket" + cpp_strerror(-ret)); return ret; @@ -967,7 +1156,8 @@ int RGWBucketAdminOp::check_index(RGWRados *store, RGWBucketAdminOpState& op_sta return 0; } -int RGWBucketAdminOp::remove_bucket(RGWRados *store, RGWBucketAdminOpState& op_state) +int RGWBucketAdminOp::remove_bucket(RGWRados *store, RGWBucketAdminOpState& op_state, + bool bypass_gc, bool keep_index_consistent) { RGWBucket bucket; @@ -975,7 +1165,7 @@ int RGWBucketAdminOp::remove_bucket(RGWRados *store, RGWBucketAdminOpState& op_s if (ret < 0) return ret; - return bucket.remove(op_state); + return bucket.remove(op_state, bypass_gc, keep_index_consistent); } int RGWBucketAdminOp::remove_object(RGWRados *store, RGWBucketAdminOpState& op_state) @@ -1552,7 +1742,11 @@ public: int put(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker, time_t mtime, JSONObj *obj, sync_type_t sync_type) { RGWBucketEntryPoint be, old_be; - decode_json_obj(be, obj); + try { + decode_json_obj(be, obj); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } time_t 
orig_mtime; map attrs; @@ -1712,7 +1906,11 @@ public: int put(RGWRados *store, string& oid, RGWObjVersionTracker& objv_tracker, time_t mtime, JSONObj *obj, sync_type_t sync_type) { RGWBucketCompleteInfo bci, old_bci; - decode_json_obj(bci, obj); + try { + decode_json_obj(bci, obj); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } time_t orig_mtime; RGWObjectCtx obj_ctx(store); diff --git a/ceph/src/rgw/rgw_bucket.h b/ceph/src/rgw/rgw_bucket.h index 830e096c..5c9db262 100644 --- a/ceph/src/rgw/rgw_bucket.h +++ b/ceph/src/rgw/rgw_bucket.h @@ -112,6 +112,7 @@ extern int rgw_unlink_bucket(RGWRados *store, string user_id, const string& buck extern int rgw_remove_object(RGWRados *store, RGWBucketInfo& bucket_info, rgw_bucket& bucket, rgw_obj_key& key); extern int rgw_remove_bucket(RGWRados *store, const string& bucket_owner, rgw_bucket& bucket, bool delete_children); +extern int rgw_remove_bucket_bypass_gc(RGWRados *store, rgw_bucket& bucket, int concurrent_max); extern int rgw_bucket_set_attrs(RGWRados *store, RGWBucketInfo& bucket_info, map& attrs, @@ -133,6 +134,7 @@ struct RGWBucketAdminOpState { bool fix_index; bool delete_child_objects; bool bucket_stored; + int max_aio; rgw_bucket bucket; @@ -141,6 +143,8 @@ struct RGWBucketAdminOpState { void set_fix_index(bool value) { fix_index = value; } void set_delete_children(bool value) { delete_child_objects = value; } + void set_max_aio(int value) { max_aio = value; } + void set_user_id(std::string& user_id) { if (!user_id.empty()) uid = user_id; @@ -175,6 +179,7 @@ struct RGWBucketAdminOpState { bool is_user_op() { return !uid.empty(); } bool is_system_op() { return uid.empty(); } bool has_bucket_stored() { return bucket_stored; } + int get_max_aio() { return max_aio; } RGWBucketAdminOpState() : list_buckets(false), stat_buckets(false), check_objects(false), fix_index(false), delete_child_objects(false), @@ -215,7 +220,7 @@ public: map& calculated_stats, std::string *err_msg = NULL); - int 
remove(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL); + int remove(RGWBucketAdminOpState& op_state, bool bypass_gc = false, bool keep_index_consistent = true, std::string *err_msg = NULL); int link(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL); int unlink(RGWBucketAdminOpState& op_state, std::string *err_msg = NULL); @@ -241,7 +246,7 @@ public: static int check_index(RGWRados *store, RGWBucketAdminOpState& op_state, RGWFormatterFlusher& flusher); - static int remove_bucket(RGWRados *store, RGWBucketAdminOpState& op_state); + static int remove_bucket(RGWRados *store, RGWBucketAdminOpState& op_state, bool bypass_gc = false, bool keep_index_consistent = true); static int remove_object(RGWRados *store, RGWBucketAdminOpState& op_state); static int info(RGWRados *store, RGWBucketAdminOpState& op_state, RGWFormatterFlusher& flusher); }; diff --git a/ceph/src/rgw/rgw_civetweb.cc b/ceph/src/rgw/rgw_civetweb.cc index 5c075f98..20388687 100644 --- a/ceph/src/rgw/rgw_civetweb.cc +++ b/ceph/src/rgw/rgw_civetweb.cc @@ -27,7 +27,8 @@ int RGWMongoose::write_data(const char *buf, int len) return r; } -RGWMongoose::RGWMongoose(mg_connection *_conn, int _port) : conn(_conn), port(_port), header_done(false), sent_header(false), has_content_length(false), +RGWMongoose::RGWMongoose(mg_connection *_conn, int _port) : conn(_conn), port(_port), status_num(0), header_done(false), + sent_header(false), has_content_length(false), explicit_keepalive(false), explicit_conn_close(false) { } @@ -45,9 +46,23 @@ int RGWMongoose::complete_request() { if (!sent_header) { if (!has_content_length) { + header_done = false; /* let's go back to writing the header */ - if (0 && data.length() == 0) { + /* + * Status 204 should not include a content-length header + * RFC7230 says so + * + * Same goes for status 304: Not Modified + * + * 'If a cache uses a received 304 response to update a cache entry,' + * 'the cache MUST update the entry to reflect any new field values' + * 
'given in the response.' + * + */ + if (status_num == 204 || status_num == 304) { + has_content_length = true; + } else if (0 && data.length() == 0) { has_content_length = true; print("Transfer-Enconding: %s\r\n", "chunked"); data.append("0\r\n\r\n", sizeof("0\r\n\r\n")-1); @@ -75,6 +90,7 @@ void RGWMongoose::init_env(CephContext *cct) { env.init(cct); struct mg_request_info *info = mg_get_request_info(conn); + if (!info) return; @@ -114,7 +130,7 @@ void RGWMongoose::init_env(CephContext *cct) *dest = c; } *dest = '\0'; - + env.set(buf, header->value); } @@ -136,21 +152,21 @@ void RGWMongoose::init_env(CephContext *cct) } } -int RGWMongoose::send_status(const char *status, const char *status_name) +int RGWMongoose::send_status(int status, const char *status_name) { char buf[128]; if (!status_name) status_name = ""; - snprintf(buf, sizeof(buf), "HTTP/1.1 %s %s\r\n", status, status_name); + snprintf(buf, sizeof(buf), "HTTP/1.1 %d %s\r\n", status, status_name); bufferlist bl; bl.append(buf); bl.append(header_data); header_data = bl; - int status_num = atoi(status); + status_num = status; mg_set_http_status(conn, status_num); return 0; diff --git a/ceph/src/rgw/rgw_civetweb.h b/ceph/src/rgw/rgw_civetweb.h index 1cee5c8b..bf7d64ad 100644 --- a/ceph/src/rgw/rgw_civetweb.h +++ b/ceph/src/rgw/rgw_civetweb.h @@ -19,6 +19,7 @@ class RGWMongoose : public RGWClientIO bufferlist data; int port; + int status_num; bool header_done; bool sent_header; @@ -32,7 +33,7 @@ public: int write_data(const char *buf, int len); int read_data(char *buf, int len); - int send_status(const char *status, const char *status_name); + int send_status(int status, const char *status_name); int send_100_continue(); int complete_header(); int complete_request(); diff --git a/ceph/src/rgw/rgw_client_io.h b/ceph/src/rgw/rgw_client_io.h index dc90db3d..8448390c 100644 --- a/ceph/src/rgw/rgw_client_io.h +++ b/ceph/src/rgw/rgw_client_io.h @@ -34,7 +34,7 @@ public: virtual void flush() = 0; int read(char 
*buf, int max, int *actual); - virtual int send_status(const char *status, const char *status_name) = 0; + virtual int send_status(int status, const char *status_name) = 0; virtual int send_100_continue() = 0; virtual int complete_header() = 0; virtual int complete_request() = 0; diff --git a/ceph/src/rgw/rgw_cors.cc b/ceph/src/rgw/rgw_cors.cc index a120a686..1ad5b431 100644 --- a/ceph/src/rgw/rgw_cors.cc +++ b/ceph/src/rgw/rgw_cors.cc @@ -116,6 +116,13 @@ static bool is_string_in_set(set& s, string h) { return false; } +bool RGWCORSRule::has_wildcard_origin() { + if (allowed_origins.find("*") != allowed_origins.end()) + return true; + + return false; +} + bool RGWCORSRule::is_origin_present(const char *o) { string origin = o; return is_string_in_set(allowed_origins, origin); diff --git a/ceph/src/rgw/rgw_cors.h b/ceph/src/rgw/rgw_cors.h index 124ebf92..c5877ea5 100644 --- a/ceph/src/rgw/rgw_cors.h +++ b/ceph/src/rgw/rgw_cors.h @@ -81,6 +81,7 @@ public: ::decode(exposable_hdrs, bl); DECODE_FINISH(bl); } + bool has_wildcard_origin(); bool is_origin_present(const char *o); void format_exp_headers(std::string& s); void erase_origin_if_present(std::string& origin, bool *rule_empty); diff --git a/ceph/src/rgw/rgw_fcgi.cc b/ceph/src/rgw/rgw_fcgi.cc index 0006834b..bbc19d81 100644 --- a/ceph/src/rgw/rgw_fcgi.cc +++ b/ceph/src/rgw/rgw_fcgi.cc @@ -31,14 +31,15 @@ void RGWFCGX::init_env(CephContext *cct) env.init(cct, (char **)fcgx->envp); } -int RGWFCGX::send_status(const char *status, const char *status_name) +int RGWFCGX::send_status(int status, const char *status_name) { - return print("Status: %s %s\r\n", status, status_name); + status_num = status; + return print("Status: %d %s\r\n", status, status_name); } int RGWFCGX::send_100_continue() { - int r = send_status("100", "Continue"); + int r = send_status(100, "Continue"); if (r >= 0) { flush(); } @@ -47,6 +48,13 @@ int RGWFCGX::send_100_continue() int RGWFCGX::send_content_length(uint64_t len) { + /* + * Status 204 
should not include a content-length header + * RFC7230 says so + */ + if (status_num == 204) + return 0; + char buf[21]; snprintf(buf, sizeof(buf), "%" PRIu64, len); return print("Content-Length: %s\r\n", buf); @@ -56,4 +64,3 @@ int RGWFCGX::complete_header() { return print("\r\n"); } - diff --git a/ceph/src/rgw/rgw_fcgi.h b/ceph/src/rgw/rgw_fcgi.h index 8fc96fce..88889b50 100644 --- a/ceph/src/rgw/rgw_fcgi.h +++ b/ceph/src/rgw/rgw_fcgi.h @@ -13,18 +13,21 @@ struct FCGX_Request; class RGWFCGX : public RGWClientIO { FCGX_Request *fcgx; + + int status_num; + protected: void init_env(CephContext *cct); int write_data(const char *buf, int len); int read_data(char *buf, int len); - int send_status(const char *status, const char *status_name); + int send_status(int status, const char *status_name); int send_100_continue(); int complete_header(); int complete_request() { return 0; } int send_content_length(uint64_t len); public: - RGWFCGX(FCGX_Request *_fcgx) : fcgx(_fcgx) {} + RGWFCGX(FCGX_Request *_fcgx) : fcgx(_fcgx), status_num(0) {} void flush(); }; diff --git a/ceph/src/rgw/rgw_http_errors.h b/ceph/src/rgw/rgw_http_errors.h index 7850807b..bd169223 100644 --- a/ceph/src/rgw/rgw_http_errors.h +++ b/ceph/src/rgw/rgw_http_errors.h @@ -128,6 +128,8 @@ static inline int rgw_http_error_to_errno(int http_err) if (http_err >= 200 && http_err <= 299) return 0; switch (http_err) { + case 304: + return -ERR_NOT_MODIFIED; case 400: return -EINVAL; case 401: diff --git a/ceph/src/rgw/rgw_json_enc.cc b/ceph/src/rgw/rgw_json_enc.cc index d09fa65e..e8113357 100644 --- a/ceph/src/rgw/rgw_json_enc.cc +++ b/ceph/src/rgw/rgw_json_enc.cc @@ -747,6 +747,7 @@ void RGWRegionMap::decode_json(JSONObj *obj) { JSONDecoder::decode_json("regions", regions, obj); JSONDecoder::decode_json("master_region", master_region, obj); + JSONDecoder::decode_json("bucket_quota", bucket_quota, obj); JSONDecoder::decode_json("user_quota", user_quota, obj); } diff --git a/ceph/src/rgw/rgw_loadgen.cc 
b/ceph/src/rgw/rgw_loadgen.cc index 59b94625..14f63a19 100644 --- a/ceph/src/rgw/rgw_loadgen.cc +++ b/ceph/src/rgw/rgw_loadgen.cc @@ -92,7 +92,7 @@ void RGWLoadGenIO::init_env(CephContext *cct) env.set("SERVER_PORT", port_buf); } -int RGWLoadGenIO::send_status(const char *status, const char *status_name) +int RGWLoadGenIO::send_status(int status, const char *status_name) { return 0; } diff --git a/ceph/src/rgw/rgw_loadgen.h b/ceph/src/rgw/rgw_loadgen.h index 5a65ff43..e5636ed3 100644 --- a/ceph/src/rgw/rgw_loadgen.h +++ b/ceph/src/rgw/rgw_loadgen.h @@ -34,7 +34,7 @@ public: int write_data(const char *buf, int len); int read_data(char *buf, int len); - int send_status(const char *status, const char *status_name); + int send_status(int status, const char *status_name); int send_100_continue(); int complete_header(); int complete_request(); diff --git a/ceph/src/rgw/rgw_main.cc b/ceph/src/rgw/rgw_main.cc index 0ddd9de2..fc12a704 100644 --- a/ceph/src/rgw/rgw_main.cc +++ b/ceph/src/rgw/rgw_main.cc @@ -1165,6 +1165,16 @@ int main(int argc, const char **argv) for (list::iterator iter = frontends.begin(); iter != frontends.end(); ++iter) { string& f = *iter; + if (f.find("civetweb") != string::npos) { + if (f.find("port") != string::npos) { + // check for the most common ws problems + if ((f.find("port=") == string::npos) || + (f.find("port= ") != string::npos)) { + derr << "WARNING: civetweb frontend config found unexpected spacing around 'port' (ensure civetweb port parameter has the form 'port=80' with no spaces before or after '=')" << dendl; + } + } + } + RGWFrontendConfig *config = new RGWFrontendConfig(f); int r = config->init(); if (r < 0) { diff --git a/ceph/src/rgw/rgw_metadata.cc b/ceph/src/rgw/rgw_metadata.cc index ece9ebf2..e42290a8 100644 --- a/ceph/src/rgw/rgw_metadata.cc +++ b/ceph/src/rgw/rgw_metadata.cc @@ -355,9 +355,13 @@ int RGWMetadataManager::put(string& metadata_key, bufferlist& bl, time_t mtime = 0; - JSONDecoder::decode_json("key", metadata_key, 
&parser); - JSONDecoder::decode_json("ver", *objv, &parser); - JSONDecoder::decode_json("mtime", mtime, &parser); + try { + JSONDecoder::decode_json("key", metadata_key, &parser); + JSONDecoder::decode_json("ver", *objv, &parser); + JSONDecoder::decode_json("mtime", mtime, &parser); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } JSONObj *jo = parser.find_obj("data"); if (!jo) { diff --git a/ceph/src/rgw/rgw_op.cc b/ceph/src/rgw/rgw_op.cc index cd8785f1..ad467100 100644 --- a/ceph/src/rgw/rgw_op.cc +++ b/ceph/src/rgw/rgw_op.cc @@ -634,6 +634,18 @@ bool RGWOp::generate_cors_headers(string& origin, string& method, string& header if (!rule) return false; + /* + * Set the Allowed-Origin header to a asterisk if this is allowed in the rule + * and no Authorization was send by the client + * + * The origin parameter specifies a URI that may access the resource. The browser must enforce this. + * For requests without credentials, the server may specify "*" as a wildcard, + * thereby allowing any origin to access the resource. + */ + const char *authorization = s->info.env->get("HTTP_AUTHORIZATION"); + if (!authorization && rule->has_wildcard_origin()) + origin = "*"; + /* CORS 6.2.3. 
*/ const char *req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); if (!req_meth) { @@ -942,6 +954,7 @@ void RGWGetObj::execute() ret = handle_user_manifest(attr_iter->second.c_str()); if (ret < 0) { ldout(s->cct, 0) << "ERROR: failed to handle user manifest ret=" << ret << dendl; + goto done_err; } return; } @@ -1471,6 +1484,11 @@ void RGWDeleteBucket::execute() } } + ret = rgw_bucket_sync_user_stats(store, s->user.user_id, s->bucket); + if ( ret < 0) { + ldout(s->cct, 1) << "WARNING: failed to sync user stats before bucket delete: ret= " << ret << dendl; + } + ret = store->delete_bucket(s->bucket, ot); if (ret == 0) { @@ -1684,12 +1702,13 @@ static int put_data_and_throttle(RGWPutObjProcessor *processor, bufferlist& data do { void *handle; + rgw_obj obj; - int ret = processor->handle_data(data, ofs, hash, &handle, &again); + int ret = processor->handle_data(data, ofs, hash, &handle, &obj, &again); if (ret < 0) return ret; - ret = processor->throttle_data(handle, need_to_wait); + ret = processor->throttle_data(handle, obj, need_to_wait); if (ret < 0) return ret; @@ -2433,6 +2452,7 @@ void RGWCopyObj::execute() if_match, if_nomatch, attrs_mod, + copy_if_newer, attrs, RGW_OBJ_CATEGORY_MAIN, olh_epoch, (version_id.empty() ? 
NULL : &version_id), diff --git a/ceph/src/rgw/rgw_op.h b/ceph/src/rgw/rgw_op.h index c4a64aec..fd83401f 100644 --- a/ceph/src/rgw/rgw_op.h +++ b/ceph/src/rgw/rgw_op.h @@ -602,6 +602,7 @@ protected: string version_id; uint64_t olh_epoch; + bool copy_if_newer; int init_common(); @@ -624,6 +625,7 @@ public: attrs_mod = RGWRados::ATTRSMOD_NONE; last_ofs = 0; olh_epoch = 0; + copy_if_newer = false; } static bool parse_copy_location(const string& src, string& bucket_name, rgw_obj_key& object); diff --git a/ceph/src/rgw/rgw_quota.cc b/ceph/src/rgw/rgw_quota.cc index 910da2ff..70f45cb2 100644 --- a/ceph/src/rgw/rgw_quota.cc +++ b/ceph/src/rgw/rgw_quota.cc @@ -664,9 +664,14 @@ class RGWQuotaHandlerImpl : public RGWQuotaHandler { RGWRados *store; RGWBucketStatsCache bucket_stats_cache; RGWUserStatsCache user_stats_cache; + RGWQuotaInfo def_bucket_quota; + RGWQuotaInfo def_user_quota; int check_quota(const char *entity, RGWQuotaInfo& quota, RGWStorageStats& stats, uint64_t num_objs, uint64_t size_kb) { + if (!quota.enabled) + return 0; + ldout(store->ctx(), 20) << entity << " quota: max_objects=" << quota.max_objects << " max_size_kb=" << quota.max_size_kb << dendl; @@ -687,12 +692,29 @@ class RGWQuotaHandlerImpl : public RGWQuotaHandler { return 0; } public: - RGWQuotaHandlerImpl(RGWRados *_store, bool quota_threads) : store(_store), bucket_stats_cache(_store), user_stats_cache(_store, quota_threads) {} + RGWQuotaHandlerImpl(RGWRados *_store, bool quota_threads) : store(_store), bucket_stats_cache(_store), user_stats_cache(_store, quota_threads) { + if (store->ctx()->_conf->rgw_bucket_default_quota_max_objects >= 0) { + def_bucket_quota.max_objects = store->ctx()->_conf->rgw_bucket_default_quota_max_objects; + def_bucket_quota.enabled = true; + } + if (store->ctx()->_conf->rgw_bucket_default_quota_max_size >= 0) { + def_bucket_quota.max_size_kb = store->ctx()->_conf->rgw_bucket_default_quota_max_size; + def_bucket_quota.enabled = true; + } + if 
(store->ctx()->_conf->rgw_user_default_quota_max_objects >= 0) { + def_user_quota.max_objects = store->ctx()->_conf->rgw_user_default_quota_max_objects; + def_user_quota.enabled = true; + } + if (store->ctx()->_conf->rgw_user_default_quota_max_size >= 0) { + def_user_quota.max_size_kb = store->ctx()->_conf->rgw_user_default_quota_max_size; + def_user_quota.enabled = true; + } + } virtual int check_quota(const string& user, rgw_bucket& bucket, RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t num_objs, uint64_t size) { - if (!bucket_quota.enabled && !user_quota.enabled) + if (!bucket_quota.enabled && !user_quota.enabled && !def_bucket_quota.enabled && !def_user_quota.enabled) return 0; uint64_t size_kb = rgw_rounded_objsize_kb(size); @@ -715,16 +737,28 @@ public: return ret; } - if (user_quota.enabled) { + if (def_bucket_quota.enabled) { + ret = check_quota("def_bucket", def_bucket_quota, bucket_stats, num_objs, size_kb); + if (ret < 0) + return ret; + } + + if (user_quota.enabled || def_user_quota.enabled) { RGWStorageStats user_stats; ret = user_stats_cache.get_stats(user, bucket, user_stats, user_quota); if (ret < 0) return ret; - ret = check_quota("user", user_quota, user_stats, num_objs, size_kb); - if (ret < 0) - return ret; + if (user_quota.enabled) { + ret = check_quota("user", user_quota, user_stats, num_objs, size_kb); + if (ret < 0) + return ret; + } else if (def_user_quota.enabled) { + ret = check_quota("def_user", def_user_quota, user_stats, num_objs, size_kb); + if (ret < 0) + return ret; + } } return 0; diff --git a/ceph/src/rgw/rgw_rados.cc b/ceph/src/rgw/rgw_rados.cc index 05c41ef4..5854c64d 100644 --- a/ceph/src/rgw/rgw_rados.cc +++ b/ceph/src/rgw/rgw_rados.cc @@ -898,7 +898,7 @@ int RGWPutObjProcessor::complete(string& etag, time_t *mtime, time_t set_mtime, if (r < 0) return r; - is_complete = true; + is_complete = !canceled; return 0; } @@ -914,7 +914,7 @@ RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio() if (is_complete) return; - 
list::iterator iter; + set::iterator iter; bool is_multipart_obj = false; rgw_obj multipart_obj; @@ -926,7 +926,7 @@ RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio() * details is describled on #11749 */ for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) { - rgw_obj &obj = *iter; + const rgw_obj &obj = *iter; if (RGW_OBJ_NS_MULTIPART == obj.ns) { ldout(store->ctx(), 5) << "NOTE: we should not process the multipart object (" << obj << ") here" << dendl; multipart_obj = *iter; @@ -955,7 +955,6 @@ int RGWPutObjProcessor_Aio::handle_obj_data(rgw_obj& obj, bufferlist& bl, off_t obj_len = abs_ofs + bl.length(); if (!(obj == last_written_obj)) { - add_written_obj(obj); last_written_obj = obj; } @@ -965,7 +964,6 @@ int RGWPutObjProcessor_Aio::handle_obj_data(rgw_obj& obj, bufferlist& bl, off_t bl, ((ofs != 0) ? ofs : -1), exclusive, phandle); - return r; } @@ -984,6 +982,11 @@ int RGWPutObjProcessor_Aio::wait_pending_front() } struct put_obj_aio_info info = pop_pending(); int ret = store->aio_wait(info.handle); + + if (ret >= 0) { + add_written_obj(info.obj); + } + return ret; } @@ -1007,11 +1010,12 @@ int RGWPutObjProcessor_Aio::drain_pending() return ret; } -int RGWPutObjProcessor_Aio::throttle_data(void *handle, bool need_to_wait) +int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_obj& obj, bool need_to_wait) { if (handle) { struct put_obj_aio_info info; info.handle = handle; + info.obj = obj; pending.push_back(info); } size_t orig_size = pending.size(); @@ -1042,7 +1046,7 @@ int RGWPutObjProcessor_Aio::throttle_data(void *handle, bool need_to_wait) return 0; } -int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, bool exclusive) +int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_obj *pobj, bool exclusive) { if (ofs >= next_part_ofs) { int r = prepare_next_part(ofs); @@ -1051,10 +1055,12 @@ int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void 
**phan } } + *pobj = cur_obj; + return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive); } -int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, bool *again) +int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, rgw_obj *pobj, bool *again) { *again = false; @@ -1103,7 +1109,7 @@ int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, MD5 *hash, bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there we could be racing with another upload, to the same object and cleanup can be messy */ - int ret = write_data(bl, write_ofs, phandle, exclusive); + int ret = write_data(bl, write_ofs, phandle, pobj, exclusive); if (ret >= 0) { /* we might return, need to clear bl as it was already sent */ if (hash) { hash->Update((const byte *)bl.c_str(), bl.length()); @@ -1183,18 +1189,34 @@ int RGWPutObjProcessor_Atomic::complete_writing_data() first_chunk.claim(pending_data_bl); obj_len = (uint64_t)first_chunk.length(); } - if (pending_data_bl.length()) { + while (pending_data_bl.length()) { void *handle; - int r = write_data(pending_data_bl, data_ofs, &handle, false); + uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs); + if (max_write_size > pending_data_bl.length()) { + max_write_size = pending_data_bl.length(); + } + bufferlist bl; + pending_data_bl.splice(0, max_write_size, &bl); + rgw_obj obj; + int r = write_data(bl, data_ofs, &handle, &obj, false); if (r < 0) { ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl; return r; } - r = throttle_data(handle, false); + data_ofs += bl.length(); + r = throttle_data(handle, obj, false); if (r < 0) { ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl; return r; } + + if (data_ofs >= next_part_ofs) { + r = prepare_next_part(data_ofs); + if (r < 0) { + 
ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl; + return r; + } + } } int r = complete_parts(); if (r < 0) { @@ -1241,6 +1263,8 @@ int RGWPutObjProcessor_Atomic::do_complete(string& etag, time_t *mtime, time_t s return r; } + canceled = obj_op.meta.canceled; + return 0; } @@ -1401,6 +1425,10 @@ int RGWRados::get_required_alignment(rgw_bucket& bucket, uint64_t *alignment) } *alignment = ioctx.pool_required_alignment(); + if (*alignment != 0) { + ldout(cct, 20) << "required alignment=" << *alignment << dendl; + } + return 0; } @@ -1426,6 +1454,8 @@ int RGWRados::get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size) *max_chunk_size = config_chunk_size - (config_chunk_size % alignment); + ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl; + return 0; } @@ -3454,6 +3484,8 @@ int RGWRados::Object::Write::write_meta(uint64_t size, } } + meta.canceled = false; + /* update quota cache */ store->quota_handler->update_stats(meta.owner, bucket, (orig_exists ? 0 : 1), size, orig_size); @@ -3465,6 +3497,8 @@ done_cancel: ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl; } + meta.canceled = true; + /* we lost in a race. 
There are a few options: * - existing object was rewritten (ECANCELED) * - non existing object was created (EEXIST) @@ -3658,7 +3692,8 @@ public: do { void *handle; - int ret = processor->handle_data(bl, ofs, NULL, &handle, &again); + rgw_obj obj; + int ret = processor->handle_data(bl, ofs, NULL, &handle, &obj, &again); if (ret < 0) return ret; @@ -3669,7 +3704,7 @@ public: ret = opstate->renew_state(); if (ret < 0) { ldout(processor->ctx(), 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl; - int r = processor->throttle_data(handle, false); + int r = processor->throttle_data(handle, obj, false); if (r < 0) { ldout(processor->ctx(), 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl; } @@ -3680,7 +3715,7 @@ public: need_opstate = false; } - ret = processor->throttle_data(handle, false); + ret = processor->throttle_data(handle, obj, false); if (ret < 0) return ret; } while (again); @@ -3696,6 +3731,10 @@ public: int complete(string& etag, time_t *mtime, time_t set_mtime, map& attrs) { return processor->complete(etag, mtime, set_mtime, attrs); } + + bool is_canceled() { + return processor->is_canceled(); + } }; /* @@ -3777,6 +3816,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, const char *if_match, const char *if_nomatch, AttrsMod attrs_mod, + bool copy_if_newer, map& attrs, RGWObjCategory category, uint64_t olh_epoch, @@ -3792,6 +3832,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, RGWRESTStreamReadRequest *in_stream_req; string tag; map src_attrs; + int i; append_rand_alpha(cct, tag, tag, 32); RGWPutObjProcessor_Atomic processor(obj_ctx, @@ -3837,8 +3878,26 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, string etag; map req_headers; time_t set_mtime; + + RGWObjState *dest_state = NULL; + + time_t dest_mtime; + const time_t *pmod = mod_ptr; + + if (copy_if_newer) { + /* need to get mtime for destination */ + ret = get_obj_state(&obj_ctx, dest_obj, 
&dest_state, NULL); + if (ret < 0) + return ret; + + if (dest_state->exists) { + dest_mtime = dest_state->mtime; + pmod = &dest_mtime; + } + } + - ret = conn->get_obj(user_id, info, src_obj, true, &cb, &in_stream_req); + ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr, true, &cb, &in_stream_req); if (ret < 0) { goto set_err_state; } @@ -3881,8 +3940,34 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, attrs = src_attrs; } - ret = cb.complete(etag, mtime, set_mtime, attrs); - if (ret < 0) { +#define MAX_COMPLETE_RETRY 100 + for (i = 0; i < MAX_COMPLETE_RETRY; i++) { + ret = cb.complete(etag, mtime, set_mtime, attrs); + if (ret < 0) { + goto set_err_state; + } + if (copy_if_newer && cb.is_canceled()) { + ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl; + obj_ctx.invalidate(dest_obj); /* object was overwritten */ + ret = get_obj_state(&obj_ctx, dest_obj, &dest_state, NULL); + if (ret < 0) { + ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl; + goto set_err_state; + } + if (!dest_state->exists || + dest_state->mtime < set_mtime) { + ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; + continue; + } else { + ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; + } + } + break; + } + + if (i == MAX_COMPLETE_RETRY) { + ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" 
<< dendl; + ret = -EIO; goto set_err_state; } @@ -3893,7 +3978,12 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, return 0; set_err_state: - int r = opstate.set_state(RGWOpState::OPSTATE_ERROR); + RGWOpState::OpState state = RGWOpState::OPSTATE_ERROR; + if (copy_if_newer && ret == -ERR_NOT_MODIFIED) { + state = RGWOpState::OPSTATE_COMPLETE; + ret = 0; + } + int r = opstate.set_state(state); if (r < 0) { ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl; } @@ -3959,6 +4049,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, const char *if_match, const char *if_nomatch, AttrsMod attrs_mod, + bool copy_if_newer, map& attrs, RGWObjCategory category, uint64_t olh_epoch, @@ -3993,7 +4084,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, if (remote_src || !source_zone.empty()) { return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, info, source_zone, dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr, - unmod_ptr, if_match, if_nomatch, attrs_mod, attrs, category, + unmod_ptr, if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category, olh_epoch, version_id, ptag, petag, err, progress_cb, progress_data); } @@ -4240,12 +4331,13 @@ int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx, do { void *handle; + rgw_obj obj; - ret = processor.handle_data(bl, ofs, NULL, &handle, &again); + ret = processor.handle_data(bl, ofs, NULL, &handle, &obj, &again); if (ret < 0) { return ret; } - ret = processor.throttle_data(handle, false); + ret = processor.throttle_data(handle, obj, false); if (ret < 0) return ret; } while (again); @@ -4301,8 +4393,8 @@ bool RGWRados::is_syncing_bucket_meta(rgw_bucket& bucket) int RGWRados::delete_bucket(rgw_bucket& bucket, RGWObjVersionTracker& objv_tracker) { librados::IoCtx index_ctx; - string oid; - int r = open_bucket_index(bucket, index_ctx, oid); + map bucket_objs; + int r = open_bucket_index(bucket, index_ctx, bucket_objs); if (r < 0) return r; @@ -4343,6 +4435,11 @@ int 
RGWRados::delete_bucket(rgw_bucket& bucket, RGWObjVersionTracker& objv_track if (r < 0) { return r; } + /* remove bucket index objects*/ + map::const_iterator biter; + for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) { + index_ctx.remove(biter->second); + } } return 0; } @@ -4794,7 +4891,7 @@ int RGWRados::Object::Delete::delete_obj() return 0; } -int RGWRados::delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw_obj& obj, +int RGWRados::delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, int versioning_status, uint16_t bilog_flags) { RGWRados::Object del_target(this, bucket_info, obj_ctx, obj); @@ -5444,7 +5541,7 @@ int RGWRados::Object::Read::prepare(int64_t *pofs, int64_t *pend) if (conds.mod_ptr) { ldout(cct, 10) << "If-Modified-Since: " << *conds.mod_ptr << " Last-Modified: " << ctime << dendl; - if (ctime < *conds.mod_ptr) { + if (ctime <= *conds.mod_ptr) { return -ERR_NOT_MODIFIED; } } @@ -6470,7 +6567,7 @@ void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag); } -int RGWRados::bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, uint64_t olh_epoch) +int RGWRados::bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, const string& olh_tag, uint64_t olh_epoch) { rgw_rados_ref ref; rgw_bucket bucket; @@ -6487,7 +6584,7 @@ int RGWRados::bucket_index_unlink_instance(rgw_obj& obj_instance, const string& } cls_rgw_obj_key key(obj_instance.get_index_key_name(), obj_instance.get_instance()); - ret = cls_rgw_bucket_unlink_instance(bs.index_ctx, bs.bucket_obj, key, op_tag, olh_epoch, zone_public_config.log_data); + ret = cls_rgw_bucket_unlink_instance(bs.index_ctx, bs.bucket_obj, key, op_tag, olh_tag, olh_epoch, zone_public_config.log_data); if (ret < 0) { return ret; } @@ -6828,7 +6925,9 @@ int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& 
bucket_i return ret; } - ret = bucket_index_unlink_instance(target_obj, op_tag, olh_epoch); + string olh_tag(state->olh_tag.c_str(), state->olh_tag.length()); + + ret = bucket_index_unlink_instance(target_obj, op_tag, olh_tag, olh_epoch); if (ret < 0) { ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " returned " << ret << dendl; if (ret == -ECANCELED) { @@ -8078,6 +8177,7 @@ int RGWRados::cls_bucket_list(rgw_bucket& bucket, rgw_obj_key& start, const stri map updates; uint32_t count = 0; while (count < num_entries && !candidates.empty()) { + r = 0; // Select the next one int pos = candidates.begin()->second; const string& name = vcurrents[pos]->first; @@ -9013,3 +9113,48 @@ librados::Rados* RGWRados::get_rados_handle() } } +int RGWRados::delete_obj_aio(rgw_obj& obj, rgw_bucket& bucket, + RGWBucketInfo& bucket_info, RGWObjState *astate, + list& handles, bool keep_index_consistent) +{ + rgw_rados_ref ref; + int ret = get_obj_ref(obj, &ref, &bucket); + if (ret < 0) { + lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl; + return ret; + } + + if (keep_index_consistent) { + RGWRados::Bucket bop(this, bucket_info.bucket); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj, astate); + + ret = index_op.prepare(CLS_RGW_OP_DEL); + if (ret < 0) { + lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl; + return ret; + } + } + + ObjectWriteOperation op; + list prefixes; + cls_rgw_remove_obj(op, prefixes); + + AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL); + ret = ref.ioctx.aio_operate(ref.oid, c, &op); + if (ret < 0) { + lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl; + return ret; + } + + handles.push_back(c); + + if (keep_index_consistent) { + ret = delete_obj_index(obj); + if (ret < 0) { + lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl; + return ret; + } + } + return ret; +} + diff --git a/ceph/src/rgw/rgw_rados.h 
b/ceph/src/rgw/rgw_rados.h index 37c7e8a7..93077f8c 100644 --- a/ceph/src/rgw/rgw_rados.h +++ b/ceph/src/rgw/rgw_rados.h @@ -1487,7 +1487,7 @@ public: int complete_atomic_modification(); public: - Object(RGWRados *_store, RGWBucketInfo& _bucket_info, RGWObjectCtx& _ctx, rgw_obj& _obj) : store(_store), bucket_info(_bucket_info), + Object(RGWRados *_store, RGWBucketInfo& _bucket_info, RGWObjectCtx& _ctx, const rgw_obj& _obj) : store(_store), bucket_info(_bucket_info), ctx(_ctx), obj(_obj), bs(store), state(NULL), versioning_disabled(false), bs_initialized(false) {} @@ -1570,10 +1570,11 @@ public: const char *if_match; const char *if_nomatch; uint64_t olh_epoch; + bool canceled; MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL), remove_objs(NULL), set_mtime(0), category(RGW_OBJ_CATEGORY_MAIN), flags(0), - if_match(NULL), if_nomatch(NULL), olh_epoch(0) {} + if_match(NULL), if_nomatch(NULL), olh_epoch(0), canceled(false) {} } meta; Write(RGWRados::Object *_target) : target(_target) {} @@ -1767,6 +1768,7 @@ public: const char *if_match, const char *if_nomatch, AttrsMod attrs_mod, + bool copy_if_newer, map& attrs, RGWObjCategory category, uint64_t olh_epoch, @@ -1814,6 +1816,7 @@ public: const char *if_match, const char *if_nomatch, AttrsMod attrs_mod, + bool copy_if_newer, map& attrs, RGWObjCategory category, uint64_t olh_epoch, @@ -1857,7 +1860,7 @@ public: int bucket_suspended(rgw_bucket& bucket, bool *suspended); /** Delete an object.*/ - virtual int delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_owner, rgw_obj& src_obj, + virtual int delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_owner, const rgw_obj& src_obj, int versioning_status, uint16_t bilog_flags = 0); /* Delete a system object */ @@ -1945,7 +1948,7 @@ public: int bucket_index_link_olh(RGWObjState& olh_state, rgw_obj& obj_instance, bool delete_marker, const string& op_tag, struct rgw_bucket_dir_entry_meta *meta, uint64_t olh_epoch); - int 
bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, uint64_t olh_epoch); + int bucket_index_unlink_instance(rgw_obj& obj_instance, const string& op_tag, const string& olh_tag, uint64_t olh_epoch); int bucket_index_read_olh_log(RGWObjState& state, rgw_obj& obj_instance, uint64_t ver_marker, map > *log, bool *is_truncated); int bucket_index_trim_olh_log(RGWObjState& obj_state, rgw_obj& obj_instance, uint64_t ver); @@ -2159,6 +2162,8 @@ public: librados::Rados* get_rados_handle(); + int delete_obj_aio(rgw_obj& obj, rgw_bucket& bucket, RGWBucketInfo& info, RGWObjState *astate, + list& handles, bool keep_index_consistent); private: /** * This is a helper method, it generates a list of bucket index objects with the given @@ -2305,20 +2310,21 @@ protected: RGWObjectCtx& obj_ctx; bool is_complete; RGWBucketInfo bucket_info; + bool canceled; virtual int do_complete(string& etag, time_t *mtime, time_t set_mtime, map& attrs, const char *if_match = NULL, const char *if_nomatch = NULL) = 0; public: - RGWPutObjProcessor(RGWObjectCtx& _obj_ctx, RGWBucketInfo& _bi) : store(NULL), obj_ctx(_obj_ctx), is_complete(false), bucket_info(_bi) {} + RGWPutObjProcessor(RGWObjectCtx& _obj_ctx, RGWBucketInfo& _bi) : store(NULL), obj_ctx(_obj_ctx), is_complete(false), bucket_info(_bi), canceled(false) {} virtual ~RGWPutObjProcessor() {} virtual int prepare(RGWRados *_store, string *oid_rand) { store = _store; return 0; } - virtual int handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, bool *again) = 0; - virtual int throttle_data(void *handle, bool need_to_wait) = 0; + virtual int handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, rgw_obj *pobj, bool *again) = 0; + virtual int throttle_data(void *handle, const rgw_obj& obj, bool need_to_wait) = 0; virtual void complete_hash(MD5 *hash) { assert(0); } @@ -2327,10 +2333,13 @@ public: const char *if_match = NULL, const char *if_nomatch = NULL); CephContext *ctx(); + + bool is_canceled() { return 
canceled; } }; struct put_obj_aio_info { void *handle; + rgw_obj obj; }; class RGWPutObjProcessor_Aio : public RGWPutObjProcessor @@ -2347,17 +2356,17 @@ class RGWPutObjProcessor_Aio : public RGWPutObjProcessor protected: uint64_t obj_len; - list written_objs; + set written_objs; void add_written_obj(const rgw_obj& obj) { - written_objs.push_back(obj); + written_objs.insert(obj); } int drain_pending(); int handle_obj_data(rgw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive); public: - int throttle_data(void *handle, bool need_to_wait); + int throttle_data(void *handle, const rgw_obj& obj, bool need_to_wait); RGWPutObjProcessor_Aio(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info) : RGWPutObjProcessor(obj_ctx, bucket_info), max_chunks(RGW_MAX_PENDING_CHUNKS), obj_len(0) {} virtual ~RGWPutObjProcessor_Aio(); @@ -2392,7 +2401,7 @@ protected: RGWObjManifest manifest; RGWObjManifest::generator manifest_gen; - int write_data(bufferlist& bl, off_t ofs, void **phandle, bool exclusive); + int write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_obj *pobj, bool exclusive); virtual int do_complete(string& etag, time_t *mtime, time_t set_mtime, map& attrs, const char *if_match = NULL, const char *if_nomatch = NULL); @@ -2425,7 +2434,7 @@ public: void set_extra_data_len(uint64_t len) { extra_data_len = len; } - virtual int handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, bool *again); + virtual int handle_data(bufferlist& bl, off_t ofs, MD5 *hash, void **phandle, rgw_obj *pobj, bool *again); virtual void complete_hash(MD5 *hash); bufferlist& get_extra_data() { return extra_data_bl; } diff --git a/ceph/src/rgw/rgw_rest.cc b/ceph/src/rgw/rgw_rest.cc index 859e34a7..24fd40f2 100644 --- a/ceph/src/rgw/rgw_rest.cc +++ b/ceph/src/rgw/rgw_rest.cc @@ -246,7 +246,7 @@ static bool rgw_find_host_in_domains(const string& host, string *domain, string return false; } -static void dump_status(struct req_state *s, const char *status, 
const char *status_name) +static void dump_status(struct req_state *s, int status, const char *status_name) { int r = s->cio->send_status(status, status_name); if (r < 0) { @@ -305,16 +305,12 @@ void set_req_state_err(struct req_state *s, int err_no) void dump_errno(struct req_state *s) { - char buf[32]; - snprintf(buf, sizeof(buf), "%d", s->err.http_ret); - dump_status(s, buf, http_status_names[s->err.http_ret]); + dump_status(s, s->err.http_ret, http_status_names[s->err.http_ret]); } -void dump_errno(struct req_state *s, int err) +void dump_errno(struct req_state *s, int http_ret) { - char buf[32]; - snprintf(buf, sizeof(buf), "%d", err); - dump_status(s, buf, http_status_names[s->err.http_ret]); + dump_status(s, http_ret, http_status_names[http_ret]); } void dump_string_header(struct req_state *s, const char *name, const char *val) @@ -459,6 +455,15 @@ void dump_access_control(struct req_state *s, const char *origin, const char *me const char *hdr, const char *exp_hdr, uint32_t max_age) { if (origin && (origin[0] != '\0')) { s->cio->print("Access-Control-Allow-Origin: %s\r\n", origin); + + /* If the server specifies an origin host rather than "*", + * then it must also include Origin in the Vary response header + * to indicate to clients that server responses will differ + * based on the value of the Origin request header. + */ + if (strcmp(origin, "*") != 0) + s->cio->print("Vary: Origin\r\n"); + if (meth && (meth[0] != '\0')) s->cio->print("Access-Control-Allow-Methods: %s\r\n", meth); if (hdr && (hdr[0] != '\0')) @@ -1228,7 +1233,7 @@ int RGWHandler_ObjStore::read_permissions(RGWOp *op_obj) case OP_POST: case OP_COPY: /* is it a 'multi-object delete' request? 
*/ - if (s->info.request_params == "delete") { + if (s->info.args.exists("delete")) { only_bucket = true; break; } diff --git a/ceph/src/rgw/rgw_rest.h b/ceph/src/rgw/rgw_rest.h index 02ae790e..69ce33d5 100644 --- a/ceph/src/rgw/rgw_rest.h +++ b/ceph/src/rgw/rgw_rest.h @@ -53,9 +53,14 @@ int rgw_rest_get_json_input(CephContext *cct, req_state *s, T& out, int max_len, return -EINVAL; } - decode_json_obj(out, &parser); - free(data); + + try { + decode_json_obj(out, &parser); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } + return 0; } diff --git a/ceph/src/rgw/rgw_rest_conn.cc b/ceph/src/rgw/rgw_rest_conn.cc index cbffad0b..0e31a0f8 100644 --- a/ceph/src/rgw/rgw_rest_conn.cc +++ b/ceph/src/rgw/rgw_rest_conn.cc @@ -78,8 +78,21 @@ int RGWRESTConn::complete_request(RGWRESTStreamWriteRequest *req, string& etag, return ret; } -int RGWRESTConn::get_obj(const string& uid, req_info *info /* optional */, rgw_obj& obj, bool prepend_metadata, - RGWGetDataCB *cb, RGWRESTStreamReadRequest **req) +static void set_date_header(const time_t *t, map& headers, const string& header_name) +{ + if (!t) { + return; + } + stringstream s; + utime_t tm = utime_t(*t, 0); + tm.asctime(s); + headers["HTTP_IF_MODIFIED_SINCE"] = s.str(); +} + + +int RGWRESTConn::get_obj(const string& uid, req_info *info /* optional */, rgw_obj& obj, + const time_t *mod_ptr, const time_t *unmod_ptr, + bool prepend_metadata, RGWGetDataCB *cb, RGWRESTStreamReadRequest **req) { string url; int ret = get_url(url); @@ -108,6 +121,10 @@ int RGWRESTConn::get_obj(const string& uid, req_info *info /* optional */, rgw_o extra_headers[iter->first] = iter->second; } } + + set_date_header(mod_ptr, extra_headers, "HTTP_IF_MODIFIED_SINCE"); + set_date_header(unmod_ptr, extra_headers, "HTTP_IF_UNMODIFIED_SINCE"); + return (*req)->get_obj(key, extra_headers, obj); } diff --git a/ceph/src/rgw/rgw_rest_conn.h b/ceph/src/rgw/rgw_rest_conn.h index 209ddcf9..b39e570d 100644 --- a/ceph/src/rgw/rgw_rest_conn.h +++ 
b/ceph/src/rgw/rgw_rest_conn.h @@ -30,7 +30,9 @@ public: map& attrs, RGWRESTStreamWriteRequest **req); int complete_request(RGWRESTStreamWriteRequest *req, string& etag, time_t *mtime); - int get_obj(const string& uid, req_info *info /* optional */, rgw_obj& obj, bool prepend_metadata, RGWGetDataCB *cb, RGWRESTStreamReadRequest **req); + int get_obj(const string& uid, req_info *info /* optional */, rgw_obj& obj, + const time_t *mod_ptr, const time_t *unmod_ptr, + bool prepend_metadata, RGWGetDataCB *cb, RGWRESTStreamReadRequest **req); int complete_request(RGWRESTStreamReadRequest *req, string& etag, time_t *mtime, map& attrs); }; diff --git a/ceph/src/rgw/rgw_rest_s3.cc b/ceph/src/rgw/rgw_rest_s3.cc index 0cc7793a..81394dd9 100644 --- a/ceph/src/rgw/rgw_rest_s3.cc +++ b/ceph/src/rgw/rgw_rest_s3.cc @@ -167,10 +167,14 @@ done: s->cio->print("%s: %s\r\n", riter->first.c_str(), riter->second.c_str()); } - if (!content_type) - content_type = "binary/octet-stream"; + if (ret == -ERR_NOT_MODIFIED) { + end_header(s, this); + } else { + if (!content_type) + content_type = "binary/octet-stream"; - end_header(s, this, content_type); + end_header(s, this, content_type); + } if (metadata_bl.length()) { s->cio->write(metadata_bl.c_str(), metadata_bl.length()); @@ -263,8 +267,10 @@ void RGWListBucket_ObjStore_S3::send_versioned_response() s->formatter->dump_string("IsTruncated", (max && is_truncated ? "true" : "false")); bool encode_key = false; - if (strcasecmp(encoding_type.c_str(), "url") == 0) + if (strcasecmp(encoding_type.c_str(), "url") == 0) { + s->formatter->dump_string("EncodingType", "url"); encode_key = true; + } if (ret >= 0) { vector::iterator iter; @@ -340,8 +346,10 @@ void RGWListBucket_ObjStore_S3::send_response() s->formatter->dump_string("IsTruncated", (max && is_truncated ? 
"true" : "false")); bool encode_key = false; - if (strcasecmp(encoding_type.c_str(), "url") == 0) + if (strcasecmp(encoding_type.c_str(), "url") == 0) { + s->formatter->dump_string("EncodingType", "url"); encode_key = true; + } if (ret >= 0) { vector::iterator iter; @@ -1463,6 +1471,7 @@ int RGWCopyObj_ObjStore_S3::get_params() if (s->system_request) { source_zone = s->info.args.get(RGW_SYS_PARAM_PREFIX "source-zone"); + s->info.args.get_bool(RGW_SYS_PARAM_PREFIX "copy-if-newer", ©_if_newer, false); if (!source_zone.empty()) { client_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "client-id"); op_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "op-id"); @@ -1804,7 +1813,7 @@ void RGWListMultipart_ObjStore_S3::send_response() } s->formatter->dump_unsigned("PartNumber", info.num); - s->formatter->dump_string("ETag", info.etag); + s->formatter->dump_format("ETag", "\"%s\"", info.etag.c_str()); s->formatter->dump_unsigned("Size", info.size); s->formatter->close_section(); } @@ -2012,7 +2021,7 @@ RGWOp *RGWHandler_ObjStore_Bucket_S3::op_delete() RGWOp *RGWHandler_ObjStore_Bucket_S3::op_post() { - if ( s->info.request_params == "delete" ) { + if (s->info.args.exists("delete")) { return new RGWDeleteMultiObj_ObjStore_S3; } diff --git a/ceph/src/rgw/rgw_user.cc b/ceph/src/rgw/rgw_user.cc index 1e122df0..2947fd62 100644 --- a/ceph/src/rgw/rgw_user.cc +++ b/ceph/src/rgw/rgw_user.cc @@ -1120,6 +1120,63 @@ int RGWAccessKeyPool::remove(RGWUserAdminOpState& op_state, std::string *err_msg return 0; } +// remove all keys associated with a subuser +int RGWAccessKeyPool::remove_subuser_keys(RGWUserAdminOpState& op_state, + std::string *err_msg, bool defer_user_update) +{ + int ret = 0; + + if (!op_state.is_populated()) { + set_err_msg(err_msg, "user info was not populated"); + return -EINVAL; + } + + if (!op_state.has_subuser()) { + set_err_msg(err_msg, "no subuser specified"); + return -EINVAL; + } + + std::string swift_kid = op_state.build_default_swift_kid(); + if (swift_kid.empty()) { + 
set_err_msg(err_msg, "empty swift access key"); + return -EINVAL; + } + + map::iterator kiter; + map *keys_map; + + // a subuser can have at most one swift key + keys_map = swift_keys; + kiter = keys_map->find(swift_kid); + if (kiter != keys_map->end()) { + rgw_remove_key_index(store, kiter->second); + keys_map->erase(kiter); + } + + // a subuser may have multiple s3 key pairs + std::string subuser_str = op_state.get_subuser(); + keys_map = access_keys; + RGWUserInfo user_info = op_state.get_user_info(); + map::iterator user_kiter = user_info.access_keys.begin(); + for (; user_kiter != user_info.access_keys.end(); ++user_kiter) { + if (user_kiter->second.subuser == subuser_str) { + kiter = keys_map->find(user_kiter->first); + if (kiter != keys_map->end()) { + rgw_remove_key_index(store, kiter->second); + keys_map->erase(kiter); + } + } + } + + if (!defer_user_update) + ret = user->update(op_state, err_msg); + + if (ret < 0) + return ret; + + return 0; +} + RGWSubUserPool::RGWSubUserPool(RGWUser *usr) { subusers_allowed = (usr != NULL); @@ -1284,18 +1341,19 @@ int RGWSubUserPool::execute_remove(RGWUserAdminOpState& op_state, map::iterator siter; siter = subuser_map->find(subuser_str); - + if (siter == subuser_map->end()){ + set_err_msg(err_msg, "subuser not found: " + subuser_str); + return -EINVAL; + } if (!op_state.has_existing_subuser()) { set_err_msg(err_msg, "subuser not found: " + subuser_str); return -EINVAL; } - if (op_state.will_purge_keys()) { - // error would be non-existance so don't check - user->keys.remove(op_state, &subprocess_msg, true); - } + // always purge all associate keys + user->keys.remove_subuser_keys(op_state, &subprocess_msg, true); - //remove the subuser from the user info + // remove the subuser from the user info subuser_map->erase(siter); // attempt to save the subuser @@ -2428,7 +2486,11 @@ public: time_t mtime, JSONObj *obj, sync_type_t sync_mode) { RGWUserInfo info; - decode_json_obj(info, obj); + try { + decode_json_obj(info, 
obj); + } catch (JSONDecoder::err& e) { + return -EINVAL; + } RGWUserInfo old_info; time_t orig_mtime; diff --git a/ceph/src/rgw/rgw_user.h b/ceph/src/rgw/rgw_user.h index 6204b096..d0f43b96 100644 --- a/ceph/src/rgw/rgw_user.h +++ b/ceph/src/rgw/rgw_user.h @@ -479,6 +479,7 @@ private: /* API Contract Fulfilment */ int execute_add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); int execute_remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); + int remove_subuser_keys(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); int add(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); int remove(RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save); diff --git a/ceph/src/rocksdb/configure.ac b/ceph/src/rocksdb/configure.ac index 855c6a82..62bae252 100644 --- a/ceph/src/rocksdb/configure.ac +++ b/ceph/src/rocksdb/configure.ac @@ -19,7 +19,13 @@ AC_CHECK_LIB([snappy], [snappy_compress], [HAVE_LIBSNAPPY=yes], [AC_MSG_FAILURE( AC_CHECK_LIB([z], [gzread], [HAVE_LIBZ=yes], [AC_MSG_FAILURE([libz not found])]) AC_CHECK_LIB([bz2], [BZ2_bzCompressInit], [HAVE_LIBBZ2=yes], [AC_MSG_FAILURE([libbz2 not found])]) AC_CHECK_LIB([rt], [clock_gettime], [HAVE_LIBRT=yes], [AC_MSG_FAILURE([librt not found])]) -AC_CHECK_LIB([tcmalloc], [malloc], [HAVE_LIBTCMALLOC=yes],[AC_MSG_FAILURE([no tcmalloc found ])]) +AC_ARG_WITH([tcmalloc], + [AS_HELP_STRING([--without-tcmalloc], [disable tcmalloc for memory allocations])], + [], + [AC_CHECK_LIB([tcmalloc], + [malloc], + [HAVE_LIBTCMALLOC=yes], + [AC_MSG_FAILURE([no tcmalloc found ])])]) OLD_CXXFLAGS="$CXXFLAGS" CXXFLAGS="$CXXFLAGS -std=c++11" diff --git a/ceph/src/test/Makefile.am b/ceph/src/test/Makefile.am index ab77ba7a..4aa9b502 100644 --- a/ceph/src/test/Makefile.am +++ b/ceph/src/test/Makefile.am @@ -76,6 +76,7 @@ check_SCRIPTS += \ test/mon/osd-erasure-code-profile.sh \ test/mon/mkfs.sh \ test/osd/osd-scrub-repair.sh \ + 
test/osd/osd-scrub-snaps.sh \ test/osd/osd-config.sh \ test/osd/osd-bench.sh \ test/osd/osd-copy-from.sh \ diff --git a/ceph/src/test/bufferlist.cc b/ceph/src/test/bufferlist.cc index 3c8d0473..6ea987b0 100644 --- a/ceph/src/test/bufferlist.cc +++ b/ceph/src/test/bufferlist.cc @@ -1403,12 +1403,15 @@ TEST(BufferList, rebuild) { { bufferlist bl; bufferptr ptr(buffer::create_page_aligned(2)); + ptr[0] = 'X'; + ptr[1] = 'Y'; ptr.set_offset(1); ptr.set_length(1); bl.append(ptr); EXPECT_FALSE(bl.is_page_aligned()); bl.rebuild(); - EXPECT_FALSE(bl.is_page_aligned()); + EXPECT_EQ(1U, bl.length()); + EXPECT_EQ('Y', *bl.begin()); } { bufferlist bl; diff --git a/ceph/src/test/centos-6/ceph.spec.in b/ceph/src/test/centos-6/ceph.spec.in index 3f9a126c..befd9114 100644 --- a/ceph/src/test/centos-6/ceph.spec.in +++ b/ceph/src/test/centos-6/ceph.spec.in @@ -1,6 +1,11 @@ %bcond_with ocf %bcond_without cephfs_java +# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12 +%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315 +%bcond_without lttng +%endif + %if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600)) %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")} %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")} @@ -8,11 +13,6 @@ %{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d} -# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12 -%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315 -%global _with_lttng 1 -%endif - Name: ceph Version: @VERSION@ Release: @RPM_RELEASE@%{?dist} @@ -103,7 +103,10 @@ BuildRequires: %insserv_prereq BuildRequires: mozilla-nss-devel BuildRequires: keyutils-devel BuildRequires: libatomic-ops-devel -%else +Requires: lsb-release +BuildRequires: lsb-release +%endif +%if 0%{?fedora} || 0%{?rhel} Requires: gdisk BuildRequires: nss-devel BuildRequires: 
keyutils-libs-devel @@ -114,9 +117,11 @@ Requires(preun):chkconfig Requires(preun):initscripts BuildRequires: gperftools-devel Requires: python-flask +Requires: redhat-lsb-core +BuildRequires: redhat-lsb-core %endif # lttng and babeltrace for rbd-replay-prep -%if 0%{?_with_lttng} +%if %{with lttng} %if 0%{?fedora} || 0%{?rhel} BuildRequires: lttng-ust-devel BuildRequires: libbabeltrace-devel @@ -461,7 +466,7 @@ done %endif ./autogen.sh -MY_CONF_OPT="" +MY_CONF_OPT="$CEPH_EXTRA_CONFIGURE_ARGS" MY_CONF_OPT="$MY_CONF_OPT --with-radosgw" @@ -481,6 +486,10 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'` %endif --with-librocksdb-static=check \ $MY_CONF_OPT \ +%if %{without lttng} + --without-lttng \ + --without-babeltrace \ +%endif %{?_with_ocf} \ CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS" @@ -627,7 +636,7 @@ fi %{_libdir}/rados-classes/libcls_version.so* %dir %{_libdir}/ceph/erasure-code %{_libdir}/ceph/erasure-code/libec_*.so* -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/libos_tp.so* %{_libdir}/libosd_tp.so* %endif @@ -680,7 +689,7 @@ fi %{_bindir}/rbd %{_bindir}/rbd-replay %{_bindir}/rbd-replay-many -%if 0%{?_with_lttng} +%if %{with lttng} %{_bindir}/rbd-replay-prep %endif %{_bindir}/ceph-post-file @@ -786,7 +795,7 @@ fi %files -n librados2 %defattr(-,root,root,-) %{_libdir}/librados.so.* -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librados_tp.so.* %endif @@ -809,7 +818,7 @@ fi %{_includedir}/rados/rados_types.hpp %{_includedir}/rados/memory.h %{_libdir}/librados.so -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librados_tp.so %endif @@ -841,7 +850,7 @@ fi %files -n librbd1 %defattr(-,root,root,-) %{_libdir}/librbd.so.* -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librbd_tp.so.* %endif @@ -861,7 +870,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %{_includedir}/rbd/librbd.hpp %{_includedir}/rbd/features.h %{_libdir}/librbd.so -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librbd_tp.so %endif 
diff --git a/ceph/src/test/centos-7/ceph.spec.in b/ceph/src/test/centos-7/ceph.spec.in index 3f9a126c..befd9114 100644 --- a/ceph/src/test/centos-7/ceph.spec.in +++ b/ceph/src/test/centos-7/ceph.spec.in @@ -1,6 +1,11 @@ %bcond_with ocf %bcond_without cephfs_java +# LTTng-UST enabled on Fedora, RHEL 6+, and SLES 12 +%if 0%{?fedora} || 0%{?rhel} >= 6 || 0%{?suse_version} == 1315 +%bcond_without lttng +%endif + %if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600)) %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")} %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")} @@ -8,11 +13,6 @@ %{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d} -# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12 -%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315 -%global _with_lttng 1 -%endif - Name: ceph Version: @VERSION@ Release: @RPM_RELEASE@%{?dist} @@ -103,7 +103,10 @@ BuildRequires: %insserv_prereq BuildRequires: mozilla-nss-devel BuildRequires: keyutils-devel BuildRequires: libatomic-ops-devel -%else +Requires: lsb-release +BuildRequires: lsb-release +%endif +%if 0%{?fedora} || 0%{?rhel} Requires: gdisk BuildRequires: nss-devel BuildRequires: keyutils-libs-devel @@ -114,9 +117,11 @@ Requires(preun):chkconfig Requires(preun):initscripts BuildRequires: gperftools-devel Requires: python-flask +Requires: redhat-lsb-core +BuildRequires: redhat-lsb-core %endif # lttng and babeltrace for rbd-replay-prep -%if 0%{?_with_lttng} +%if %{with lttng} %if 0%{?fedora} || 0%{?rhel} BuildRequires: lttng-ust-devel BuildRequires: libbabeltrace-devel @@ -461,7 +466,7 @@ done %endif ./autogen.sh -MY_CONF_OPT="" +MY_CONF_OPT="$CEPH_EXTRA_CONFIGURE_ARGS" MY_CONF_OPT="$MY_CONF_OPT --with-radosgw" @@ -481,6 +486,10 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'` %endif 
--with-librocksdb-static=check \ $MY_CONF_OPT \ +%if %{without lttng} + --without-lttng \ + --without-babeltrace \ +%endif %{?_with_ocf} \ CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS" @@ -627,7 +636,7 @@ fi %{_libdir}/rados-classes/libcls_version.so* %dir %{_libdir}/ceph/erasure-code %{_libdir}/ceph/erasure-code/libec_*.so* -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/libos_tp.so* %{_libdir}/libosd_tp.so* %endif @@ -680,7 +689,7 @@ fi %{_bindir}/rbd %{_bindir}/rbd-replay %{_bindir}/rbd-replay-many -%if 0%{?_with_lttng} +%if %{with lttng} %{_bindir}/rbd-replay-prep %endif %{_bindir}/ceph-post-file @@ -786,7 +795,7 @@ fi %files -n librados2 %defattr(-,root,root,-) %{_libdir}/librados.so.* -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librados_tp.so.* %endif @@ -809,7 +818,7 @@ fi %{_includedir}/rados/rados_types.hpp %{_includedir}/rados/memory.h %{_libdir}/librados.so -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librados_tp.so %endif @@ -841,7 +850,7 @@ fi %files -n librbd1 %defattr(-,root,root,-) %{_libdir}/librbd.so.* -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librbd_tp.so.* %endif @@ -861,7 +870,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %{_includedir}/rbd/librbd.hpp %{_includedir}/rbd/features.h %{_libdir}/librbd.so -%if 0%{?_with_lttng} +%if %{with lttng} %{_libdir}/librbd_tp.so %endif diff --git a/ceph/src/test/ceph_objectstore_tool.py b/ceph/src/test/ceph_objectstore_tool.py index 52ae51ce..93a44388 100755 --- a/ceph/src/test/ceph_objectstore_tool.py +++ b/ceph/src/test/ceph_objectstore_tool.py @@ -8,7 +8,7 @@ except ImportError: import subprocess # backported from python 2.7 stdlib process = subprocess.Popen( - stdout=subprocess.PIPE, *popenargs, **kwargs) + stdout=subprocess.PIPE, *popenargs, **kwargs) output, unused_err = process.communicate() retcode = process.poll() if retcode: @@ -20,21 +20,32 @@ except ImportError: raise error return output -import subprocess +import filecmp import os +import 
subprocess +try: + from subprocess import DEVNULL +except ImportError: + subprocess.DEVNULL = open(os.devnull, "w") + +import math import time import sys import re import string import logging import json +import tempfile logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) def wait_for_health(): print "Wait for health_ok...", + tries = 0 while call("./ceph health 2> /dev/null | grep -v 'HEALTH_OK\|HEALTH_WARN' > /dev/null", shell=True) == 0: + if ++tries == 30: + raise Exception("Time exceeded to go to health") time.sleep(5) print "DONE" @@ -51,7 +62,7 @@ def get_osd_pgs(SUBDIR, ID): if ID: endhead = re.compile("{id}.*_head$".format(id=ID)) DIR = os.path.join(SUBDIR, "current") - PGS += [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and (ID == None or endhead.match(f))] + PGS += [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and (ID is None or endhead.match(f))] PGS = [re.sub("_head", "", p) for p in PGS if "_head" in p] return PGS @@ -79,7 +90,7 @@ def get_objs(ALLPGS, prefix, DIR, ID): continue FINALDIR = os.path.join(SUBDIR, PGDIR) # See if there are any objects there - if [ f for f in [ val for _, _, fl in os.walk(FINALDIR) for val in fl ] if string.find(f, prefix) == 0 ]: + if any(f for f in [val for _, _, fl in os.walk(FINALDIR) for val in fl] if f.startswith(prefix)): PGS += [p] return sorted(set(PGS)) @@ -124,51 +135,44 @@ def cat_file(level, filename): print "" -def vstart(new): +def vstart(new, opt=""): print "vstarting....", - OPT = new and "-n" or "" - call("MON=1 OSD=4 CEPH_PORT=7400 ./vstart.sh -l {opt} -d mon osd > /dev/null 2>&1".format(opt=OPT), shell=True) + NEW = new and "-n" or "" + call("MON=1 OSD=4 CEPH_PORT=7400 ./vstart.sh -l {new} -d mon osd {opt} > /dev/null 2>&1".format(new=NEW, opt=opt), shell=True) print "DONE" -def test_failure_tty(cmd, errmsg): - try: - ttyfd = open("/dev/tty", "rw") - except Exception, e: - logging.info(str(e)) - logging.info("SKIP " + cmd) - return 0 
+ +def test_failure(cmd, errmsg, tty=False): + if tty: + try: + ttyfd = open("/dev/tty", "rw") + except Exception, e: + logging.info(str(e)) + logging.info("SKIP " + cmd) + return 0 TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid()) tmpfd = open(TMPFILE, "w") logging.debug(cmd) - ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd) - ttyfd.close() + if tty: + ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd) + ttyfd.close() + else: + ret = call(cmd, shell=True, stderr=tmpfd) tmpfd.close() if ret == 0: + logging.error(cmd) logging.error("Should have failed, but got exit 0") return 1 lines = get_lines(TMPFILE) - line = lines[0] - if line == errmsg: - logging.info("Correctly failed with message \"" + line + "\"") + matched = [ l for l in lines if errmsg in l ] + if any(matched): + logging.info("Correctly failed with message \"" + matched[0] + "\"") return 0 else: - logging.error("Bad message to stderr \"" + line + "\"") + logging.error("Bad messages to stderr \"" + str(lines) + "\"") return 1 -def test_failure(cmd, errmsg): - logging.debug(cmd) - try: - out = check_output(cmd, stderr=subprocess.STDOUT, shell=True) - logging.error("Should have failed, but got exit 0") - return 1 - except subprocess.CalledProcessError, e: - if errmsg in e.output: - logging.info("Correctly failed with message \"" + errmsg + "\"") - return 0 - else: - logging.error("Bad message to stderr \"" + e.output + "\"") - return 1 def get_nspace(num): if num == 0: @@ -176,14 +180,19 @@ def get_nspace(num): return "ns{num}".format(num=num) -def verify(DATADIR, POOL, NAME_PREFIX): +def verify(DATADIR, POOL, NAME_PREFIX, db): TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid()) nullfd = open(os.devnull, "w") ERRORS = 0 - for nsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(NAME_PREFIX) == 0]: + for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(NAME_PREFIX) == 0]: + nsfile = rawnsfile.split("__")[0] + clone = 
rawnsfile.split("__")[1] nspace = nsfile.split("-")[0] file = nsfile.split("-")[1] - path = os.path.join(DATADIR, nsfile) + # Skip clones + if clone != "head": + continue + path = os.path.join(DATADIR, rawnsfile) try: os.unlink(TMPFILE) except: @@ -201,20 +210,361 @@ def verify(DATADIR, POOL, NAME_PREFIX): os.unlink(TMPFILE) except: pass + for key, val in db[nspace][file]["xattr"].iteritems(): + cmd = "./rados -p {pool} -N '{nspace}' getxattr {name} {key}".format(pool=POOL, name=file, key=key, nspace=nspace) + logging.debug(cmd) + getval = check_output(cmd, shell=True, stderr=nullfd) + logging.debug("getxattr {key} {val}".format(key=key, val=getval)) + if getval != val: + logging.error("getxattr of key {key} returned wrong val: {get} instead of {orig}".format(key=key, get=getval, orig=val)) + ERRORS += 1 + continue + hdr = db[nspace][file].get("omapheader", "") + cmd = "./rados -p {pool} -N '{nspace}' getomapheader {name} {file}".format(pool=POOL, name=file, nspace=nspace, file=TMPFILE) + logging.debug(cmd) + ret = call(cmd, shell=True, stderr=nullfd) + if ret != 0: + logging.error("rados getomapheader returned {ret}".format(ret=ret)) + ERRORS += 1 + else: + getlines = get_lines(TMPFILE) + assert(len(getlines) == 0 or len(getlines) == 1) + if len(getlines) == 0: + gethdr = "" + else: + gethdr = getlines[0] + logging.debug("header: {hdr}".format(hdr=gethdr)) + if gethdr != hdr: + logging.error("getomapheader returned wrong val: {get} instead of {orig}".format(get=gethdr, orig=hdr)) + ERRORS += 1 + for key, val in db[nspace][file]["omap"].iteritems(): + cmd = "./rados -p {pool} -N '{nspace}' getomapval {name} {key} {file}".format(pool=POOL, name=file, key=key, nspace=nspace, file=TMPFILE) + logging.debug(cmd) + ret = call(cmd, shell=True, stderr=nullfd) + if ret != 0: + logging.error("getomapval returned {ret}".format(ret=ret)) + ERRORS += 1 + continue + getlines = get_lines(TMPFILE) + if len(getlines) != 1: + logging.error("Bad data from getomapval 
{lines}".format(lines=getlines)) + ERRORS += 1 + continue + getval = getlines[0] + logging.debug("getomapval {key} {val}".format(key=key, val=getval)) + if getval != val: + logging.error("getomapval returned wrong val: {get} instead of {orig}".format(get=getval, orig=val)) + ERRORS += 1 + try: + os.unlink(TMPFILE) + except: + pass + return ERRORS + + +def check_journal(jsondict): + errors = 0 + if 'header' not in jsondict: + logging.error("Key 'header' not in dump-journal") + errors += 1 + elif 'max_size' not in jsondict['header']: + logging.error("Key 'max_size' not in dump-journal header") + errors += 1 + else: + print "\tJournal max_size = {size}".format(size=jsondict['header']['max_size']) + if 'entries' not in jsondict: + logging.error("Key 'entries' not in dump-journal output") + errors += 1 + elif len(jsondict['entries']) == 0: + logging.info("No entries in journal found") + else: + errors += check_journal_entries(jsondict['entries']) + return errors + + +def check_journal_entries(entries): + errors = 0 + for enum in range(len(entries)): + if 'offset' not in entries[enum]: + logging.error("No 'offset' key in entry {e}".format(e=enum)) + errors += 1 + if 'seq' not in entries[enum]: + logging.error("No 'seq' key in entry {e}".format(e=enum)) + errors += 1 + if 'transactions' not in entries[enum]: + logging.error("No 'transactions' key in entry {e}".format(e=enum)) + errors += 1 + elif len(entries[enum]['transactions']) == 0: + logging.error("No transactions found in entry {e}".format(e=enum)) + errors += 1 + else: + errors += check_entry_transactions(entries[enum], enum) + return errors + + +def check_entry_transactions(entry, enum): + errors = 0 + for tnum in range(len(entry['transactions'])): + if 'trans_num' not in entry['transactions'][tnum]: + logging.error("Key 'trans_num' missing from entry {e} trans {t}".format(e=enum, t=tnum)) + errors += 1 + elif entry['transactions'][tnum]['trans_num'] != tnum: + ft = entry['transactions'][tnum]['trans_num'] + 
logging.error("Bad trans_num ({ft}) entry {e} trans {t}".format(ft=ft, e=enum, t=tnum)) + errors += 1 + if 'ops' not in entry['transactions'][tnum]: + logging.error("Key 'ops' missing from entry {e} trans {t}".format(e=enum, t=tnum)) + errors += 1 + else: + errors += check_transaction_ops(entry['transactions'][tnum]['ops'], enum, tnum) + return errors + + +def check_transaction_ops(ops, enum, tnum): + if len(ops) is 0: + logging.warning("No ops found in entry {e} trans {t}".format(e=enum, t=tnum)) + errors = 0 + for onum in range(len(ops)): + if 'op_num' not in ops[onum]: + logging.error("Key 'op_num' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum)) + errors += 1 + elif ops[onum]['op_num'] != onum: + fo = ops[onum]['op_num'] + logging.error("Bad op_num ({fo}) from entry {e} trans {t} op {o}".format(fo=fo, e=enum, t=tnum, o=onum)) + errors += 1 + if 'op_name' not in ops[onum]: + logging.error("Key 'op_name' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum)) + errors += 1 + return errors + + +def test_dump_journal(CFSD_PREFIX, osds): + ERRORS = 0 + pid = os.getpid() + TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid) + + for osd in osds: + # Test --op dump-journal by loading json + cmd = (CFSD_PREFIX + "--op dump-journal --format json").format(osd=osd) + logging.debug(cmd) + tmpfd = open(TMPFILE, "w") + ret = call(cmd, shell=True, stdout=tmpfd) + if ret != 0: + logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd)) + ERRORS += 1 + continue + tmpfd.close() + tmpfd = open(TMPFILE, "r") + jsondict = json.load(tmpfd) + tmpfd.close() + os.unlink(TMPFILE) + + journal_errors = check_journal(jsondict) + if journal_errors is not 0: + logging.error(jsondict) + ERRORS += journal_errors + return ERRORS + CEPH_DIR = "ceph_objectstore_tool_dir" CEPH_CONF = os.path.join(CEPH_DIR, 'ceph.conf') + def kill_daemons(): call("./init-ceph -c {conf} stop osd mon > /dev/null 2>&1".format(conf=CEPH_CONF), shell=True) + +def 
check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME): + repcount = 0 + ERRORS = 0 + for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(SPLIT_NAME) == 0]: + nsfile = rawnsfile.split("__")[0] + clone = rawnsfile.split("__")[1] + nspace = nsfile.split("-")[0] + file = nsfile.split("-")[1] + "__" + clone + # Skip clones + if clone != "head": + continue + path = os.path.join(DATADIR, rawnsfile) + tmpfd = open(TMPFILE, "w") + cmd = "find {dir} -name '{file}_*_{nspace}_*'".format(dir=OSDDIR, file=file, nspace=nspace) + logging.debug(cmd) + ret = call(cmd, shell=True, stdout=tmpfd) + if ret: + logging.critical("INTERNAL ERROR") + return 1 + tmpfd.close() + obj_locs = get_lines(TMPFILE) + if len(obj_locs) == 0: + logging.error("Can't find imported object {name}".format(name=file)) + ERRORS += 1 + for obj_loc in obj_locs: + repcount += 1 + cmd = "diff -q {src} {obj_loc}".format(src=path, obj_loc=obj_loc) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("{file} data not imported properly into {obj}".format(file=file, obj=obj_loc)) + ERRORS += 1 + return ERRORS, repcount + + +def set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight): + # change the weight of osd.0 to math.pi in the newest osdmap of given osd + osdmap_file = tempfile.NamedTemporaryFile() + cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path, + osdmap_file=osdmap_file.name) + output = check_output(cmd, shell=True) + epoch = int(re.findall('#(\d+)', output)[0]) + + new_crush_file = tempfile.NamedTemporaryFile(delete=False) + old_crush_file = tempfile.NamedTemporaryFile(delete=False) + ret = call("./osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name, + crush_file=old_crush_file.name), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + shell=True) + assert(ret == 0) + + for osd_id in osd_ids: + cmd = "./crushtool -i {crush_file} --reweight-item osd.{osd} {weight} -o 
{new_crush_file}".format(osd=osd_id, + crush_file=old_crush_file.name, + weight=weight, + new_crush_file=new_crush_file.name) + ret = call(cmd, stdout=subprocess.DEVNULL, shell=True) + assert(ret == 0) + old_crush_file, new_crush_file = new_crush_file, old_crush_file + + # change them back, since we don't need to preapre for another round + old_crush_file, new_crush_file = new_crush_file, old_crush_file + old_crush_file.close() + + ret = call("./osdmaptool --import-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name, + crush_file=new_crush_file.name), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + shell=True) + assert(ret == 0) + + # Minimum test of --dry-run by using it, but not checking anything + cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force --dry-run" + cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch) + ret = call(cmd, stdout=subprocess.DEVNULL, shell=True) + assert(ret == 0) + + # osdmaptool increases the epoch of the changed osdmap, so we need to force the tool + # to use use a different epoch than the one in osdmap + cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force" + cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch) + ret = call(cmd, stdout=subprocess.DEVNULL, shell=True) + return ret == 0 + +def get_osd_weights(CFSD_PREFIX, osd_ids, osd_path): + osdmap_file = tempfile.NamedTemporaryFile() + cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path, + osdmap_file=osdmap_file.name) + ret = call(cmd, stdout=subprocess.DEVNULL, shell=True) + if ret != 0: + return None + # we have to read the weights from the crush map, even we can query the weights using + # osdmaptool, but please keep in mind, they are different: + # item weights in crush map versus weight associated with each osd in osdmap + crush_file = tempfile.NamedTemporaryFile(delete=False) + ret = call("./osdmaptool --export-crush 
{crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name, + crush_file=crush_file.name), + stdout=subprocess.DEVNULL, + shell=True) + assert(ret == 0) + output = check_output("./crushtool --tree -i {crush_file} | tail -n {num_osd}".format(crush_file=crush_file.name, + num_osd=len(osd_ids)), + stderr=subprocess.DEVNULL, + shell=True) + weights = [] + for line in output.strip().split('\n'): + osd_id, weight, osd_name = re.split('\s+', line) + weights.append(float(weight)) + return weights + + +def test_get_set_osdmap(CFSD_PREFIX, osd_ids, osd_paths): + print "Testing get-osdmap and set-osdmap" + errors = 0 + kill_daemons() + weight = 1 / math.e # just some magic number in [0, 1] + changed = [] + for osd_path in osd_paths: + if set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight): + changed.append(osd_path) + else: + logging.warning("Failed to change the weights: {0}".format(osd_path)) + # i am pissed off if none of the store gets changed + if not changed: + errors += 1 + + for osd_path in changed: + weights = get_osd_weights(CFSD_PREFIX, osd_ids, osd_path) + if not weights: + errors += 1 + continue + if any(abs(w - weight) > 1e-5 for w in weights): + logging.warning("Weight is not changed: {0} != {1}".format(weights, weight)) + errors += 1 + return errors + +def test_get_set_inc_osdmap(CFSD_PREFIX, osd_path): + # incrementals are not used unless we need to build an MOSDMap to update + # OSD's peers, so an obvious way to test it is simply overwrite an epoch + # with a different copy, and read it back to see if it matches. 
+ kill_daemons() + file_e2 = tempfile.NamedTemporaryFile() + cmd = (CFSD_PREFIX + "--op get-inc-osdmap --file {file}").format(osd=osd_path, + file=file_e2.name) + output = check_output(cmd, shell=True) + epoch = int(re.findall('#(\d+)', output)[0]) + # backup e1 incremental before overwriting it + epoch -= 1 + file_e1_backup = tempfile.NamedTemporaryFile() + cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}" + ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True) + if ret: return 1 + # overwrite e1 with e2 + cmd = CFSD_PREFIX + "--op set-inc-osdmap --force --epoch {epoch} --file {file}" + ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e2.name), shell=True) + if ret: return 1 + # Use dry-run to set back to e1 which shouldn't happen + cmd = CFSD_PREFIX + "--op set-inc-osdmap --dry-run --epoch {epoch} --file {file}" + ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True) + if ret: return 1 + # read from e1 + file_e1_read = tempfile.NamedTemporaryFile(delete=False) + cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}" + ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_read.name), shell=True) + if ret: return 1 + errors = 0 + try: + if not filecmp.cmp(file_e2.name, file_e1_read.name, shallow=False): + logging.error("{{get,set}}-inc-osdmap mismatch {0} != {1}".format(file_e2.name, file_e1_read.name)) + errors += 1 + finally: + # revert the change with file_e1_backup + cmd = CFSD_PREFIX + "--op set-inc-osdmap --epoch {epoch} --file {file}" + ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True) + if ret: + logging.error("Failed to revert the changed inc-osdmap") + errors += 1 + return errors + + def main(argv): sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) - nullfd = open(os.devnull, "w") + if len(argv) > 1 and argv[1] == "debug": + nullfd = sys.stdout + else: + nullfd = open(os.devnull, "w") - 
call("rm -fr ceph_objectstore_tool_dir ; mkdir ceph_objectstore_tool_dir", shell=True) - os.environ["CEPH_DIR"] = CEPH_DIR; + call("rm -fr {dir}; mkdir {dir}".format(dir=CEPH_DIR), shell=True) + os.environ["CEPH_DIR"] = CEPH_DIR OSDDIR = os.path.join(CEPH_DIR, "dev") REP_POOL = "rep_pool" REP_NAME = "REPobject" @@ -287,6 +637,7 @@ def main(argv): NAME = REP_NAME + "{num}".format(num=i) LNAME = nspace + "-" + NAME DDNAME = os.path.join(DATADIR, LNAME) + DDNAME += "__head" cmd = "rm -f " + DDNAME logging.debug(cmd) @@ -306,7 +657,7 @@ def main(argv): logging.debug(cmd) ret = call(cmd, shell=True, stderr=nullfd) if ret != 0: - logging.critical("Replicated pool object creation failed with {ret}".format(ret=ret)) + logging.critical("Rados put command failed with {ret}".format(ret=ret)) return 1 db[nspace][NAME] = {} @@ -353,6 +704,45 @@ def main(argv): logging.critical("setomapval failed with {ret}".format(ret=ret)) db[nspace][NAME]["omap"][mykey] = myval + # Create some clones + cmd = "./rados -p {pool} mksnap snap1".format(pool=REP_POOL) + logging.debug(cmd) + call(cmd, shell=True) + + objects = range(1, NUM_REP_OBJECTS + 1) + nspaces = range(NUM_NSPACES) + for n in nspaces: + nspace = get_nspace(n) + + for i in objects: + NAME = REP_NAME + "{num}".format(num=i) + LNAME = nspace + "-" + NAME + DDNAME = os.path.join(DATADIR, LNAME) + # First clone + CLONENAME = DDNAME + "__1" + DDNAME += "__head" + + cmd = "mv -f " + DDNAME + " " + CLONENAME + logging.debug(cmd) + call(cmd, shell=True) + + if i == 1: + dataline = range(DATALINECOUNT) + else: + dataline = range(1) + fd = open(DDNAME, "w") + data = "This is the replicated data after a snapshot for " + LNAME + "\n" + for _ in dataline: + fd.write(data) + fd.close() + + cmd = "./rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=REP_POOL, name=NAME, ddname=DDNAME, nspace=nspace) + logging.debug(cmd) + ret = call(cmd, shell=True, stderr=nullfd) + if ret != 0: + logging.critical("Rados put command failed with 
{ret}".format(ret=ret)) + return 1 + print "Creating {objs} objects in erasure coded pool".format(objs=(NUM_EC_OBJECTS*NUM_NSPACES)) objects = range(1, NUM_EC_OBJECTS + 1) @@ -364,6 +754,7 @@ def main(argv): NAME = EC_NAME + "{num}".format(num=i) LNAME = nspace + "-" + NAME DDNAME = os.path.join(DATADIR, LNAME) + DDNAME += "__head" cmd = "rm -f " + DDNAME logging.debug(cmd) @@ -436,31 +827,64 @@ def main(argv): print "Test invalid parameters" # On export can't use stdout to a terminal cmd = (CFSD_PREFIX + "--op export --pgid {pg}").format(osd=ONEOSD, pg=ONEPG) - ERRORS += test_failure_tty(cmd, "stdout is a tty and no --file filename specified") + ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True) # On export can't use stdout to a terminal cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG) - ERRORS += test_failure_tty(cmd, "stdout is a tty and no --file filename specified") + ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True) + # Prep a valid ec export file for import failure tests + ONEECPG = ALLECPGS[0] + osds = get_osds(ONEECPG, OSDDIR) + ONEECOSD = osds[0] OTHERFILE = "/tmp/foo.{pid}".format(pid=pid) - foofd = open(OTHERFILE, "w") - foofd.close() + cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=ONEECPG, file=OTHERFILE) + logging.debug(cmd) + call(cmd, shell=True, stdout=nullfd, stderr=nullfd) - # On import can't specify a PG - cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {FOO}").format(osd=ONEOSD, pg=ONEPG, FOO=OTHERFILE) - ERRORS += test_failure(cmd, "--pgid option invalid with import") + # On import can't specify a different shard + BADPG = ONEECPG.split('s')[0] + "s10" + cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=BADPG, file=OTHERFILE) + ERRORS += test_failure(cmd, "Can't specify a different shard, must be") + + os.unlink(OTHERFILE) + + # Prep a valid export 
file for import failure tests + OTHERFILE = "/tmp/foo.{pid}".format(pid=pid) + cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE) + logging.debug(cmd) + call(cmd, shell=True, stdout=nullfd, stderr=nullfd) + + # On import can't specify a PG with a non-existent pool + cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg="10.0", file=OTHERFILE) + ERRORS += test_failure(cmd, "Can't specify a different pgid pool, must be") + + # On import can't specify shard for a replicated export + cmd = (CFSD_PREFIX + "--op import --pgid {pg}s0 --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE) + ERRORS += test_failure(cmd, "Can't specify a sharded pgid with a non-sharded export") + + # On import can't specify a PG with a bad seed + TMPPG="{pool}.80".format(pool=REPID) + cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg=TMPPG, file=OTHERFILE) + ERRORS += test_failure(cmd, "Illegal pgid, the seed is larger than current pg_num") os.unlink(OTHERFILE) cmd = (CFSD_PREFIX + "--op import --file {FOO}").format(osd=ONEOSD, FOO=OTHERFILE) - ERRORS += test_failure(cmd, "open: No such file or directory") + ERRORS += test_failure(cmd, "file: {FOO}: No such file or directory".format(FOO=OTHERFILE)) + + cmd = "./ceph-objectstore-tool --data-path BAD_DATA_PATH --journal-path " + OSDDIR + "/{osd}.journal --op list".format(osd=ONEOSD) + ERRORS += test_failure(cmd, "data-path: BAD_DATA_PATH: No such file or directory") + + cmd = "./ceph-objectstore-tool --journal-path BAD_JOURNAL_PATH --op dump-journal" + ERRORS += test_failure(cmd, "journal-path: BAD_JOURNAL_PATH: (2) No such file or directory") # On import can't use stdin from a terminal cmd = (CFSD_PREFIX + "--op import --pgid {pg}").format(osd=ONEOSD, pg=ONEPG) - ERRORS += test_failure_tty(cmd, "stdin is a tty and no --file filename specified") + ERRORS += test_failure(cmd, "stdin is a tty and no --file filename 
specified", tty=True) # On import can't use stdin from a terminal cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG) - ERRORS += test_failure_tty(cmd, "stdin is a tty and no --file filename specified") + ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True) # Specify a bad --type cmd = (CFSD_PREFIX + "--type foobar --op list --pgid {pg}").format(osd=ONEOSD, pg=ONEPG) @@ -474,18 +898,65 @@ def main(argv): cmd = "./ceph-objectstore-tool --type filestore --data-path {dir}/{osd} --op list --pgid {pg}".format(dir=OSDDIR, osd=ONEOSD, pg=ONEPG) ERRORS += test_failure(cmd, "Must provide --journal-path") - # Test --op list and generate json for all objects + cmd = (CFSD_PREFIX + "--op remove").format(osd=ONEOSD) + ERRORS += test_failure(cmd, "Must provide pgid") + + # Don't secify a --op nor object command + cmd = CFSD_PREFIX.format(osd=ONEOSD) + ERRORS += test_failure(cmd, "Must provide --op or object command...") + + # Specify a bad --op command + cmd = (CFSD_PREFIX + "--op oops").format(osd=ONEOSD) + ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)") + + # Provide just the object param not a command + cmd = (CFSD_PREFIX + "object").format(osd=ONEOSD) + ERRORS += test_failure(cmd, "Invalid syntax, missing command") + + # Provide an object name that doesn't exist + cmd = (CFSD_PREFIX + "NON_OBJECT get-bytes").format(osd=ONEOSD) + ERRORS += test_failure(cmd, "No object id 'NON_OBJECT' found") + + # Provide an invalid object command + cmd = (CFSD_PREFIX + "--pgid {pg} '' notacommand").format(osd=ONEOSD, pg=ONEPG) + ERRORS += test_failure(cmd, "Unknown object command 'notacommand'") + + cmd = (CFSD_PREFIX + "foo list-omap").format(osd=ONEOSD, pg=ONEPG) + ERRORS += test_failure(cmd, "No object id 
'foo' found or invalid JSON specified") + + cmd = (CFSD_PREFIX + "'{{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}}' list-omap").format(osd=ONEOSD, pg=ONEPG) + ERRORS += test_failure(cmd, "Without --pgid the object '{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}' must be a JSON array") + + cmd = (CFSD_PREFIX + "'[]' list-omap").format(osd=ONEOSD, pg=ONEPG) + ERRORS += test_failure(cmd, "Object '[]' must be a JSON array with 2 elements") + + cmd = (CFSD_PREFIX + "'[\"1.0\"]' list-omap").format(osd=ONEOSD, pg=ONEPG) + ERRORS += test_failure(cmd, "Object '[\"1.0\"]' must be a JSON array with 2 elements") + + cmd = (CFSD_PREFIX + "'[\"1.0\", 5, 8, 9]' list-omap").format(osd=ONEOSD, pg=ONEPG) + ERRORS += test_failure(cmd, "Object '[\"1.0\", 5, 8, 9]' must be a JSON array with 2 elements") + + cmd = (CFSD_PREFIX + "'[1, 2]' list-omap").format(osd=ONEOSD, pg=ONEPG) + ERRORS += test_failure(cmd, "Object '[1, 2]' must be a JSON array with the first element a string") + + cmd = (CFSD_PREFIX + "'[\"1.3\",{{\"snapid\":\"not an int\"}}]' list-omap").format(osd=ONEOSD, pg=ONEPG) + ERRORS += test_failure(cmd, "Decode object JSON error: value type is 2 not 4") + TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid) ALLPGS = OBJREPPGS + OBJECPGS - - print "Test --op list variants" OSDS = get_osds(ALLPGS[0], OSDDIR) osd = OSDS[0] + print "Test all --op dump-journal" + ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0] + ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS) + + # Test --op list and generate json for all objects + print "Test --op list variants" + # retrieve all objects from all PGs + tmpfd = open(TMPFILE, "w") cmd = (CFSD_PREFIX + "--op list --format json").format(osd=osd) - logging.debug(cmd); - tmpfd = open(TMPFILE, "a") logging.debug(cmd) ret = call(cmd, shell=True, stdout=tmpfd) if ret != 0: @@ 
-494,12 +965,11 @@ def main(argv): tmpfd.close() lines = get_lines(TMPFILE) JSONOBJ = sorted(set(lines)) - (pgid, jsondict) = json.loads(JSONOBJ[0])[0] + (pgid, coll, jsondict) = json.loads(JSONOBJ[0])[0] # retrieve all objects in a given PG - cmd = (CFSD_PREFIX + "--op list --pgid {pg} --format json").format(osd=osd, pg=pgid) - logging.debug(cmd); tmpfd = open(OTHERFILE, "a") + cmd = (CFSD_PREFIX + "--op list --pgid {pg} --format json").format(osd=osd, pg=pgid) logging.debug(cmd) ret = call(cmd, shell=True, stdout=tmpfd) if ret != 0: @@ -508,17 +978,16 @@ def main(argv): tmpfd.close() lines = get_lines(OTHERFILE) JSONOBJ = sorted(set(lines)) - (other_pgid, other_jsondict) = json.loads(JSONOBJ[0])[0] + (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0] - if pgid != other_pgid or jsondict != other_jsondict: + if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll: logging.error("the first line of --op list is different " "from the first line of --op list --pgid {pg}".format(pg=pgid)) ERRORS += 1 # retrieve all objects with a given name in a given PG + tmpfd = open(OTHERFILE, "w") cmd = (CFSD_PREFIX + "--op list --pgid {pg} {object} --format json").format(osd=osd, pg=pgid, object=jsondict['oid']) - logging.debug(cmd); - tmpfd = open(OTHERFILE, "a") logging.debug(cmd) ret = call(cmd, shell=True, stdout=tmpfd) if ret != 0: @@ -527,9 +996,9 @@ def main(argv): tmpfd.close() lines = get_lines(OTHERFILE) JSONOBJ = sorted(set(lines)) - (other_pgid, other_jsondict) in json.loads(JSONOBJ[0])[0] + (other_pgid, other_coll, other_jsondict) in json.loads(JSONOBJ[0])[0] - if pgid != other_pgid or jsondict != other_jsondict: + if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll: logging.error("the first line of --op list is different " "from the first line of --op list --pgid {pg} {object}".format(pg=pgid, object=jsondict['oid'])) ERRORS += 1 @@ -538,8 +1007,8 @@ def main(argv): for pg in ALLPGS: OSDS = get_osds(pg, OSDDIR) 
for osd in OSDS: - cmd = (CFSD_PREFIX + "--op list --pgid {pg}").format(osd=osd, pg=pg) tmpfd = open(TMPFILE, "a") + cmd = (CFSD_PREFIX + "--op list --pgid {pg}").format(osd=osd, pg=pg) logging.debug(cmd) ret = call(cmd, shell=True, stdout=tmpfd) if ret != 0: @@ -551,6 +1020,9 @@ def main(argv): JSONOBJ = sorted(set(lines)) for JSON in JSONOBJ: (pgid, jsondict) = json.loads(JSON) + # Skip clones for now + if jsondict['snapid'] != -2: + continue db[jsondict['namespace']][jsondict['oid']]['json'] = json.dumps((pgid, jsondict)) # print db[jsondict['namespace']][jsondict['oid']]['json'] if string.find(jsondict['oid'], EC_NAME) == 0 and 'shard_id' not in jsondict: @@ -561,7 +1033,7 @@ def main(argv): print "Test get-bytes and set-bytes" for nspace in db.keys(): for basename in db[nspace].keys(): - file = os.path.join(DATADIR, nspace + "-" + basename) + file = os.path.join(DATADIR, nspace + "-" + basename + "__head") JSON = db[nspace][basename]['json'] GETNAME = "/tmp/getbytes.{pid}".format(pid=pid) TESTNAME = "/tmp/testbytes.{pid}".format(pid=pid) @@ -629,6 +1101,7 @@ def main(argv): if ret != 0: logging.error("Bad exit status {ret} from set-bytes to restore object".format(ret=ret)) ERRORS += 1 + fd.close() try: os.unlink(GETNAME) @@ -643,6 +1116,220 @@ def main(argv): except: pass + # Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap + print "Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap" + for nspace in db.keys(): + for basename in db[nspace].keys(): + file = os.path.join(DATADIR, nspace + "-" + basename + "__head") + JSON = db[nspace][basename]['json'] + for pg in OBJREPPGS: + OSDS = get_osds(pg, OSDDIR) + for osd in OSDS: + DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg)))) + fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f)) + and f.split("_")[0] == basename and f.split("_")[4] == nspace] + if not fnames: + 
continue + for key, val in db[nspace][basename]["xattr"].iteritems(): + attrkey = "_" + key + cmd = (CFSD_PREFIX + " '{json}' get-attr {key}").format(osd=osd, json=JSON, key=attrkey) + logging.debug(cmd) + getval = check_output(cmd, shell=True) + if getval != val: + logging.error("get-attr of key {key} returned wrong val: {get} instead of {orig}".format(key=attrkey, get=getval, orig=val)) + ERRORS += 1 + continue + # set-attr to bogus value "foobar" + cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret} from set-attr".format(ret=ret)) + ERRORS += 1 + continue + # Test set-attr with dry-run + cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey) + logging.debug(cmd) + ret = call(cmd, shell=True, stdout=nullfd) + if ret != 0: + logging.error("Bad exit status {ret} from set-attr".format(ret=ret)) + ERRORS += 1 + continue + # Check the set-attr + cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey) + logging.debug(cmd) + getval = check_output(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret} from get-attr".format(ret=ret)) + ERRORS += 1 + continue + if getval != "foobar": + logging.error("Check of set-attr failed because we got {val}".format(val=getval)) + ERRORS += 1 + continue + # Test rm-attr + cmd = (CFSD_PREFIX + "'{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret} from rm-attr".format(ret=ret)) + ERRORS += 1 + continue + # Check rm-attr with dry-run + cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey) + logging.debug(cmd) + ret = call(cmd, shell=True, 
stdout=nullfd) + if ret != 0: + logging.error("Bad exit status {ret} from rm-attr".format(ret=ret)) + ERRORS += 1 + continue + cmd = (CFSD_PREFIX + "'{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey) + logging.debug(cmd) + ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd) + if ret == 0: + logging.error("For rm-attr expect get-attr to fail, but it succeeded") + ERRORS += 1 + # Put back value + cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey, val=val) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret} from set-attr".format(ret=ret)) + ERRORS += 1 + continue + + hdr = db[nspace][basename].get("omapheader", "") + cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, json=JSON) + logging.debug(cmd) + gethdr = check_output(cmd, shell=True) + if gethdr != hdr: + logging.error("get-omaphdr was wrong: {get} instead of {orig}".format(get=gethdr, orig=hdr)) + ERRORS += 1 + continue + # set-omaphdr to bogus value "foobar" + cmd = ("echo -n foobar | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret)) + ERRORS += 1 + continue + # Check the set-omaphdr + cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, pg=pg, json=JSON) + logging.debug(cmd) + gethdr = check_output(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret} from get-omaphdr".format(ret=ret)) + ERRORS += 1 + continue + if gethdr != "foobar": + logging.error("Check of set-omaphdr failed because we got {val}".format(val=getval)) + ERRORS += 1 + continue + # Test dry-run with set-omaphdr + cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON) + logging.debug(cmd) + ret = call(cmd, shell=True, 
stdout=nullfd) + if ret != 0: + logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret)) + ERRORS += 1 + continue + # Put back value + cmd = ("echo -n {val} | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON, val=hdr) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret)) + ERRORS += 1 + continue + + for omapkey, val in db[nspace][basename]["omap"].iteritems(): + cmd = (CFSD_PREFIX + " '{json}' get-omap {key}").format(osd=osd, json=JSON, key=omapkey) + logging.debug(cmd) + getval = check_output(cmd, shell=True) + if getval != val: + logging.error("get-omap of key {key} returned wrong val: {get} instead of {orig}".format(key=omapkey, get=getval, orig=val)) + ERRORS += 1 + continue + # set-omap to bogus value "foobar" + cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret} from set-omap".format(ret=ret)) + ERRORS += 1 + continue + # Check set-omap with dry-run + cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey) + logging.debug(cmd) + ret = call(cmd, shell=True, stdout=nullfd) + if ret != 0: + logging.error("Bad exit status {ret} from set-omap".format(ret=ret)) + ERRORS += 1 + continue + # Check the set-omap + cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey) + logging.debug(cmd) + getval = check_output(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret} from get-omap".format(ret=ret)) + ERRORS += 1 + continue + if getval != "foobar": + logging.error("Check of set-omap failed because we got {val}".format(val=getval)) + ERRORS += 1 + continue + # Test rm-omap + cmd = (CFSD_PREFIX + "'{json}' rm-omap 
{key}").format(osd=osd, pg=pg, json=JSON, key=omapkey) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret} from rm-omap".format(ret=ret)) + ERRORS += 1 + # Check rm-omap with dry-run + cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey) + logging.debug(cmd) + ret = call(cmd, shell=True, stdout=nullfd) + if ret != 0: + logging.error("Bad exit status {ret} from rm-omap".format(ret=ret)) + ERRORS += 1 + cmd = (CFSD_PREFIX + "'{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey) + logging.debug(cmd) + ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd) + if ret == 0: + logging.error("For rm-omap expect get-omap to fail, but it succeeded") + ERRORS += 1 + # Put back value + cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey, val=val) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret} from set-omap".format(ret=ret)) + ERRORS += 1 + continue + + # Test dump + print "Test dump" + for nspace in db.keys(): + for basename in db[nspace].keys(): + file = os.path.join(DATADIR, nspace + "-" + basename + "__head") + JSON = db[nspace][basename]['json'] + GETNAME = "/tmp/getbytes.{pid}".format(pid=pid) + for pg in OBJREPPGS: + OSDS = get_osds(pg, OSDDIR) + for osd in OSDS: + DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg)))) + fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f)) + and f.split("_")[0] == basename and f.split("_")[4] == nspace] + if not fnames: + continue + cmd = (CFSD_PREFIX + " '{json}' dump | grep '\"snap\": 1,' > /dev/null").format(osd=osd, json=JSON) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Invalid dump for {json}".format(json=JSON)) + ERRORS += 1 + print "Test list-attrs get-attr" ATTRFILE = 
r"/tmp/attrs.{pid}".format(pid=pid) VALFILE = r"/tmp/val.{pid}".format(pid=pid) @@ -728,6 +1415,48 @@ def main(argv): logging.error("Not all keys found, remaining keys:") print values + print "Test --op meta-list" + tmpfd = open(TMPFILE, "w") + cmd = (CFSD_PREFIX + "--op meta-list").format(osd=ONEOSD) + logging.debug(cmd) + ret = call(cmd, shell=True, stdout=tmpfd) + if ret != 0: + logging.error("Bad exit status {ret} from --op meta-list request".format(ret=ret)) + ERRORS += 1 + + print "Test get-bytes on meta" + tmpfd.close() + lines = get_lines(TMPFILE) + JSONOBJ = sorted(set(lines)) + for JSON in JSONOBJ: + (pgid, jsondict) = json.loads(JSON) + if pgid != "meta": + logging.error("pgid incorrect for --op meta-list {pgid}".format(pgid=pgid)) + ERRORS += 1 + if jsondict['namespace'] != "": + logging.error("namespace non null --op meta-list {ns}".format(ns=jsondict['namespace'])) + ERRORS += 1 + logging.info(JSON) + try: + os.unlink(GETNAME) + except: + pass + cmd = (CFSD_PREFIX + "'{json}' get-bytes {fname}").format(osd=ONEOSD, json=JSON, fname=GETNAME) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Bad exit status {ret}".format(ret=ret)) + ERRORS += 1 + + try: + os.unlink(GETNAME) + except: + pass + try: + os.unlink(TESTNAME) + except: + pass + print "Test pg info" for pg in ALLREPPGS + ALLECPGS: for osd in get_osds(pg, OSDDIR): @@ -776,7 +1505,7 @@ def main(argv): cmd = (CFSD_PREFIX + "--op list-pgs").format(osd=osd) logging.debug(cmd) TEST_PGS = check_output(cmd, shell=True).split("\n") - TEST_PGS = sorted(TEST_PGS)[1:] # Skip extra blank line + TEST_PGS = sorted(TEST_PGS)[1:] # Skip extra blank line if TEST_PGS != CHECK_PGS: logging.error("list-pgs got wrong result for osd.{osd}".format(osd=osd)) @@ -798,7 +1527,7 @@ def main(argv): elif pg == ALLREPPGS[1]: cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file - > {file}").format(osd=osd, pg=pg, file=fname) else: - cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file 
{file}").format(osd=osd, pg=pg, file=fname) + cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname) logging.debug(cmd) ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd) if ret != 0: @@ -831,7 +1560,7 @@ def main(argv): if pg == PGS[0]: cmd = ("cat {file} |".format(file=file) + CFSD_PREFIX + "--op import").format(osd=osd) elif pg == PGS[1]: - cmd = (CFSD_PREFIX + "--op import --file - < {file}").format(osd=osd, file=file) + cmd = (CFSD_PREFIX + "--op import --file - --pgid {pg} < {file}").format(osd=osd, file=file, pg=pg) else: cmd = (CFSD_PREFIX + "--op import --file {file}").format(osd=osd, file=file) logging.debug(cmd) @@ -847,44 +1576,30 @@ def main(argv): if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0: print "Verify replicated import data" - for nsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(REP_NAME) == 0]: - nspace = nsfile.split("-")[0] - file = nsfile.split("-")[1] - path = os.path.join(DATADIR, nsfile) - tmpfd = open(TMPFILE, "w") - cmd = "find {dir} -name '{file}_*_{nspace}_*'".format(dir=OSDDIR, file=file, nspace=nspace) - logging.debug(cmd) - ret = call(cmd, shell=True, stdout=tmpfd) - if ret: - logging.critical("INTERNAL ERROR") - return 1 - tmpfd.close() - obj_locs = get_lines(TMPFILE) - if len(obj_locs) == 0: - logging.error("Can't find imported object {name}".format(name=file)) - ERRORS += 1 - for obj_loc in obj_locs: - cmd = "diff -q {src} {obj_loc}".format(src=path, obj_loc=obj_loc) - logging.debug(cmd) - ret = call(cmd, shell=True) - if ret != 0: - logging.error("{file} data not imported properly into {obj}".format(file=file, obj=obj_loc)) - ERRORS += 1 + data_errors, _ = check_data(DATADIR, TMPFILE, OSDDIR, REP_NAME) + ERRORS += data_errors else: logging.warning("SKIPPING CHECKING IMPORT DATA DUE TO PREVIOUS FAILURES") + print "Test all --op dump-journal again" + ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, 
"osd") == 0] + ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS) + vstart(new=False) wait_for_health() if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0: print "Verify erasure coded import data" - ERRORS += verify(DATADIR, EC_POOL, EC_NAME) + ERRORS += verify(DATADIR, EC_POOL, EC_NAME, db) + # Check replicated data/xattr/omap using rados + print "Verify replicated import data using rados" + ERRORS += verify(DATADIR, REP_POOL, REP_NAME, db) if EXP_ERRORS == 0: NEWPOOL = "import-rados-pool" cmd = "./rados mkpool {pool}".format(pool=NEWPOOL) logging.debug(cmd) - ret = call(cmd, shell=True, stdout=nullfd) + ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd) print "Test import-rados" for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0]: @@ -900,13 +1615,144 @@ def main(argv): logging.error("Import-rados failed from {file} with {ret}".format(file=file, ret=ret)) ERRORS += 1 - ERRORS += verify(DATADIR, NEWPOOL, REP_NAME) + ERRORS += verify(DATADIR, NEWPOOL, REP_NAME, db) else: logging.warning("SKIPPING IMPORT-RADOS TESTS DUE TO PREVIOUS FAILURES") + # Clear directories of previous portion + call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True) + call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True) + os.mkdir(TESTDIR) + os.mkdir(DATADIR) + + # Cause SPLIT_POOL to split and test import with object/log filtering + print "Testing import all objects after a split" + SPLIT_POOL = "split_pool" + PG_COUNT = 1 + SPLIT_OBJ_COUNT = 5 + SPLIT_NSPACE_COUNT = 2 + SPLIT_NAME = "split" + cmd = "./ceph osd pool create {pool} {pg} {pg} replicated".format(pool=SPLIT_POOL, pg=PG_COUNT) + logging.debug(cmd) + call(cmd, shell=True, stdout=nullfd, stderr=nullfd) + SPLITID = get_pool_id(SPLIT_POOL, nullfd) + pool_size = int(check_output("./ceph osd pool get {pool} size".format(pool=SPLIT_POOL), shell=True, stderr=nullfd).split(" ")[1]) + EXP_ERRORS = 0 + RM_ERRORS = 0 + IMP_ERRORS = 0 + + objects = range(1, 
SPLIT_OBJ_COUNT + 1) + nspaces = range(SPLIT_NSPACE_COUNT) + for n in nspaces: + nspace = get_nspace(n) + + for i in objects: + NAME = SPLIT_NAME + "{num}".format(num=i) + LNAME = nspace + "-" + NAME + DDNAME = os.path.join(DATADIR, LNAME) + DDNAME += "__head" + + cmd = "rm -f " + DDNAME + logging.debug(cmd) + call(cmd, shell=True) + + if i == 1: + dataline = range(DATALINECOUNT) + else: + dataline = range(1) + fd = open(DDNAME, "w") + data = "This is the split data for " + LNAME + "\n" + for _ in dataline: + fd.write(data) + fd.close() + + cmd = "./rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=SPLIT_POOL, name=NAME, ddname=DDNAME, nspace=nspace) + logging.debug(cmd) + ret = call(cmd, shell=True, stderr=nullfd) + if ret != 0: + logging.critical("Rados put command failed with {ret}".format(ret=ret)) + return 1 + + wait_for_health() + kill_daemons() + + for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0]: + os.mkdir(os.path.join(TESTDIR, osd)) + + pg = "{pool}.0".format(pool=SPLITID) + EXPORT_PG = pg + + export_osds = get_osds(pg, OSDDIR) + for osd in export_osds: + mydir = os.path.join(TESTDIR, osd) + fname = os.path.join(mydir, pg) + cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname) + logging.debug(cmd) + ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd) + if ret != 0: + logging.error("Exporting failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret)) + EXP_ERRORS += 1 + + ERRORS += EXP_ERRORS + + if EXP_ERRORS == 0: + vstart(new=False) + wait_for_health() + + time.sleep(20) + + cmd = "./ceph osd pool set {pool} pg_num 2".format(pool=SPLIT_POOL) + logging.debug(cmd) + ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd) + time.sleep(5) + wait_for_health() + + time.sleep(15) + + kill_daemons() + + # Now 2 PGs, poolid.0 and poolid.1 + for seed in range(2): + pg = "{pool}.{seed}".format(pool=SPLITID, seed=seed) + 
+ which = 0 + for osd in get_osds(pg, OSDDIR): + cmd = (CFSD_PREFIX + "--op remove --pgid {pg}").format(pg=pg, osd=osd) + logging.debug(cmd) + ret = call(cmd, shell=True, stdout=nullfd) + + # This is weird. The export files are based on only the EXPORT_PG + # and where that pg was before the split. Use 'which' to use all + # export copies in import. + mydir = os.path.join(TESTDIR, export_osds[which]) + fname = os.path.join(mydir, EXPORT_PG) + which += 1 + cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname) + logging.debug(cmd) + ret = call(cmd, shell=True, stdout=nullfd) + if ret != 0: + logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret)) + IMP_ERRORS += 1 + + ERRORS += IMP_ERRORS + + # Start up again to make sure imports didn't corrupt anything + if IMP_ERRORS == 0: + print "Verify split import data" + data_errors, count = check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME) + ERRORS += data_errors + if count != (SPLIT_OBJ_COUNT * SPLIT_NSPACE_COUNT * pool_size): + logging.error("Incorrect number of replicas seen {count}".format(count=count)) + ERRORS += 1 + vstart(new=False) + wait_for_health() + call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True) call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True) + # vstart() starts 4 OSDs + ERRORS += test_get_set_osdmap(CFSD_PREFIX, range(4), ALLOSDS) + ERRORS += test_get_set_inc_osdmap(CFSD_PREFIX, ALLOSDS[0]) if ERRORS == 0: print "TEST PASSED" return 0 @@ -920,5 +1766,5 @@ if __name__ == "__main__": status = main(sys.argv[1:]) finally: kill_daemons() - call("/bin/rm -fr ceph_objectstore_tool_dir", shell=True) + call("/bin/rm -fr {dir}".format(dir=CEPH_DIR), shell=True) sys.exit(status) diff --git a/ceph/src/test/cli/radosgw-admin/help.t b/ceph/src/test/cli/radosgw-admin/help.t index a2945494..dde20f8c 100644 --- a/ceph/src/test/cli/radosgw-admin/help.t +++ b/ceph/src/test/cli/radosgw-admin/help.t @@ -126,6 +126,11 @@ --caps= list of caps 
(e.g., "usage=read, write; user=read" --yes-i-really-mean-it required for certain operations --reset-regions reset regionmap when regionmap update + --bypass-gc when specified with bucket deletion, triggers + object deletions by not involving GC + --inconsistent-index when specified with bucket deletion and bypass-gc set to true, + ignores bucket index consistency + := "YYYY-MM-DD[ hh:mm:ss]" Quota options: diff --git a/ceph/src/test/common/test_bit_vector.cc b/ceph/src/test/common/test_bit_vector.cc index c58583c2..f5b0b26d 100644 --- a/ceph/src/test/common/test_bit_vector.cc +++ b/ceph/src/test/common/test_bit_vector.cc @@ -88,21 +88,22 @@ TYPED_TEST(BitVectorTest, get_set) { TYPED_TEST(BitVectorTest, get_buffer_extents) { typename TestFixture::bit_vector_t bit_vector; - uint64_t element_count = 2 * CEPH_PAGE_SIZE + 51; + uint64_t element_count = 2 * bit_vector.BLOCK_SIZE + 51; uint64_t elements_per_byte = 8 / bit_vector.BIT_COUNT; bit_vector.resize(element_count * elements_per_byte); - uint64_t offset = (CEPH_PAGE_SIZE + 11) * elements_per_byte; - uint64_t length = (CEPH_PAGE_SIZE + 31) * elements_per_byte; + uint64_t offset = (bit_vector.BLOCK_SIZE + 11) * elements_per_byte; + uint64_t length = (bit_vector.BLOCK_SIZE + 31) * elements_per_byte; uint64_t byte_offset; uint64_t byte_length; bit_vector.get_data_extents(offset, length, &byte_offset, &byte_length); - ASSERT_EQ(CEPH_PAGE_SIZE, byte_offset); - ASSERT_EQ(CEPH_PAGE_SIZE + (element_count % CEPH_PAGE_SIZE), byte_length); + ASSERT_EQ(bit_vector.BLOCK_SIZE, byte_offset); + ASSERT_EQ(bit_vector.BLOCK_SIZE + (element_count % bit_vector.BLOCK_SIZE), + byte_length); bit_vector.get_data_extents(1, 1, &byte_offset, &byte_length); ASSERT_EQ(0U, byte_offset); - ASSERT_EQ(CEPH_PAGE_SIZE, byte_length); + ASSERT_EQ(bit_vector.BLOCK_SIZE, byte_length); } TYPED_TEST(BitVectorTest, get_header_length) { @@ -155,11 +156,11 @@ TYPED_TEST(BitVectorTest, partial_decode_encode) { Extents extents = boost::assign::list_of( 
std::make_pair(0, 1))( - std::make_pair((CEPH_PAGE_SIZE * elements_per_byte) - 2, 4))( - std::make_pair((CEPH_PAGE_SIZE * elements_per_byte) + 2, 2))( - std::make_pair((2 * CEPH_PAGE_SIZE * elements_per_byte) - 2, 4))( - std::make_pair((2 * CEPH_PAGE_SIZE * elements_per_byte) + 2, 2))( - std::make_pair(2, 2 * CEPH_PAGE_SIZE)); + std::make_pair((bit_vector.BLOCK_SIZE * elements_per_byte) - 2, 4))( + std::make_pair((bit_vector.BLOCK_SIZE * elements_per_byte) + 2, 2))( + std::make_pair((2 * bit_vector.BLOCK_SIZE * elements_per_byte) - 2, 4))( + std::make_pair((2 * bit_vector.BLOCK_SIZE * elements_per_byte) + 2, 2))( + std::make_pair(2, 2 * bit_vector.BLOCK_SIZE)); for (Extents::iterator it = extents.begin(); it != extents.end(); ++it) { uint64_t element_offset = it->first; uint64_t element_length = it->second; @@ -224,8 +225,8 @@ TYPED_TEST(BitVectorTest, data_crc) { typename TestFixture::bit_vector_t bit_vector2; uint64_t elements_per_byte = 8 / bit_vector1.BIT_COUNT; - bit_vector1.resize((CEPH_PAGE_SIZE + 1) * elements_per_byte); - bit_vector2.resize((CEPH_PAGE_SIZE + 1) * elements_per_byte); + bit_vector1.resize((bit_vector1.BLOCK_SIZE + 1) * elements_per_byte); + bit_vector2.resize((bit_vector2.BLOCK_SIZE + 1) * elements_per_byte); uint64_t byte_offset; uint64_t byte_length; @@ -236,7 +237,7 @@ TYPED_TEST(BitVectorTest, data_crc) { bit_vector1.encode_data(data, byte_offset, byte_length); bufferlist::iterator data_it = data.begin(); - bit_vector1.decode_data(data_it, byte_offset); + bit_vector1.decode_data(data_it, byte_offset); bit_vector2[bit_vector2.size() - 1] = 1; diff --git a/ceph/src/test/crypto.cc b/ceph/src/test/crypto.cc index 24d5c5a4..17e90d04 100644 --- a/ceph/src/test/crypto.cc +++ b/ceph/src/test/crypto.cc @@ -3,7 +3,10 @@ #include "include/types.h" #include "auth/Crypto.h" +#include "common/Clock.h" #include "common/ceph_crypto.h" +#include "common/ceph_context.h" +#include "global/global_context.h" #include "test/unit.h" @@ -52,7 +55,9 @@ TEST(AES, 
Encrypt) { bufferlist cipher; std::string error; - h->encrypt(secret, plaintext, cipher, error); + CryptoKeyHandler *kh = h->get_key_handler(secret, error); + int r = kh->encrypt(plaintext, cipher, &error); + ASSERT_EQ(r, 0); ASSERT_EQ(error, ""); unsigned char want_cipher[] = { @@ -96,7 +101,9 @@ TEST(AES, Decrypt) { std::string error; bufferlist plaintext; - h->decrypt(secret, cipher, plaintext, error); + CryptoKeyHandler *kh = h->get_key_handler(secret, error); + int r = kh->decrypt(cipher, plaintext, &error); + ASSERT_EQ(r, 0); ASSERT_EQ(error, ""); ASSERT_EQ(sizeof(plaintext_s), plaintext.length()); @@ -128,7 +135,9 @@ TEST(AES, Loop) { CryptoHandler *h = g_ceph_context->get_crypto_handler(CEPH_CRYPTO_AES); std::string error; - h->encrypt(secret, plaintext, cipher, error); + CryptoKeyHandler *kh = h->get_key_handler(secret, error); + int r = kh->encrypt(plaintext, cipher, &error); + ASSERT_EQ(r, 0); ASSERT_EQ(error, ""); } plaintext.clear(); @@ -136,7 +145,9 @@ TEST(AES, Loop) { { CryptoHandler *h = g_ceph_context->get_crypto_handler(CEPH_CRYPTO_AES); std::string error; - h->decrypt(secret, cipher, plaintext, error); + CryptoKeyHandler *ckh = h->get_key_handler(secret, error); + int r = ckh->decrypt(cipher, plaintext, &error); + ASSERT_EQ(r, 0); ASSERT_EQ(error, ""); } } @@ -146,3 +157,28 @@ TEST(AES, Loop) { err = memcmp(plaintext_s, orig_plaintext_s, sizeof(orig_plaintext_s)); ASSERT_EQ(0, err); } + +TEST(AES, LoopKey) { + bufferptr k(16); + get_random_bytes(k.c_str(), k.length()); + CryptoKey key(CEPH_CRYPTO_AES, ceph_clock_now(NULL), k); + + bufferlist data; + bufferptr r(128); + get_random_bytes(r.c_str(), r.length()); + data.append(r); + + utime_t start = ceph_clock_now(NULL); + int n = 100000; + + for (int i=0; i_conf->set_val("num_client", "-1"); - ASSERT_EQ(ret, -EINVAL); - } - { - int ret = g_ceph_context->_conf->set_val("num_client", "-1K"); - ASSERT_EQ(ret, -EINVAL); - } { long long bad_value = (long long)std::numeric_limits::max() + 1; string str 
= boost::lexical_cast(bad_value); diff --git a/ceph/src/test/librados/cmd.cc b/ceph/src/test/librados/cmd.cc index 4f327a0e..0a7ed16a 100644 --- a/ceph/src/test/librados/cmd.cc +++ b/ceph/src/test/librados/cmd.cc @@ -49,6 +49,41 @@ TEST(LibRadosCmd, MonDescribe) { rados_buffer_free(buf); rados_buffer_free(st); + cmd[0] = (char *)""; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "{}", 2, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{\"abc\":\"something\"}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{\"prefix\":\"\"}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{\"prefix\":\" \"}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{\"prefix\":\";;;,,,;;,,\"}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{\"prefix\":\"extra command\"}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + cmd[0] = (char *)"{\"prefix\":\"mon_status\"}"; ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); ASSERT_LT(0u, buflen); diff --git a/ceph/src/test/librados/test.cc b/ceph/src/test/librados/test.cc index fd92fc23..b93b7543 100644 
--- a/ceph/src/test/librados/test.cc +++ b/ceph/src/test/librados/test.cc @@ -44,12 +44,41 @@ std::string create_one_pool(const std::string &pool_name, rados_t *cluster) return ""; } -int destroy_ec_profile(rados_t *cluster) +int destroy_ec_profile(rados_t *cluster, std::ostream &oss) { - char *cmd[2]; - cmd[0] = (char *)"{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}"; - cmd[1] = NULL; - return rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0); + char *cmd[2]; + cmd[0] = (char *)"{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}"; + cmd[1] = NULL; + int ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0); + if (ret) + oss << "rados_mon_command: erasure-code-profile rm testprofile failed with error " << ret; + return ret; +} + +int destroy_ruleset(rados_t *cluster, + std::string ruleset, + std::ostream &oss) +{ + char *cmd[2]; + std::string tmp = ("{\"prefix\": \"osd crush rule rm\", \"name\":\"" + + ruleset + "\"}"); + cmd[0] = (char*)tmp.c_str(); + cmd[1] = NULL; + int ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0); + if (ret) + oss << "rados_mon_command: osd crush rule rm " + ruleset + " failed with error " << ret; + return ret; +} + +int destroy_ec_profile_and_ruleset(rados_t *cluster, + std::string ruleset, + std::ostream &oss) +{ + int ret; + ret = destroy_ec_profile(cluster, oss); + if (ret) + return ret; + return destroy_ruleset(cluster, ruleset, oss); } std::string create_one_ec_pool(const std::string &pool_name, rados_t *cluster) @@ -58,11 +87,10 @@ std::string create_one_ec_pool(const std::string &pool_name, rados_t *cluster) if (err.length()) return err; - int ret = destroy_ec_profile(cluster); + std::ostringstream oss; + int ret = destroy_ec_profile_and_ruleset(cluster, pool_name, oss); if (ret) { rados_shutdown(*cluster); - std::ostringstream oss; - oss << "rados_mon_command erasure-code-profile rm testprofile 
failed with error " << ret; return oss.str(); } @@ -73,8 +101,6 @@ std::string create_one_ec_pool(const std::string &pool_name, rados_t *cluster) cmd[0] = (char *)profile_create.c_str(); ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0); if (ret) { - std::ostringstream oss; - rados_shutdown(*cluster); oss << "rados_mon_command erasure-code-profile set name:testprofile failed with error " << ret; return oss.str(); @@ -85,12 +111,7 @@ std::string create_one_ec_pool(const std::string &pool_name, rados_t *cluster) cmd[0] = (char *)cmdstr.c_str(); ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0); if (ret) { - std::ostringstream oss; - - int ret2 = destroy_ec_profile(cluster); - if (ret2) - oss << "rados_mon_command osd erasure-code-profile rm name:testprofile failed with error " << ret2 << std::endl; - + destroy_ec_profile(cluster, oss); rados_shutdown(*cluster); oss << "rados_mon_command osd pool create failed with error " << ret; return oss.str(); @@ -115,11 +136,37 @@ std::string create_one_pool_pp(const std::string &pool_name, Rados &cluster) return ""; } -int destroy_ec_profile_pp(Rados &cluster) +int destroy_ruleset_pp(Rados &cluster, + std::string ruleset, + std::ostream &oss) +{ + bufferlist inbl; + int ret = cluster.mon_command("{\"prefix\": \"osd crush rule rm\", \"name\":\"" + + ruleset + "\"}", inbl, NULL, NULL); + if (ret) + oss << "mon_command: osd crush rule rm " + ruleset + " failed with error " << ret << std::endl; + return ret; +} + +int destroy_ec_profile_pp(Rados &cluster, std::ostream &oss) { bufferlist inbl; - return cluster.mon_command("{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}", - inbl, NULL, NULL); + int ret = cluster.mon_command("{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}", + inbl, NULL, NULL); + if (ret) + oss << "mon_command: osd erasure-code-profile rm testprofile failed with error " << ret << std::endl; + return ret; 
+} + +int destroy_ec_profile_and_ruleset_pp(Rados &cluster, + std::string ruleset, + std::ostream &oss) +{ + int ret; + ret = destroy_ec_profile_pp(cluster, oss); + if (ret) + return ret; + return destroy_ruleset_pp(cluster, ruleset, oss); } std::string create_one_ec_pool_pp(const std::string &pool_name, Rados &cluster) @@ -128,11 +175,10 @@ std::string create_one_ec_pool_pp(const std::string &pool_name, Rados &cluster) if (err.length()) return err; - int ret = destroy_ec_profile_pp(cluster); + std::ostringstream oss; + int ret = destroy_ec_profile_and_ruleset_pp(cluster, pool_name, oss); if (ret) { cluster.shutdown(); - std::ostringstream oss; - oss << "rados_mon_command erasure-code-profile rm testprofile failed with error " << ret; return oss.str(); } @@ -142,7 +188,6 @@ std::string create_one_ec_pool_pp(const std::string &pool_name, Rados &cluster) inbl, NULL, NULL); if (ret) { cluster.shutdown(); - std::ostringstream oss; oss << "mon_command erasure-code-profile set name:testprofile failed with error " << ret; return oss.str(); } @@ -151,12 +196,8 @@ std::string create_one_ec_pool_pp(const std::string &pool_name, Rados &cluster) "{\"prefix\": \"osd pool create\", \"pool\": \"" + pool_name + "\", \"pool_type\":\"erasure\", \"pg_num\":8, \"pgp_num\":8, \"erasure_code_profile\":\"testprofile\"}", inbl, NULL, NULL); if (ret) { - std::ostringstream oss; bufferlist inbl; - int ret2 = destroy_ec_profile_pp(cluster); - if (ret2) - oss << "mon_command osd erasure-code-profile rm name:testprofile failed with error " << ret2 << std::endl; - + destroy_ec_profile_pp(cluster, oss); cluster.shutdown(); oss << "mon_command osd pool create pool:" << pool_name << " pool_type:erasure failed with error " << ret; return oss.str(); @@ -240,14 +281,19 @@ int destroy_one_pool(const std::string &pool_name, rados_t *cluster) int destroy_one_ec_pool(const std::string &pool_name, rados_t *cluster) { int ret = rados_pool_delete(*cluster, pool_name.c_str()); - if (ret == 0) { - int ret2 = 
destroy_ec_profile(cluster); - if (ret2) { - rados_shutdown(*cluster); - return ret2; - } - rados_wait_for_latest_osdmap(*cluster); + if (ret) { + rados_shutdown(*cluster); + return ret; + } + + std::ostringstream oss; + ret = destroy_ec_profile_and_ruleset(cluster, pool_name, oss); + if (ret) { + rados_shutdown(*cluster); + return ret; } + + rados_wait_for_latest_osdmap(*cluster); rados_shutdown(*cluster); return ret; } @@ -266,15 +312,19 @@ int destroy_one_pool_pp(const std::string &pool_name, Rados &cluster) int destroy_one_ec_pool_pp(const std::string &pool_name, Rados &cluster) { int ret = cluster.pool_delete(pool_name.c_str()); - bufferlist inbl; - if (ret == 0) { - int ret2 = destroy_ec_profile_pp(cluster); - if (ret2) { - cluster.shutdown(); - return ret2; - } - cluster.wait_for_latest_osdmap(); + if (ret) { + cluster.shutdown(); + return ret; } + + std::ostringstream oss; + ret = destroy_ec_profile_and_ruleset_pp(cluster, pool_name, oss); + if (ret) { + cluster.shutdown(); + return ret; + } + + cluster.wait_for_latest_osdmap(); cluster.shutdown(); return ret; } diff --git a/ceph/src/test/librados/test.h b/ceph/src/test/librados/test.h index cd1f9817..0bb5355e 100644 --- a/ceph/src/test/librados/test.h +++ b/ceph/src/test/librados/test.h @@ -44,7 +44,7 @@ class TestAlarm { public: TestAlarm() { - alarm(360); + alarm(1200); } ~TestAlarm() { alarm(0); diff --git a/ceph/src/test/librbd/test_librbd.cc b/ceph/src/test/librbd/test_librbd.cc index 59981ea0..1e6f7a90 100644 --- a/ceph/src/test/librbd/test_librbd.cc +++ b/ceph/src/test/librbd/test_librbd.cc @@ -2486,6 +2486,52 @@ TEST_F(TestLibRBD, TestPendingAio) rados_ioctx_destroy(ioctx); } +TEST_F(TestLibRBD, Flatten) +{ + REQUIRE_FEATURE(RBD_FEATURE_LAYERING); + + librados::IoCtx ioctx; + ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx)); + + librbd::RBD rbd; + std::string parent_name = get_temp_image_name(); + uint64_t size = 2 << 20; + int order = 0; + ASSERT_EQ(0, create_image_pp(rbd, ioctx, 
parent_name.c_str(), size, &order)); + + librbd::Image parent_image; + ASSERT_EQ(0, rbd.open(ioctx, parent_image, parent_name.c_str(), NULL)); + + bufferlist bl; + bl.append(std::string(4096, '1')); + ASSERT_EQ(bl.length(), parent_image.write(0, bl.length(), bl)); + + ASSERT_EQ(0, parent_image.snap_create("snap1")); + ASSERT_EQ(0, parent_image.snap_protect("snap1")); + + uint64_t features; + ASSERT_EQ(0, parent_image.features(&features)); + + std::string clone_name = get_temp_image_name(); + EXPECT_EQ(0, rbd.clone(ioctx, parent_name.c_str(), "snap1", ioctx, + clone_name.c_str(), features, &order)); + + librbd::Image clone_image; + ASSERT_EQ(0, rbd.open(ioctx, clone_image, clone_name.c_str(), NULL)); + ASSERT_EQ(0, clone_image.flatten()); + + librbd::RBD::AioCompletion *read_comp = + new librbd::RBD::AioCompletion(NULL, NULL); + bufferlist read_bl; + clone_image.aio_read(0, bl.length(), read_bl, read_comp); + ASSERT_EQ(0, read_comp->wait_for_complete()); + ASSERT_EQ(bl.length(), read_comp->get_return_value()); + read_comp->release(); + ASSERT_TRUE(bl.contents_equal(read_bl)); + + ASSERT_PASSED(validate_object_map, clone_image); +} + TEST_F(TestLibRBD, SnapCreateViaLockOwner) { REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_EXCLUSIVE_LOCK); @@ -2865,3 +2911,48 @@ TEST_F(TestLibRBD, FlushEmptyOpsOnExternalSnapshot) { ASSERT_EQ(0, read_comp->wait_for_complete()); read_comp->release(); } + +TEST_F(TestLibRBD, FlushCacheWithCopyupOnExternalSnapshot) { + REQUIRE_FEATURE(RBD_FEATURE_LAYERING); + + librados::IoCtx ioctx; + ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx)); + + librbd::RBD rbd; + librbd::Image image; + std::string name = get_temp_image_name(); + + uint64_t size = 1 << 18; + int order = 0; + + ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order)); + ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL)); + + bufferlist bl; + bl.append(std::string(size, '1')); + ASSERT_EQ((int)size, image.write(0, size, bl)); + ASSERT_EQ(0, 
image.snap_create("one")); + ASSERT_EQ(0, image.snap_protect("one")); + + std::string clone_name = this->get_temp_image_name(); + ASSERT_EQ(0, rbd.clone(ioctx, name.c_str(), "one", ioctx, clone_name.c_str(), + RBD_FEATURE_LAYERING, &order)); + ASSERT_EQ(0, rbd.open(ioctx, image, clone_name.c_str(), NULL)); + + librbd::Image image2; + ASSERT_EQ(0, rbd.open(ioctx, image2, clone_name.c_str(), NULL)); + + // prepare CoW writeback that will be flushed on next op + bl.clear(); + bl.append(std::string(1, '1')); + ASSERT_EQ(0, image.flush()); + ASSERT_EQ(1, image.write(0, 1, bl)); + ASSERT_EQ(0, image2.snap_create("snap1")); + + librbd::RBD::AioCompletion *read_comp = + new librbd::RBD::AioCompletion(NULL, NULL); + bufferlist read_bl; + image.aio_read(0, 1024, read_bl, read_comp); + ASSERT_EQ(0, read_comp->wait_for_complete()); + read_comp->release(); +} diff --git a/ceph/src/test/mon/misc.sh b/ceph/src/test/mon/misc.sh index 0351bd48..5ab9d4ef 100755 --- a/ceph/src/test/mon/misc.sh +++ b/ceph/src/test/mon/misc.sh @@ -19,28 +19,28 @@ source test/mon/mon-test-helpers.sh function run() { local dir=$1 + shift export CEPH_MON="127.0.0.1:7102" export CEPH_ARGS CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " CEPH_ARGS+="--mon-host=$CEPH_MON " - setup $dir || return 1 - run_mon $dir a --public-addr $CEPH_MON - FUNCTIONS=${FUNCTIONS:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} - for TEST_function in $FUNCTIONS ; do - if ! 
$TEST_function $dir ; then - cat $dir/a/log - return 1 - fi + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + $func $dir || return 1 done - teardown $dir || return 1 } TEST_POOL=rbd function TEST_osd_pool_get_set() { - local dir=$1 flag + local dir=$1 + + setup $dir || return 1 + run_mon $dir a || return 1 + + local flag for flag in hashpspool nodelete nopgchange nosizechange; do if [ $flag = hashpspool ]; then ./ceph osd dump | grep 'pool 0' | grep $flag || return 1 @@ -82,9 +82,32 @@ function TEST_osd_pool_get_set() { ! ./ceph osd pool set $ecpool min_size $(expr $k - 1) || return 1 ! ./ceph osd pool set $ecpool min_size $(expr $size + 1) || return 1 + teardown $dir || return 1 + +} + +function TEST_no_segfault_for_bad_keyring() { + local dir=$1 + setup $dir || return 1 + # create a client.admin key and add it to ceph.mon.keyring + ceph-authtool --create-keyring $dir/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *' + ceph-authtool --create-keyring $dir/ceph.client.admin.keyring --gen-key -n client.admin --cap mon 'allow *' + ceph-authtool $dir/ceph.mon.keyring --import-keyring $dir/ceph.client.admin.keyring + CEPH_ARGS_TMP="--fsid=$(uuidgen) --mon-host=127.0.0.1:7102 --auth-supported=cephx " + CEPH_ARGS_orig=$CEPH_ARGS + CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/ceph.mon.keyring " + run_mon $dir a + # create a bad keyring and make sure no segfault occurs when using the bad keyring + echo -e "[client.admin]\nkey = BQAUlgtWoFePIxAAQ9YLzJSVgJX5V1lh5gyctg==" > $dir/bad.keyring + CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/bad.keyring" + ceph osd dump 2> /dev/null + # 139(11|128) means segfault and core dumped + [ $? -eq 139 ] && return 1 + CEPH_ARGS=$CEPH_ARGS_orig + teardown $dir || return 1 } -main misc +main misc "$@" # Local Variables: # compile-command: "cd ../.. 
; make -j4 && test/mon/misc.sh" diff --git a/ceph/src/test/objectstore/ObjectStoreTransactionBenchmark.cc b/ceph/src/test/objectstore/ObjectStoreTransactionBenchmark.cc index a82efb7e..55ffcf2b 100644 --- a/ceph/src/test/objectstore/ObjectStoreTransactionBenchmark.cc +++ b/ceph/src/test/objectstore/ObjectStoreTransactionBenchmark.cc @@ -252,6 +252,7 @@ int main(int argc, char **argv) global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0); common_init_finish(g_ceph_context); g_ceph_context->_conf->apply_changes(NULL); + Cycles::init(); cerr << "args: " << args << std::endl; if (args.size() < 1) { diff --git a/ceph/src/test/objectstore/store_test.cc b/ceph/src/test/objectstore/store_test.cc index d3ce80ce..d4444443 100644 --- a/ceph/src/test/objectstore/store_test.cc +++ b/ceph/src/test/objectstore/store_test.cc @@ -342,6 +342,66 @@ TEST_P(StoreTest, SimpleObjectLongnameTest) { } } +ghobject_t generate_long_name(unsigned i) +{ + stringstream name; + name << "object id " << i << " "; + for (unsigned j = 0; j < 500; ++j) name << 'a'; + ghobject_t hoid(hobject_t(sobject_t(name.str(), CEPH_NOSNAP))); + hoid.hobj.set_hash(i % 2); + return hoid; +} + +TEST_P(StoreTest, LongnameSplitTest) { + ObjectStore::Sequencer osr("test"); + int r; + coll_t cid; + { + ObjectStore::Transaction t; + t.create_collection(cid); + cerr << "Creating collection " << cid << std::endl; + r = store->apply_transaction(&osr, t); + ASSERT_EQ(r, 0); + } + for (unsigned i = 0; i < 320; ++i) { + ObjectStore::Transaction t; + ghobject_t hoid = generate_long_name(i); + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = store->apply_transaction(&osr, t); + } + + ghobject_t test_obj = generate_long_name(319); + ghobject_t test_obj_2 = test_obj; + test_obj_2.generation = 0; + test_obj_2.shard_id = shard_id_t(0); + { + ObjectStore::Transaction t; + // should cause a split + t.collection_move_rename( + cid, test_obj, + cid, test_obj_2); + r = 
store->apply_transaction(&osr, t); + } + + for (unsigned i = 0; i < 319; ++i) { + ObjectStore::Transaction t; + ghobject_t hoid = generate_long_name(i); + t.remove(cid, hoid); + cerr << "Removing object " << hoid << std::endl; + r = store->apply_transaction(&osr, t); + } + { + ObjectStore::Transaction t; + t.remove(cid, test_obj_2); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = store->apply_transaction(&osr, t); + ASSERT_EQ(r, 0); + } + +} + TEST_P(StoreTest, ManyObjectTest) { int NUM_OBJS = 2000; int r = 0; @@ -472,6 +532,12 @@ public: name = "DIR_" + name; } + if (seq % 2) { + for (unsigned i = 0; i < 300; ++i) { + name.push_back('a'); + } + } + // hash //boost::binomial_distribution bin(0xFFFFFF, 0.5); ++seq; @@ -525,6 +591,38 @@ public: } }; + class C_SyntheticOnStash : public Context { + public: + SyntheticWorkloadState *state; + ObjectStore::Transaction *t; + ghobject_t oid, noid; + + C_SyntheticOnStash(SyntheticWorkloadState *state, + ObjectStore::Transaction *t, ghobject_t oid, + ghobject_t noid) + : state(state), t(t), oid(oid), noid(noid) {} + + void finish(int r) { + Mutex::Locker locker(state->lock); + ASSERT_TRUE(state->in_flight_objects.count(oid)); + ASSERT_EQ(r, 0); + state->in_flight_objects.erase(oid); + if (state->contents.count(noid)) + state->available_objects.insert(noid); + --(state->in_flight); + bufferlist r2; + r = state->store->read( + state->cid, noid, 0, + state->contents[noid].data.length(), r2); + if (!state->contents[noid].data.contents_equal(r2)) { + assert(0 == " mismatch after clone"); + ASSERT_TRUE(state->contents[noid].data.contents_equal(r2)); + } + state->cond.Signal(); + delete t; + } + }; + class C_SyntheticOnClone : public Context { public: SyntheticWorkloadState *state; @@ -628,6 +726,42 @@ public: return store->queue_transaction(osr, t, new C_SyntheticOnReadable(this, t, new_obj)); } + int stash() { + Mutex::Locker locker(lock); + if (!can_unlink()) + return -ENOENT; + if (!can_create()) + return 
-ENOSPC; + wait_for_ready(); + + ghobject_t old_obj; + int max = 20; + do { + old_obj = get_uniform_random_object(); + } while (--max && !contents[old_obj].data.length()); + available_objects.erase(old_obj); + ghobject_t new_obj = old_obj; + new_obj.generation++; + new_obj.shard_id = shard_id_t(0); + available_objects.erase(new_obj); + + ObjectStore::Transaction *t = new ObjectStore::Transaction; + t->collection_move_rename(cid, old_obj, cid, new_obj); + ++in_flight; + in_flight_objects.insert(old_obj); + + // *copy* the data buffer, since we may modify it later. + contents[new_obj].attrs = contents[old_obj].attrs; + contents[new_obj].data.clear(); + contents[new_obj].data.append(contents[old_obj].data.c_str(), + contents[old_obj].data.length()); + contents.erase(old_obj); + int status = store->queue_transaction( + osr, t, + new C_SyntheticOnStash(this, t, old_obj, new_obj)); + return status; + } + int clone() { Mutex::Locker locker(lock); if (!can_unlink()) @@ -998,6 +1132,8 @@ TEST_P(StoreTest, Synthetic) { test_obj.write(); } else if (val > 50) { test_obj.clone(); + } else if (val > 30) { + test_obj.stash(); } else if (val > 10) { test_obj.read(); } else { @@ -1036,6 +1172,8 @@ TEST_P(StoreTest, AttrSynthetic) { test_obj.setattrs(); } else if (val > 45) { test_obj.clone(); + } else if (val > 37) { + test_obj.stash(); } else if (val > 30) { test_obj.getattrs(); } else { diff --git a/ceph/src/test/osd/TestPGLog.cc b/ceph/src/test/osd/TestPGLog.cc index cc9733ac..5962b361 100644 --- a/ceph/src/test/osd/TestPGLog.cc +++ b/ceph/src/test/osd/TestPGLog.cc @@ -446,6 +446,40 @@ TEST_F(PGLogTest, rewind_divergent_log) { EXPECT_TRUE(dirty_info); EXPECT_TRUE(dirty_big_info); } + + // Test for 13965 + { + clear(); + + ObjectStore::Transaction t; + list remove_snap; + pg_info_t info; + info.log_tail = log.tail = eversion_t(1, 5); + info.last_update = eversion_t(1, 6); + bool dirty_info = false; + bool dirty_big_info = false; + + { + pg_log_entry_t e; + 
e.mod_desc.mark_unrollbackable(); + e.version = eversion_t(1, 5); + e.soid.set_hash(0x9); + add(e); + } + { + pg_log_entry_t e; + e.mod_desc.mark_unrollbackable(); + e.version = eversion_t(1, 6); + e.soid.set_hash(0x10); + add(e); + } + TestHandler h(remove_snap); + trim_rollback_info(eversion_t(1, 6), &h); + rewind_divergent_log(t, eversion_t(1, 5), info, &h, + dirty_info, dirty_big_info); + pg_log_t log; + claim_log_and_clear_rollback_info(log, &h); + } } TEST_F(PGLogTest, merge_old_entry) { diff --git a/ceph/src/test/osd/TestRados.cc b/ceph/src/test/osd/TestRados.cc index e8eb0db9..dd03d5f5 100644 --- a/ceph/src/test/osd/TestRados.cc +++ b/ceph/src/test/osd/TestRados.cc @@ -52,6 +52,13 @@ public: if (m_op <= m_objects) { stringstream oid; oid << m_op; + if (m_op % 2) { + // make it a long name + oid << " "; + for (unsigned i = 0; i < 300; ++i) { + oid << i; + } + } cout << m_op << ": write initial oid " << oid.str() << std::endl; context.oid_not_flushing.insert(oid.str()); if (m_ec_pool) { diff --git a/ceph/src/test/osd/osd-scrub-repair.sh b/ceph/src/test/osd/osd-scrub-repair.sh index 90c51c04..bd1dea5c 100755 --- a/ceph/src/test/osd/osd-scrub-repair.sh +++ b/ceph/src/test/osd/osd-scrub-repair.sh @@ -130,6 +130,52 @@ function TEST_corrupt_and_repair_erasure_coded() { teardown $dir || return 1 } +function TEST_unfound_erasure_coded() { + local dir=$1 + local poolname=ecpool + local payload=ABCDEF + + setup $dir || return 1 + run_mon $dir a || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + run_osd $dir 2 || return 1 + run_osd $dir 3 || return 1 + wait_for_clean || return 1 + + ceph osd erasure-code-profile set myprofile \ + k=2 m=2 ruleset-failure-domain=osd || return 1 + ceph osd pool create $poolname 1 1 erasure myprofile \ + || return 1 + + add_something $dir $poolname + + local primary=$(get_primary $poolname SOMETHING) + local -a osds=($(get_osds $poolname SOMETHING | sed -e "s/$primary//")) + local not_primary_first=${osds[0]} + local 
not_primary_second=${osds[1]} + local not_primary_third=${osds[2]} + + # + # 1) remove the corresponding file from the OSDs + # + objectstore_tool $dir $not_primary_first SOMETHING remove || return 1 + objectstore_tool $dir $not_primary_second SOMETHING remove || return 1 + objectstore_tool $dir $not_primary_third SOMETHING remove || return 1 + # + # 2) repair the PG + # + local pg=$(get_pg $poolname SOMETHING) + repair $pg + # + # 3) check pg state + # + ceph -s|grep "4 osds: 4 up, 4 in" || return 1 + ceph -s|grep "1/1 unfound" || return 1 + + teardown $dir || return 1 +} + function corrupt_and_repair_two() { local dir=$1 local poolname=$2 diff --git a/ceph/src/test/osd/osd-scrub-snaps.sh b/ceph/src/test/osd/osd-scrub-snaps.sh new file mode 100755 index 00000000..ed8e0af2 --- /dev/null +++ b/ceph/src/test/osd/osd-scrub-snaps.sh @@ -0,0 +1,227 @@ +#! /bin/bash +# +# Copyright (C) 2015 Red Hat +# +# Author: David Zafman +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
+# +source test/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7121" # git grep '\<7121\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + $func $dir || return 1 + done +} + +function TEST_scrub_snaps() { + local dir=$1 + local poolname=test + + TESTDATA="testdata.$$" + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_osd $dir 0 || return 1 + + wait_for_clean || return 1 + + # Create a pool with a single pg + ceph osd pool create $poolname 1 1 + poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }') + + dd if=/dev/urandom of=$TESTDATA bs=1032 count=1 + for i in `seq 1 14` + do + rados -p $poolname put obj${i} $TESTDATA + done + + SNAP=1 + rados -p $poolname mksnap snap${SNAP} + dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP} + rados -p $poolname put obj1 $TESTDATA + rados -p $poolname put obj5 $TESTDATA + rados -p $poolname put obj3 $TESTDATA + for i in `seq 6 14` + do rados -p $poolname put obj${i} $TESTDATA + done + + SNAP=2 + rados -p $poolname mksnap snap${SNAP} + dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP} + rados -p $poolname put obj5 $TESTDATA + + SNAP=3 + rados -p $poolname mksnap snap${SNAP} + dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP} + rados -p $poolname put obj3 $TESTDATA + + SNAP=4 + rados -p $poolname mksnap snap${SNAP} + dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP} + rados -p $poolname put obj5 $TESTDATA + rados -p $poolname put obj2 $TESTDATA + + SNAP=5 + rados -p $poolname mksnap snap${SNAP} + SNAP=6 + rados -p $poolname mksnap snap${SNAP} + dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP} + rados -p $poolname put obj5 $TESTDATA + + SNAP=7 + rados -p $poolname mksnap snap${SNAP} + + rados -p $poolname rm obj4 + rados -p $poolname 
rm obj2 + + kill_daemons $dir KILL osd || return 1 + sleep 5 + + # Don't need to ceph_objectstore_tool function because osd stopped + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj1 | grep \"snapid\":-2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":1)" + OBJ5SAVE="$JSON" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":4)" + dd if=/dev/urandom of=$TESTDATA bs=256 count=18 + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj3 | grep \"snapid\":-2)" + dd if=/dev/urandom of=$TESTDATA bs=256 count=15 + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj4 | grep \"snapid\":7)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj2 | grep \"snapid\":-1)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" rm-attr snapset + + # Create a clone which isn't in snapset and doesn't have object info + JSON="$(echo "$OBJ5SAVE" | sed s/snapid\":1/snapid\":7/)" + dd if=/dev/urandom of=$TESTDATA bs=256 count=7 + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal 
"$JSON" set-bytes $TESTDATA + + rm -f $TESTDATA + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj6 | grep \"snapid\":-2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj7 | grep \"snapid\":-2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset corrupt + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj8 | grep \"snapid\":-2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset seq + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj9 | grep \"snapid\":-2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_size + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj10 | grep \"snapid\":-2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_overlap + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj11 | grep \"snapid\":-2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clones + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj12 | grep \"snapid\":-2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset head + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj13 | grep \"snapid\":-2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset snaps + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj14 | grep \"snapid\":-2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path 
$dir/0/journal "$JSON" clear-snapset size + + run_osd $dir 0 || return 1 + wait_for_clean || return 1 + + sleep 5 + ceph pg scrub ${poolid}.0 + timeout 30 ceph -w + + for i in `seq 1 7` + do + rados -p $poolname rmsnap snap$i + done + + sleep 10 + + ERRORS=0 + + pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') + pid=$(cat $pidfile) + if ! kill -0 $pid + then + echo "OSD crash occurred" + tail -100 $dir/osd.0.log + ERRORS=$(expr $ERRORS + 1) + fi + + kill_daemons $dir || return 1 + + declare -a err_strings + err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/2acecc8b/obj10/1 is missing in clone_overlap" + err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/7 no '_' attr" + err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/7 is an unexpected clone" + err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/4 on disk size [(]4608[)] does not match object info size [(]512[)] adjusted for ondisk to [(]512[)]" + err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/head expected clone [0-9]*/666934a3/obj5/2" + err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/head expected clone [0-9]*/666934a3/obj5/1" + err_strings[6]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/head 1 missing clone[(]s[)]" + err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/d3a9faf5/obj12/head snapset.head_exists=false, but head exists" + err_strings[8]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/8df7eaa5/obj8/head snaps.seq not set" + err_strings[9]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/5c889059/obj7/head snapset.head_exists=false, but head exists" + err_strings[10]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 
[0-9]*/5c889059/obj7/1 is an unexpected clone" + err_strings[11]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/61f68bb1/obj3/head on disk size [(]3840[)] does not match object info size [(]768[)] adjusted for ondisk to [(]768[)]" + err_strings[12]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/83425cc4/obj6/1 is an unexpected clone" + err_strings[13]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/3f1ee208/obj2/snapdir no 'snapset' attr" + err_strings[14]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/3f1ee208/obj2/7 clone ignored due to missing snapset" + err_strings[15]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/3f1ee208/obj2/4 clone ignored due to missing snapset" + err_strings[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/a8759770/obj4/snapdir expected clone [0-9]*/a8759770/obj4/7" + err_strings[17]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/a8759770/obj4/snapdir 1 missing clone[(]s[)]" + err_strings[18]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/6cf8deff/obj1/1 is an unexpected clone" + err_strings[19]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/e478ac7f/obj9/1 is missing in clone_size" + err_strings[20]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/29547577/obj11/1 is an unexpected clone" + err_strings[21]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/94122507/obj14/1 size 1032 != clone_size 1033" + err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 19 errors" + + for i in `seq 0 ${#err_strings[@]}` + do + if ! 
grep "${err_strings[$i]}" $dir/osd.0.log > /dev/null; + then + echo "Missing log message '${err_strings[$i]}'" + ERRORS=$(expr $ERRORS + 1) + fi + done + + teardown $dir || return 1 + + if [ $ERRORS != "0" ]; + then + echo "TEST FAILED WITH $ERRORS ERRORS" + return 1 + fi + + echo "TEST PASSED" + return 0 +} + +main osd-scrub-snaps "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && \ +# test/osd/osd-scrub-snaps.sh" diff --git a/ceph/src/test/pybind/test_ceph_argparse.py b/ceph/src/test/pybind/test_ceph_argparse.py index eb79323c..5970576e 100755 --- a/ceph/src/test/pybind/test_ceph_argparse.py +++ b/ceph/src/test/pybind/test_ceph_argparse.py @@ -1086,9 +1086,8 @@ class TestOSD(TestArgparse): def test_reweight_by_utilization(self): self.assert_valid_command(['osd', 'reweight-by-utilization']) self.assert_valid_command(['osd', 'reweight-by-utilization', '100']) - assert_equal({}, validate_command(sigdict, ['osd', - 'reweight-by-utilization', - '50'])) + self.assert_valid_command(['osd', 'reweight-by-utilization', '100', '.1']) + self.assert_valid_command(['osd', 'reweight-by-utilization', '--no-increasing']) assert_equal({}, validate_command(sigdict, ['osd', 'reweight-by-utilization', '100', diff --git a/ceph/src/test/test_filejournal.cc b/ceph/src/test/test_filejournal.cc index befe7614..aaf64876 100644 --- a/ceph/src/test/test_filejournal.cc +++ b/ceph/src/test/test_filejournal.cc @@ -17,8 +17,14 @@ Finisher *finisher; Cond sync_cond; char path[200]; uuid_d fsid; -bool directio = false; -bool aio = false; +struct test_info { + bool directio, aio, faio; + const char *description; +} subtests[3] = { + { false, false, false, "DIRECTIO OFF AIO OFF" }, + { true, false, false, "DIRECTIO ON AIO OFF" }, + { true, true, true, "DIRECTIO ON AIO ON"} +}; // ---- Cond cond; @@ -95,21 +101,7 @@ int main(int argc, char **argv) { finisher->start(); - cout << "DIRECTIO OFF AIO OFF" << std::endl; - directio = false; - aio = false; int r = RUN_ALL_TESTS(); - if (r 
>= 0) { - cout << "DIRECTIO ON AIO OFF" << std::endl; - directio = true; - r = RUN_ALL_TESTS(); - - if (r >= 0) { - cout << "DIRECTIO ON AIO ON" << std::endl; - aio = true; - r = RUN_ALL_TESTS(); - } - } finisher->stop(); @@ -119,290 +111,366 @@ int main(int argc, char **argv) { } TEST(TestFileJournal, Create) { - fsid.generate_random(); - FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); - ASSERT_EQ(0, j.create()); + g_ceph_context->_conf->set_val("journal_ignore_corruption", "false"); + g_ceph_context->_conf->set_val("journal_write_header_frequency", "0"); + g_ceph_context->_conf->apply_changes(NULL); + + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + } } TEST(TestFileJournal, WriteSmall) { - fsid.generate_random(); - FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); - ASSERT_EQ(0, j.create()); - j.make_writeable(); + g_ceph_context->_conf->set_val("journal_ignore_corruption", "false"); + g_ceph_context->_conf->set_val("journal_write_header_frequency", "0"); + g_ceph_context->_conf->apply_changes(NULL); - bufferlist bl; - bl.append("small"); - j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done)); - wait(); + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); - j.close(); + bufferlist bl; + bl.append("small"); + j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done)); + wait(); + + j.close(); + } } TEST(TestFileJournal, WriteBig) { - fsid.generate_random(); - FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); - ASSERT_EQ(0, j.create()); - j.make_writeable(); + 
g_ceph_context->_conf->set_val("journal_ignore_corruption", "false"); + g_ceph_context->_conf->set_val("journal_write_header_frequency", "0"); + g_ceph_context->_conf->apply_changes(NULL); - bufferlist bl; - while (bl.length() < size_mb*1000/2) { - char foo[1024*1024]; - memset(foo, 1, sizeof(foo)); - bl.append(foo, sizeof(foo)); - } - j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done)); - wait(); + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); + + bufferlist bl; + while (bl.length() < size_mb*1000/2) { + char foo[1024*1024]; + memset(foo, 1, sizeof(foo)); + bl.append(foo, sizeof(foo)); + } + j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done)); + wait(); - j.close(); + j.close(); + } } TEST(TestFileJournal, WriteMany) { - fsid.generate_random(); - FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); - ASSERT_EQ(0, j.create()); - j.make_writeable(); + g_ceph_context->_conf->set_val("journal_ignore_corruption", "false"); + g_ceph_context->_conf->set_val("journal_write_header_frequency", "0"); + g_ceph_context->_conf->apply_changes(NULL); - C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); - - bufferlist bl; - bl.append("small"); - uint64_t seq = 1; - for (int i=0; i<100; i++) { + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); + + C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); + + bufferlist bl; bl.append("small"); - j.submit_entry(seq++, bl, 0, gb.new_sub()); - } + uint64_t seq = 1; + for (int i=0; i<100; i++) { + bl.append("small"); + 
j.submit_entry(seq++, bl, 0, gb.new_sub()); + } - gb.activate(); + gb.activate(); - wait(); + wait(); - j.close(); + j.close(); + } } TEST(TestFileJournal, WriteManyVecs) { - fsid.generate_random(); - FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); - ASSERT_EQ(0, j.create()); - j.make_writeable(); - - C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); - - bufferlist first; - first.append("small"); - j.submit_entry(1, first, 0, gb.new_sub()); - - bufferlist bl; - for (int i=0; i_conf->set_val("journal_ignore_corruption", "false"); + g_ceph_context->_conf->set_val("journal_write_header_frequency", "0"); + g_ceph_context->_conf->apply_changes(NULL); + + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); + + C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); + bufferlist first; + first.append("small"); + j.submit_entry(1, first, 0, gb.new_sub()); + + bufferlist bl; + for (int i=0; i_conf->set_val("journal_ignore_corruption", "false"); + g_ceph_context->_conf->set_val("journal_write_header_frequency", "0"); + g_ceph_context->_conf->apply_changes(NULL); + + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); + + C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); + + bufferlist bl; + bl.append("small"); + j.submit_entry(1, bl, 0, gb.new_sub()); + bl.append("small"); + j.submit_entry(2, bl, 0, gb.new_sub()); + bl.append("small"); + j.submit_entry(3, bl, 0, gb.new_sub()); + gb.activate(); + wait(); + + j.close(); + + j.open(1); + + bufferlist inbl; + string v; + 
uint64_t seq = 0; + ASSERT_EQ(true, j.read_entry(inbl, seq)); + ASSERT_EQ(seq, 2ull); + inbl.copy(0, inbl.length(), v); + ASSERT_EQ("small", v); + inbl.clear(); + v.clear(); + + ASSERT_EQ(true, j.read_entry(inbl, seq)); + ASSERT_EQ(seq, 3ull); + inbl.copy(0, inbl.length(), v); + ASSERT_EQ("small", v); + inbl.clear(); + v.clear(); + + ASSERT_TRUE(!j.read_entry(inbl, seq)); + + j.make_writeable(); + j.close(); + } } TEST(TestFileJournal, ReplayCorrupt) { - fsid.generate_random(); - FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); - ASSERT_EQ(0, j.create()); - j.make_writeable(); - - C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); - - const char *needle = "i am a needle"; - const char *newneedle = "in a haystack"; - bufferlist bl; - bl.append(needle); - j.submit_entry(1, bl, 0, gb.new_sub()); - bl.append(needle); - j.submit_entry(2, bl, 0, gb.new_sub()); - bl.append(needle); - j.submit_entry(3, bl, 0, gb.new_sub()); - bl.append(needle); - j.submit_entry(4, bl, 0, gb.new_sub()); - gb.activate(); - wait(); - - j.close(); - - cout << "corrupting journal" << std::endl; - char buf[1024*128]; - int fd = open(path, O_RDONLY); - ASSERT_GE(fd, 0); - int r = safe_read_exact(fd, buf, sizeof(buf)); - ASSERT_EQ(0, r); - int n = 0; - for (unsigned o=0; o < sizeof(buf) - strlen(needle); o++) { - if (memcmp(buf+o, needle, strlen(needle)) == 0) { - if (n >= 2) { + g_ceph_context->_conf->set_val("journal_ignore_corruption", "true"); + g_ceph_context->_conf->set_val("journal_write_header_frequency", "0"); + g_ceph_context->_conf->apply_changes(NULL); + + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); + + C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); + + const char *needle = "i am a needle"; + const 
char *newneedle = "in a haystack"; + bufferlist bl; + bl.append(needle); + j.submit_entry(1, bl, 0, gb.new_sub()); + bl.append(needle); + j.submit_entry(2, bl, 0, gb.new_sub()); + bl.append(needle); + j.submit_entry(3, bl, 0, gb.new_sub()); + bl.append(needle); + j.submit_entry(4, bl, 0, gb.new_sub()); + gb.activate(); + wait(); + + j.close(); + + cout << "corrupting journal" << std::endl; + char buf[1024*128]; + int fd = open(path, O_RDONLY); + ASSERT_GE(fd, 0); + int r = safe_read_exact(fd, buf, sizeof(buf)); + ASSERT_EQ(0, r); + int n = 0; + for (unsigned o=0; o < sizeof(buf) - strlen(needle); o++) { + if (memcmp(buf+o, needle, strlen(needle)) == 0) { + if (n >= 2) { cout << "replacing at offset " << o << std::endl; memcpy(buf+o, newneedle, strlen(newneedle)); - } else { + } else { cout << "leaving at offset " << o << std::endl; + } + n++; } - n++; } + ASSERT_EQ(n, 4); + close(fd); + fd = open(path, O_WRONLY); + ASSERT_GE(fd, 0); + r = safe_write(fd, buf, sizeof(buf)); + ASSERT_EQ(r, 0); + close(fd); + + j.open(1); + + bufferlist inbl; + string v; + uint64_t seq = 0; + ASSERT_EQ(true, j.read_entry(inbl, seq)); + ASSERT_EQ(seq, 2ull); + inbl.copy(0, inbl.length(), v); + ASSERT_EQ(needle, v); + inbl.clear(); + v.clear(); + bool corrupt; + ASSERT_FALSE(j.read_entry(inbl, seq, &corrupt)); + ASSERT_TRUE(corrupt); + + j.make_writeable(); + j.close(); } - ASSERT_EQ(n, 4); - close(fd); - fd = open(path, O_WRONLY); - ASSERT_GE(fd, 0); - r = safe_write(fd, buf, sizeof(buf)); - ASSERT_EQ(r, 0); - close(fd); - - j.open(1); - - bufferlist inbl; - string v; - uint64_t seq = 0; - ASSERT_EQ(true, j.read_entry(inbl, seq)); - ASSERT_EQ(seq, 2ull); - inbl.copy(0, inbl.length(), v); - ASSERT_EQ(needle, v); - inbl.clear(); - v.clear(); - ASSERT_TRUE(!j.read_entry(inbl, seq)); - - j.make_writeable(); - j.close(); } TEST(TestFileJournal, WriteTrim) { - fsid.generate_random(); - FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); - ASSERT_EQ(0, j.create()); - 
j.make_writeable(); + g_ceph_context->_conf->set_val("journal_ignore_corruption", "false"); + g_ceph_context->_conf->set_val("journal_write_header_frequency", "0"); + g_ceph_context->_conf->apply_changes(NULL); - list ls; - - bufferlist bl; - char foo[1024*1024]; - memset(foo, 1, sizeof(foo)); + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); - uint64_t seq = 1, committed = 0; + list ls; - for (unsigned i=0; ic); + bufferlist bl; + char foo[1024*1024]; + memset(foo, 1, sizeof(foo)); + + uint64_t seq = 1, committed = 0; + + for (unsigned i=0; ic); - while (ls.size() > size_mb/2) { + while (ls.size() > size_mb/2) { + delete ls.front(); + ls.pop_front(); + committed++; + j.committed_thru(committed); + } + } + + while (ls.size()) { delete ls.front(); ls.pop_front(); - committed++; - j.committed_thru(committed); + j.committed_thru(++committed); } - } - while (ls.size()) { - delete ls.front(); - ls.pop_front(); - j.committed_thru(committed); - } + ASSERT_TRUE(j.journalq_empty()); - j.close(); + j.close(); + } } TEST(TestFileJournal, WriteTrimSmall) { - fsid.generate_random(); - FileJournal j(fsid, finisher, &sync_cond, path, directio); - ASSERT_EQ(0, j.create()); - j.make_writeable(); + g_ceph_context->_conf->set_val("journal_ignore_corruption", "false"); + g_ceph_context->_conf->set_val("journal_write_header_frequency", "0"); + g_ceph_context->_conf->apply_changes(NULL); - list ls; - - bufferlist bl; - char foo[1024*1024]; - memset(foo, 1, sizeof(foo)); + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); - uint64_t seq = 1, committed = 0; + list ls; 
- for (unsigned i=0; ic); + bufferlist bl; + char foo[1024*1024]; + memset(foo, 1, sizeof(foo)); - while (ls.size() > size_mb/2) { + uint64_t seq = 1, committed = 0; + + for (unsigned i=0; ic); + + while (ls.size() > size_mb/2) { + delete ls.front(); + ls.pop_front(); + committed++; + j.committed_thru(committed); + } + } + + while (ls.size()) { delete ls.front(); ls.pop_front(); - committed++; j.committed_thru(committed); } - } - while (ls.size()) { - delete ls.front(); - ls.pop_front(); - j.committed_thru(committed); + j.close(); } - - j.close(); } TEST(TestFileJournal, ReplayDetectCorruptFooterMagic) { @@ -410,49 +478,53 @@ TEST(TestFileJournal, ReplayDetectCorruptFooterMagic) { g_ceph_context->_conf->set_val("journal_write_header_frequency", "1"); g_ceph_context->_conf->apply_changes(NULL); - fsid.generate_random(); - FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); - ASSERT_EQ(0, j.create()); - j.make_writeable(); - - C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); + + C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); + + const char *needle = "i am a needle"; + for (unsigned i = 1; i <= 4; ++i) { + bufferlist bl; + bl.append(needle); + j.submit_entry(i, bl, 0, gb.new_sub()); + } + gb.activate(); + wait(); - const char *needle = "i am a needle"; - for (unsigned i = 1; i <= 4; ++i) { bufferlist bl; - bl.append(needle); - j.submit_entry(i, bl, 0, gb.new_sub()); + bl.append("needle"); + j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done)); + wait(); + + j.close(); + int fd = open(path, O_WRONLY); + + cout << "corrupting journal" << std::endl; + j.open(0); + j.corrupt_footer_magic(fd, 2); + + uint64_t seq = 0; + 
bl.clear(); + bool corrupt = false; + bool result = j.read_entry(bl, seq, &corrupt); + ASSERT_TRUE(result); + ASSERT_EQ(seq, 1UL); + ASSERT_FALSE(corrupt); + + result = j.read_entry(bl, seq, &corrupt); + ASSERT_FALSE(result); + ASSERT_TRUE(corrupt); + + j.make_writeable(); + j.close(); + ::close(fd); } - gb.activate(); - wait(); - - bufferlist bl; - bl.append("needle"); - j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done)); - wait(); - - j.close(); - int fd = open(path, O_WRONLY); - - cout << "corrupting journal" << std::endl; - j.open(0); - j.corrupt_footer_magic(fd, 2); - - uint64_t seq = 0; - bl.clear(); - bool corrupt = false; - bool result = j.read_entry(bl, seq, &corrupt); - ASSERT_TRUE(result); - ASSERT_EQ(seq, 1UL); - ASSERT_FALSE(corrupt); - - result = j.read_entry(bl, seq, &corrupt); - ASSERT_FALSE(result); - ASSERT_TRUE(corrupt); - - j.make_writeable(); - j.close(); - ::close(fd); } TEST(TestFileJournal, ReplayDetectCorruptPayload) { @@ -460,49 +532,53 @@ TEST(TestFileJournal, ReplayDetectCorruptPayload) { g_ceph_context->_conf->set_val("journal_write_header_frequency", "1"); g_ceph_context->_conf->apply_changes(NULL); - fsid.generate_random(); - FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); - ASSERT_EQ(0, j.create()); - j.make_writeable(); - - C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); + + C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); + + const char *needle = "i am a needle"; + for (unsigned i = 1; i <= 4; ++i) { + bufferlist bl; + bl.append(needle); + j.submit_entry(i, bl, 0, gb.new_sub()); + } + gb.activate(); + wait(); - const char *needle = "i am a needle"; - for (unsigned i = 1; i <= 4; 
++i) { bufferlist bl; - bl.append(needle); - j.submit_entry(i, bl, 0, gb.new_sub()); + bl.append("needle"); + j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done)); + wait(); + + j.close(); + int fd = open(path, O_WRONLY); + + cout << "corrupting journal" << std::endl; + j.open(0); + j.corrupt_payload(fd, 2); + + uint64_t seq = 0; + bl.clear(); + bool corrupt = false; + bool result = j.read_entry(bl, seq, &corrupt); + ASSERT_TRUE(result); + ASSERT_EQ(seq, 1UL); + ASSERT_FALSE(corrupt); + + result = j.read_entry(bl, seq, &corrupt); + ASSERT_FALSE(result); + ASSERT_TRUE(corrupt); + + j.make_writeable(); + j.close(); + ::close(fd); } - gb.activate(); - wait(); - - bufferlist bl; - bl.append("needle"); - j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done)); - wait(); - - j.close(); - int fd = open(path, O_WRONLY); - - cout << "corrupting journal" << std::endl; - j.open(0); - j.corrupt_payload(fd, 2); - - uint64_t seq = 0; - bl.clear(); - bool corrupt = false; - bool result = j.read_entry(bl, seq, &corrupt); - ASSERT_TRUE(result); - ASSERT_EQ(seq, 1UL); - ASSERT_FALSE(corrupt); - - result = j.read_entry(bl, seq, &corrupt); - ASSERT_FALSE(result); - ASSERT_TRUE(corrupt); - - j.make_writeable(); - j.close(); - ::close(fd); } TEST(TestFileJournal, ReplayDetectCorruptHeader) { @@ -510,47 +586,51 @@ TEST(TestFileJournal, ReplayDetectCorruptHeader) { g_ceph_context->_conf->set_val("journal_write_header_frequency", "1"); g_ceph_context->_conf->apply_changes(NULL); - fsid.generate_random(); - FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); - ASSERT_EQ(0, j.create()); - j.make_writeable(); - - C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); + for (unsigned i = 0 ; i < 3; ++i) { + SCOPED_TRACE(subtests[i].description); + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio, + subtests[i].aio, subtests[i].faio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); + + 
C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done)); + + const char *needle = "i am a needle"; + for (unsigned i = 1; i <= 4; ++i) { + bufferlist bl; + bl.append(needle); + j.submit_entry(i, bl, 0, gb.new_sub()); + } + gb.activate(); + wait(); - const char *needle = "i am a needle"; - for (unsigned i = 1; i <= 4; ++i) { bufferlist bl; - bl.append(needle); - j.submit_entry(i, bl, 0, gb.new_sub()); + bl.append("needle"); + j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done)); + wait(); + + j.close(); + int fd = open(path, O_WRONLY); + + cout << "corrupting journal" << std::endl; + j.open(0); + j.corrupt_header_magic(fd, 2); + + uint64_t seq = 0; + bl.clear(); + bool corrupt = false; + bool result = j.read_entry(bl, seq, &corrupt); + ASSERT_TRUE(result); + ASSERT_EQ(seq, 1UL); + ASSERT_FALSE(corrupt); + + result = j.read_entry(bl, seq, &corrupt); + ASSERT_FALSE(result); + ASSERT_TRUE(corrupt); + + j.make_writeable(); + j.close(); + ::close(fd); } - gb.activate(); - wait(); - - bufferlist bl; - bl.append("needle"); - j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done)); - wait(); - - j.close(); - int fd = open(path, O_WRONLY); - - cout << "corrupting journal" << std::endl; - j.open(0); - j.corrupt_header_magic(fd, 2); - - uint64_t seq = 0; - bl.clear(); - bool corrupt = false; - bool result = j.read_entry(bl, seq, &corrupt); - ASSERT_TRUE(result); - ASSERT_EQ(seq, 1UL); - ASSERT_FALSE(corrupt); - - result = j.read_entry(bl, seq, &corrupt); - ASSERT_FALSE(result); - ASSERT_TRUE(corrupt); - - j.make_writeable(); - j.close(); - ::close(fd); } diff --git a/ceph/src/test/testcrypto.cc b/ceph/src/test/testcrypto.cc index 0b7a9d54..60f5905b 100644 --- a/ceph/src/test/testcrypto.cc +++ b/ceph/src/test/testcrypto.cc @@ -25,8 +25,8 @@ int main(int argc, char *argv[]) bufferlist enc_out; std::string error; - key.encrypt(g_ceph_context, enc_in, enc_out, error); - if (!error.empty()) { + if (key.encrypt(g_ceph_context, enc_in, 
enc_out, &error) < 0) { + assert(!error.empty()); dout(0) << "couldn't encode! error " << error << dendl; exit(1); } @@ -42,8 +42,8 @@ int main(int argc, char *argv[]) dec_in = enc_out; - key.decrypt(g_ceph_context, dec_in, dec_out, error); - if (!error.empty()) { + if (key.decrypt(g_ceph_context, dec_in, dec_out, &error) < 0) { + assert(!error.empty()); dout(0) << "couldn't decode! error " << error << dendl; exit(1); } diff --git a/ceph/src/tools/ceph-client-debug.cc b/ceph/src/tools/ceph-client-debug.cc index 2ed93326..a84cadcf 100644 --- a/ceph/src/tools/ceph-client-debug.cc +++ b/ceph/src/tools/ceph-client-debug.cc @@ -163,7 +163,7 @@ int main(int argc, const char **argv) // Release Inode references ceph_ll_forget(client, ino, 1); for (std::vector::reverse_iterator p = path.rbegin(); p != path.rend(); ++p) { - ceph_ll_forget(client, (*p)->inode, 1); + ceph_ll_forget(client, (*p)->inode.get(), 1); } ino = NULL; path.clear(); diff --git a/ceph/src/tools/ceph_objectstore_tool.cc b/ceph/src/tools/ceph_objectstore_tool.cc index 9e689466..de791196 100644 --- a/ceph/src/tools/ceph_objectstore_tool.cc +++ b/ceph/src/tools/ceph_objectstore_tool.cc @@ -26,6 +26,7 @@ #include "os/ObjectStore.h" #include "os/FileStore.h" +#include "os/FileJournal.h" #include "osd/PGLog.h" #include "osd/OSD.h" @@ -91,6 +92,7 @@ const uint16_t shortmagic = 0xffce; //goes into stream as "ceff" const mymagic_t endmagic = (0xecff << 16) | shortmagic; const int fd_none = INT_MIN; bool outistty; +bool dry_run = false; //The first FIXED_LENGTH bytes are a fixed //portion of the export output. 
This includes the overall @@ -327,20 +329,23 @@ struct metadata_section { map past_intervals; OSDMap osdmap; bufferlist osdmap_bl; // Used in lieu of encoding osdmap due to crc checking + map divergent_priors; metadata_section(__u8 struct_ver, epoch_t map_epoch, const pg_info_t &info, - const pg_log_t &log, map &past_intervals) + const pg_log_t &log, map &past_intervals, + map &divergent_priors) : struct_ver(struct_ver), map_epoch(map_epoch), info(info), log(log), - past_intervals(past_intervals) { } + past_intervals(past_intervals), + divergent_priors(divergent_priors) { } metadata_section() : struct_ver(0), map_epoch(0) { } void encode(bufferlist& bl) const { - ENCODE_START(3, 1, bl); + ENCODE_START(4, 1, bl); ::encode(struct_ver, bl); ::encode(map_epoch, bl); ::encode(info, bl); @@ -349,10 +354,11 @@ struct metadata_section { // Equivalent to osdmap.encode(bl, features); but // preserving exact layout for CRC checking. bl.append(osdmap_bl); + ::encode(divergent_priors, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { - DECODE_START(3, bl); + DECODE_START(4, bl); ::decode(struct_ver, bl); ::decode(map_epoch, bl); ::decode(info, bl); @@ -367,6 +373,9 @@ struct metadata_section { } else { cout << "WARNING: Older export without OSDMap information" << std::endl; } + if (struct_v > 3) { + ::decode(divergent_priors, bl); + } DECODE_FINISH(bl); } }; @@ -400,22 +409,24 @@ int _action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_obje ++obj) { if (obj->is_pgmeta()) continue; - bufferlist attr; - r = store->getattr(coll, *obj, OI_ATTR, attr); - if (r < 0) { - cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", " - << cpp_strerror(r) << std::endl; - return r; - } object_info_t oi; - bufferlist::iterator bp = attr.begin(); - try { - ::decode(oi, bp); - } catch (...) 
{ - r = -EINVAL; - cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", " - << cpp_strerror(r) << std::endl; - return r; + if (coll != META_COLL) { + bufferlist attr; + r = store->getattr(coll, *obj, OI_ATTR, attr); + if (r < 0) { + cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", " + << cpp_strerror(r) << std::endl; + continue; + } + bufferlist::iterator bp = attr.begin(); + try { + ::decode(oi, bp); + } catch (...) { + r = -EINVAL; + cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", " + << cpp_strerror(r) << std::endl; + continue; + } } r = action.call(store, coll, *obj, oi); if (r < 0) @@ -425,7 +436,54 @@ int _action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_obje return 0; } -int action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug) +int action_on_all_objects_in_pg(ObjectStore *store, string pgidstr, action_on_object_t &action, bool debug) +{ + spg_t pgid; + // Scan collections in case this is an ec pool but no shard specified + unsigned scanned = 0; + int r = 0; + vector colls_to_check; + vector candidates; + r = store->list_collections(candidates); + if (r < 0) { + cerr << "Error listing collections: " << cpp_strerror(r) << std::endl; + return r; + } + pgid.parse(pgidstr.c_str()); + for (vector::iterator i = candidates.begin(); + i != candidates.end(); + ++i) { + spg_t cand_pgid; + snapid_t snap; + if (!i->is_pg(cand_pgid, snap)) + continue; + if (snap != CEPH_NOSNAP) + continue; + + // If an exact match or treat no shard as any shard + if (cand_pgid == pgid || + (pgid.is_no_shard() && pgid.pgid == cand_pgid.pgid)) { + colls_to_check.push_back(*i); + } + } + + if (debug) + cerr << colls_to_check.size() << " pgs to scan" << std::endl; + for (vector::iterator i = colls_to_check.begin(); + i != colls_to_check.end(); + ++i, ++scanned) { + if (debug) + cerr << "Scanning " << *i << ", " << scanned << "/" + << colls_to_check.size() << " completed" << 
std::endl; + r = _action_on_all_objects_in_pg(store, *i, action, debug); + if (r < 0) + break; + } + store->sync_and_flush(); + return r; +} + +int action_on_all_objects_in_exact_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug) { int r = _action_on_all_objects_in_pg(store, coll, action, debug); store->sync_and_flush(); @@ -488,34 +546,42 @@ struct pgid_object_list { for (list >::const_iterator i = _objects.begin(); i != _objects.end(); ++i) { - if (i != _objects.begin() && human_readable) { - f->flush(cout); - cout << std::endl; - } f->open_array_section("pgid_object"); - string pgid = i->first.c_str(); - std::size_t pos = pgid.find("_"); - if (pos == string::npos) - f->dump_string("pgid", pgid); - else - f->dump_string("pgid", pgid.substr(0, pos)); + snapid_t snap; + spg_t pgid; + bool is_pg = i->first.is_pg(pgid, snap); + if (is_pg) + f->dump_string("pgid", stringify(pgid)); + if (!is_pg || !human_readable) + f->dump_string("coll", i->first.to_str()); f->open_object_section("ghobject"); i->second.dump(f); f->close_section(); f->close_section(); + if (human_readable) { + f->flush(cout); + cout << std::endl; + } } - if (!human_readable) + if (!human_readable) { f->close_section(); + f->flush(cout); + cout << std::endl; + } } }; struct lookup_ghobject : public action_on_object_t { pgid_object_list _objects; const string _name; + bool _need_snapset; - lookup_ghobject(const string& name) : _name(name) { } + lookup_ghobject(const string& name, bool need_snapset = false) : _name(name), + _need_snapset(need_snapset) { } virtual int call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) { + if (_need_snapset && !ghobj.hobj.has_snapset()) + return 0; if (_name.length() == 0 || ghobj.hobj.oid.name == _name) _objects.insert(coll, ghobj); return 0; @@ -547,6 +613,8 @@ uint64_t testalign; template int write_section(sectiontype_t type, const T& obj, int fd) { + if (dry_run) + return 0; bufferlist blhdr, bl, blftr; obj.encode(bl); 
header hdr(type, bl.length()); @@ -587,6 +655,8 @@ static void cleanbin(string &str) int write_simple(sectiontype_t type, int fd) { + if (dry_run) + return 0; bufferlist hbl; header hdr(type, 0); @@ -600,8 +670,8 @@ static int get_fd_data(int fd, bufferlist &bl) do { ssize_t bytes = bl.read_fd(fd, max_read); if (bytes < 0) { - cerr << "read_fd error " << cpp_strerror(-bytes) << std::endl; - return 1; + cerr << "read_fd error " << cpp_strerror(bytes) << std::endl; + return bytes; } if (bytes == 0) @@ -614,17 +684,24 @@ static int get_fd_data(int fd, bufferlist &bl) return 0; } +void myexit(int ret) +{ + if (g_ceph_context) + g_ceph_context->put(); + exit(ret); +} + static void invalid_filestore_path(string &path) { cerr << "Invalid filestore path specified: " << path << "\n"; - exit(1); + myexit(1); } int get_log(ObjectStore *fs, __u8 struct_ver, coll_t coll, spg_t pgid, const pg_info_t &info, - PGLog::IndexedLog &log, pg_missing_t &missing) + PGLog::IndexedLog &log, pg_missing_t &missing, + map &divergent_priors) { - map divergent_priors; try { ostringstream oss; assert(struct_ver > 0); @@ -637,11 +714,38 @@ int get_log(ObjectStore *fs, __u8 struct_ver, } catch (const buffer::error &e) { cerr << "read_log threw exception error " << e.what() << std::endl; - return 1; + return -EFAULT; } return 0; } +void dump_log(Formatter *formatter, ostream &out, pg_log_t &log, + pg_missing_t &missing, map &divergent_priors) +{ + formatter->open_object_section("op_log"); + formatter->open_object_section("pg_log_t"); + log.dump(formatter); + formatter->close_section(); + formatter->flush(out); + formatter->open_object_section("pg_missing_t"); + missing.dump(formatter); + formatter->close_section(); + formatter->flush(out); + formatter->open_object_section("map"); + formatter->open_array_section("divergent_priors"); + for (map::iterator it = divergent_priors.begin(); + it != divergent_priors.end(); ++ it) { + formatter->open_object_section("item"); + 
formatter->dump_stream("eversion") << it->first; + formatter->dump_stream("hobject") << it->second; + formatter->close_section(); + } + formatter->close_section(); + formatter->close_section(); + formatter->close_section(); + formatter->flush(out); +} + //Based on RemoveWQ::_process() void remove_coll(ObjectStore *store, const coll_t &coll) { @@ -696,7 +800,7 @@ int finish_remove_pgs(ObjectStore *store) vector ls; int r = store->list_collections(ls); if (r < 0) { - cerr << "finish_remove_pgs: failed to list pgs: " << cpp_strerror(-r) + cerr << "finish_remove_pgs: failed to list pgs: " << cpp_strerror(r) << std::endl; return r; } @@ -747,7 +851,7 @@ int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t __u8 struct_v; r = PG::read_info(fs, pgid, coll, bl, info, past_intervals, struct_v); if (r < 0) { - cerr << __func__ << " error on read_info " << cpp_strerror(-r) << std::endl; + cerr << __func__ << " error on read_info " << cpp_strerror(r) << std::endl; return r; } if (struct_v < 8) { @@ -775,10 +879,14 @@ int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid) { + if (!dry_run) + finish_remove_pgs(store); if (!store->collection_exists(coll_t(r_pgid))) return -ENOENT; cout << " marking collection for removal" << std::endl; + if (dry_run) + return 0; ObjectStore::Transaction *rmt = new ObjectStore::Transaction; int r = mark_pg_for_removal(store, r_pgid, rmt); if (r < 0) { @@ -786,6 +894,7 @@ int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid) return r; } store->apply_transaction(*rmt); + finish_remove_pgs(store); return r; } @@ -798,7 +907,7 @@ int header::get_header() bytes = ebl.read_fd(file_fd, sh.header_size); if ((size_t)bytes != sh.header_size) { cerr << "Unexpected EOF" << std::endl; - return EFAULT; + return -EFAULT; } decode(ebliter); @@ -815,14 +924,14 @@ int footer::get_footer() bytes = ebl.read_fd(file_fd, sh.footer_size); if 
((size_t)bytes != sh.footer_size) { cerr << "Unexpected EOF" << std::endl; - return EFAULT; + return -EFAULT; } decode(ebliter); if (magic != endmagic) { cerr << "Bad footer magic" << std::endl; - return EFAULT; + return -EFAULT; } return 0; @@ -839,18 +948,17 @@ int write_info(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info, past_intervals, pgmeta_oid, true); - if (ret < 0) ret = -ret; if (ret) cerr << "Failed to write info" << std::endl; return ret; } int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info, - pg_log_t &log, map &past_intervals) + pg_log_t &log, map &past_intervals, + map &divergent_priors) { int ret = write_info(t, epoch, info, past_intervals); if (ret) return ret; - map divergent_priors; coll_t coll(info.pgid); PGLog::write_log(t, log, coll, info.pgid.make_pgmeta_oid(), divergent_priors); return 0; @@ -945,7 +1053,7 @@ int export_file(ObjectStore *store, coll_t cid, ghobject_t &obj) bufferlist hdrbuf; ret = store->omap_get_header(cid, obj, &hdrbuf, true); if (ret < 0) { - cerr << "omap_get_header: " << cpp_strerror(-ret) << std::endl; + cerr << "omap_get_header: " << cpp_strerror(ret) << std::endl; return ret; } @@ -957,7 +1065,7 @@ int export_file(ObjectStore *store, coll_t cid, ghobject_t &obj) ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(cid, obj); if (!iter) { ret = -ENOENT; - cerr << "omap_get_iterator: " << cpp_strerror(-ret) << std::endl; + cerr << "omap_get_iterator: " << cpp_strerror(ret) << std::endl; return ret; } iter->seek_to_first(); @@ -997,7 +1105,7 @@ int export_files(ObjectStore *store, coll_t coll) for (vector::iterator i = objects.begin(); i != objects.end(); ++i) { - if (i->is_pgmeta()) { + if (i->is_pgmeta() || i->hobj.is_temp()) { continue; } r = export_file(store, coll, *i); @@ -1008,13 +1116,96 @@ int export_files(ObjectStore *store, coll_t coll) return 0; } +int set_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) { + OSDMap::Incremental inc; + 
bufferlist::iterator it = bl.begin(); + inc.decode(it); + if (e == 0) { + e = inc.epoch; + } else if (e != inc.epoch) { + cerr << "incremental.epoch mismatch: " + << inc.epoch << " != " << e << std::endl; + if (force) { + cerr << "But will continue anyway." << std::endl; + } else { + return -EINVAL; + } + } + const ghobject_t inc_oid = OSD::get_inc_osdmap_pobject_name(e); + if (!store->exists(META_COLL, inc_oid)) { + cerr << "inc-osdmap (" << inc_oid << ") does not exist." << std::endl; + if (!force) { + return -ENOENT; + } + cout << "Creating a new epoch." << std::endl; + } + if (dry_run) + return 0; + ObjectStore::Transaction t; + t.write(META_COLL, inc_oid, 0, bl.length(), bl); + t.truncate(META_COLL, inc_oid, bl.length()); + int ret = store->apply_transaction(t); + if (ret) { + cerr << "Failed to set inc-osdmap (" << inc_oid << "): " << ret << std::endl; + } else { + cout << "Wrote inc-osdmap." << inc.epoch << std::endl; + } + return ret; +} + +int get_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl) +{ + if (store->read(META_COLL, + OSD::get_inc_osdmap_pobject_name(e), + 0, 0, bl) < 0) { + return -ENOENT; + } + return 0; +} + +int set_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) { + OSDMap osdmap; + osdmap.decode(bl); + if (e == 0) { + e = osdmap.get_epoch(); + } else if (e != osdmap.get_epoch()) { + cerr << "osdmap.epoch mismatch: " + << e << " != " << osdmap.get_epoch() << std::endl; + if (force) { + cerr << "But will continue anyway." << std::endl; + } else { + return -EINVAL; + } + } + const ghobject_t full_oid = OSD::get_osdmap_pobject_name(e); + if (!store->exists(META_COLL, full_oid)) { + cerr << "osdmap (" << full_oid << ") does not exist." << std::endl; + if (!force) { + return -ENOENT; + } + cout << "Creating a new epoch." 
<< std::endl; + } + if (dry_run) + return 0; + ObjectStore::Transaction t; + t.write(META_COLL, full_oid, 0, bl.length(), bl); + t.truncate(META_COLL, full_oid, bl.length()); + int ret = store->apply_transaction(t); + if (ret) { + cerr << "Failed to set osdmap (" << full_oid << "): " << ret << std::endl; + } else { + cout << "Wrote osdmap." << osdmap.get_epoch() << std::endl; + } + return ret; +} + int get_osdmap(ObjectStore *store, epoch_t e, OSDMap &osdmap, bufferlist& bl) { bool found = store->read( META_COLL, OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0; if (!found) { cerr << "Can't find OSDMap for pg epoch " << e << std::endl; - return ENOENT; + return -ENOENT; } osdmap.decode(bl); if (debug) @@ -1030,6 +1221,8 @@ int add_osdmap(ObjectStore *store, metadata_section &ms) //Write super_header with its fixed 16 byte length void write_super() { + if (dry_run) + return; bufferlist superbl; super_header sh; footer ft; @@ -1056,13 +1249,21 @@ int do_export(ObjectStore *fs, coll_t coll, spg_t pgid, pg_info_t &info, { PGLog::IndexedLog log; pg_missing_t missing; + map divergent_priors; cerr << "Exporting " << pgid << std::endl; - int ret = get_log(fs, struct_ver, coll, pgid, info, log, missing); + int ret = get_log(fs, struct_ver, coll, pgid, info, log, missing, + divergent_priors); if (ret > 0) return ret; + if (debug) { + Formatter *formatter = Formatter::create("json-pretty"); + assert(formatter); + dump_log(formatter, cerr, log, missing, divergent_priors); + delete formatter; + } write_super(); pg_begin pgb(pgid, superblock); @@ -1076,7 +1277,7 @@ int do_export(ObjectStore *fs, coll_t coll, spg_t pgid, pg_info_t &info, // The metadata_section is now before files, so import can detect // errors and abort without wasting time. 
- metadata_section ms(struct_ver, map_epoch, info, log, past_intervals); + metadata_section ms(struct_ver, map_epoch, info, log, past_intervals, divergent_priors); ret = add_osdmap(fs, ms); if (ret) return ret; @@ -1106,7 +1307,7 @@ int super_header::read_super() bytes = ebl.read_fd(file_fd, super_header::FIXED_LENGTH); if ((size_t)bytes != super_header::FIXED_LENGTH) { cerr << "Unexpected EOF" << std::endl; - return EFAULT; + return -EFAULT; } decode(ebliter); @@ -1129,7 +1330,7 @@ int read_section(int fd, sectiontype_t *type, bufferlist *bl) bytes = bl->read_fd(fd, hdr.size); if (bytes != hdr.size) { cerr << "Unexpected EOF" << std::endl; - return EFAULT; + return -EFAULT; } if (hdr.size > 0) { @@ -1244,19 +1445,20 @@ int skip_object(bufferlist &bl) done = true; break; default: - return EFAULT; + return -EFAULT; } } return 0; } -int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl) +int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl, bool no_overwrite) { bufferlist::iterator ebliter = bl.begin(); object_begin ob; ob.decode(ebliter); map::iterator i; bufferlist abl; + bool skipping; data_section ds; attr_section as; @@ -1279,23 +1481,48 @@ int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl) ioctx.set_namespace(ob.hoid.hobj.get_namespace()); string msg("Write"); - int ret = ioctx.create(ob.hoid.hobj.oid.name, true); - if (ret && ret != -EEXIST) { - cerr << "create failed: " << cpp_strerror(ret) << std::endl; - return ret; - } - if (ret == -EEXIST) { - msg = "***Overwrite***"; - ret = ioctx.remove(ob.hoid.hobj.oid.name); - if (ret < 0) { - cerr << "remove failed: " << cpp_strerror(ret) << std::endl; - return ret; + skipping = false; + if (dry_run) { + uint64_t psize; + time_t pmtime; + int ret = ioctx.stat(ob.hoid.hobj.oid.name, &psize, &pmtime); + if (ret == 0) { + if (no_overwrite) + // Could set skipping, but dry-run doesn't change anything either + msg = "Skipping existing"; + else + msg = "***Overwrite***"; } - ret = 
ioctx.create(ob.hoid.hobj.oid.name, true); - if (ret < 0) { + } else { + int ret = ioctx.create(ob.hoid.hobj.oid.name, true); + if (ret && ret != -EEXIST) { cerr << "create failed: " << cpp_strerror(ret) << std::endl; return ret; } + if (ret == -EEXIST) { + if (no_overwrite) { + msg = "Skipping existing"; + skipping = true; + } else { + msg = "***Overwrite***"; + ret = ioctx.remove(ob.hoid.hobj.oid.name); + if (ret < 0) { + cerr << "remove failed: " << cpp_strerror(ret) << std::endl; + return ret; + } + ret = ioctx.create(ob.hoid.hobj.oid.name, true); + // If object re-appeared after removal, let's just skip it + if (ret == -EEXIST) { + skipping = true; + msg = "Skipping in-use object"; + ret = 0; + } + if (ret < 0) { + cerr << "create failed: " << cpp_strerror(ret) << std::endl; + return ret; + } + } + } } cout << msg << " " << ob.hoid << std::endl; @@ -1337,7 +1564,7 @@ int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl) if (need_align) { if (ds.offset != in_offset) { cerr << "Discontiguous object data in export" << std::endl; - return EFAULT; + return -EFAULT; } assert(ds.databl.length() == ds.len); databl.claim_append(ds.databl); @@ -1345,10 +1572,12 @@ int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl) if (databl.length() >= alignment) { uint64_t rndlen = uint64_t(databl.length() / alignment) * alignment; if (debug) cerr << "write offset=" << out_offset << " len=" << rndlen << std::endl; - ret = ioctx.write(ob.hoid.hobj.oid.name, databl, rndlen, out_offset); - if (ret) { - cerr << "write failed: " << cpp_strerror(ret) << std::endl; - return ret; + if (!dry_run && !skipping) { + ret = ioctx.write(ob.hoid.hobj.oid.name, databl, rndlen, out_offset); + if (ret) { + cerr << "write failed: " << cpp_strerror(ret) << std::endl; + return ret; + } } out_offset += rndlen; bufferlist n; @@ -1360,10 +1589,12 @@ int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl) } break; } - ret = ioctx.write(ob.hoid.hobj.oid.name, ds.databl, ds.len, ds.offset); 
- if (ret) { - cerr << "write failed: " << cpp_strerror(ret) << std::endl; - return ret; + if (!dry_run && !skipping) { + ret = ioctx.write(ob.hoid.hobj.oid.name, ds.databl, ds.len, ds.offset); + if (ret) { + cerr << "write failed: " << cpp_strerror(ret) << std::endl; + return ret; + } } break; case TYPE_ATTRS: @@ -1371,8 +1602,12 @@ int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl) if (debug) cerr << "\tattrs: len " << as.data.size() << std::endl; + if (dry_run || skipping) + break; for (i = as.data.begin(); i != as.data.end(); ++i) { - if (i->first == "_" || i->first == "snapset") + // The user xattrs that we want all begin with "_" with length > 1. + // Drop key "_" and all attributes that do not start with '_' + if (i->first == "_" || i->first[0] != '_') continue; abl.clear(); abl.push_front(i->second); @@ -1390,6 +1625,8 @@ int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl) if (debug) cerr << "\tomap header: " << string(oh.hdr.c_str(), oh.hdr.length()) << std::endl; + if (dry_run || skipping) + break; ret = ioctx.omap_set_header(ob.hoid.hobj.oid.name, oh.hdr); if (ret) { cerr << "omap_set_header failed: " << cpp_strerror(ret) << std::endl; @@ -1402,6 +1639,8 @@ int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl) if (debug) cerr << "\tomap: size " << os.omap.size() << std::endl; + if (dry_run || skipping) + break; ret = ioctx.omap_set(ob.hoid.hobj.oid.name, os.omap); if (ret) { cerr << "omap_set failed: " << cpp_strerror(ret) << std::endl; @@ -1410,25 +1649,28 @@ int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl) } break; case TYPE_OBJECT_END: + done = true; if (need_align && databl.length() > 0) { assert(databl.length() < alignment); if (debug) cerr << "END write offset=" << out_offset << " len=" << databl.length() << std::endl; + if (dry_run || skipping) + break; ret = ioctx.write(ob.hoid.hobj.oid.name, databl, databl.length(), out_offset); if (ret) { cerr << "write failed: " << cpp_strerror(ret) << std::endl; return 
ret; } } - done = true; break; default: - return EFAULT; + return -EFAULT; } } return 0; } -int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap) +int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap, + bool *skipped_objects) { ObjectStore::Transaction tran; ObjectStore::Transaction *t = &tran; @@ -1443,33 +1685,39 @@ int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap) coll.is_pg_prefix(pg); SnapMapper mapper(&driver, 0, 0, 0, pg.shard); + if (ob.hoid.hobj.is_temp()) { + cerr << "ERROR: Export contains temporary object '" << ob.hoid << "'" << std::endl; + return -EFAULT; + } assert(g_ceph_context); if (ob.hoid.hobj.nspace != g_ceph_context->_conf->osd_hit_set_namespace) { object_t oid = ob.hoid.hobj.oid; object_locator_t loc(ob.hoid.hobj); pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc); pg_t pgid = curmap.raw_pg_to_pg(raw_pgid); - + spg_t coll_pgid; snapid_t coll_snap; if (coll.is_pg(coll_pgid, coll_snap) == false) { cerr << "INTERNAL ERROR: Bad collection during import" << std::endl; - return 1; + return -EFAULT; } if (coll_pgid.shard != ob.hoid.shard_id) { - cerr << "INTERNAL ERROR: Importing shard " << coll_pgid.shard + cerr << "INTERNAL ERROR: Importing shard " << coll_pgid.shard << " but object shard is " << ob.hoid.shard_id << std::endl; - return 1; + return -EFAULT; } - + if (coll_pgid.pgid != pgid) { - cerr << "Skipping object '" << ob.hoid << "' which no longer belongs in exported pg" << std::endl; + cerr << "Skipping object '" << ob.hoid << "' which belongs in pg " << pgid << std::endl; + *skipped_objects = true; skip_object(bl); return 0; } } - t->touch(coll, ob.hoid); + if (!dry_run) + t->touch(coll, ob.hoid); cout << "Write " << ob.hoid << std::endl; @@ -1489,18 +1737,22 @@ int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap) } switch(type) { case TYPE_DATA: + if (dry_run) break; ret = get_data(store, coll, ob.hoid, t, ebl); if (ret) return 
ret; break; case TYPE_ATTRS: + if (dry_run) break; ret = get_attrs(store, coll, ob.hoid, t, ebl, driver, mapper); if (ret) return ret; break; case TYPE_OMAP_HDR: + if (dry_run) break; ret = get_omap_hdr(store, coll, ob.hoid, t, ebl); if (ret) return ret; break; case TYPE_OMAP: + if (dry_run) break; ret = get_omap(store, coll, ob.hoid, t, ebl); if (ret) return ret; break; @@ -1508,21 +1760,26 @@ int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap) done = true; break; default: - return EFAULT; + cerr << "Unknown section type " << type << std::endl; + return -EFAULT; } } - store->apply_transaction(*t); + if (!dry_run) + store->apply_transaction(*t); return 0; } int get_pg_metadata(ObjectStore *store, bufferlist &bl, metadata_section &ms, - const OSDSuperblock& sb, OSDMap& curmap) + const OSDSuperblock& sb, OSDMap& curmap, spg_t pgid) { bufferlist::iterator ebliter = bl.begin(); ms.decode(ebliter); + spg_t old_pgid = ms.info.pgid; + ms.info.pgid = pgid; #if DIAGNOSTIC Formatter *formatter = new JSONFormatter(true); + cout << "export pgid " << old_pgid << std::endl; cout << "struct_v " << (int)ms.struct_ver << std::endl; cout << "map epoch " << ms.map_epoch << std::endl; @@ -1550,52 +1807,101 @@ int get_pg_metadata(ObjectStore *store, bufferlist &bl, metadata_section &ms, formatter->close_section(); formatter->flush(cout); cout << std::endl; + + formatter->open_array_section("divergent_priors"); + for (map::iterator it = ms.divergent_priors.begin(); + it != ms.divergent_priors.end(); ++ it) { + formatter->open_object_section("item"); + formatter->dump_stream("eversion") << it->first; + formatter->dump_stream("hobject") << it->second; + formatter->close_section(); + } + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; #endif + if (ms.osdmap.get_epoch() != 0 && ms.map_epoch != ms.osdmap.get_epoch()) { + cerr << "FATAL: Invalid OSDMap epoch in export data" << std::endl; + return -EFAULT; + } + if (ms.map_epoch > 
sb.current_epoch) { - cerr << "ERROR: Export map_epoch " << ms.map_epoch << " > osd epoch " << sb.current_epoch << std::endl; - return 1; + cerr << "ERROR: Export PG's map_epoch " << ms.map_epoch << " > OSD's epoch " << sb.current_epoch << std::endl; + cerr << "The OSD you are using is older than the exported PG" << std::endl; + cerr << "Either use another OSD or join selected OSD to cluster to update it first" << std::endl; + return -EINVAL; } - // If the osdmap was present in the metadata we can check for splits. // Pool verified to exist for call to get_pg_num(). - if (ms.map_epoch < sb.current_epoch) { - bool found_map = false; + unsigned new_pg_num = curmap.get_pg_num(pgid.pgid.pool()); + + if (pgid.pgid.ps() >= new_pg_num) { + cerr << "Illegal pgid, the seed is larger than current pg_num" << std::endl; + return -EINVAL; + } + + // Old exports didn't include OSDMap, see if we have a copy locally + if (ms.osdmap.get_epoch() == 0) { OSDMap findmap; bufferlist findmap_bl; int ret = get_osdmap(store, ms.map_epoch, findmap, findmap_bl); - if (ret == 0) - found_map = true; - - // Old export didn't include OSDMap - if (ms.osdmap.get_epoch() == 0) { - // If we found the map locally and an older export didn't have it, - // then we'll use the local one. - if (found_map) { - ms.osdmap = findmap; - } else { - cerr << "WARNING: No OSDMap in old export," - " some objects may be ignored due to a split" << std::endl; - } + if (ret == 0) { + ms.osdmap = findmap; + } else { + cerr << "WARNING: No OSDMap in old export," + " some objects may be ignored due to a split" << std::endl; } + } + + // Make sure old_pg_num is 0 in the unusual case that OSDMap not in export + // nor can we find a local copy. 
+ unsigned old_pg_num = 0; + if (ms.osdmap.get_epoch() != 0) + old_pg_num = ms.osdmap.get_pg_num(pgid.pgid.pool()); + + if (debug) { + cerr << "old_pg_num " << old_pg_num << std::endl; + cerr << "new_pg_num " << new_pg_num << std::endl; + cerr << ms.osdmap << std::endl; + cerr << curmap << std::endl; + } - // If OSDMap is available check for splits - if (ms.osdmap.get_epoch()) { - spg_t parent(ms.info.pgid); - if (parent.is_split(ms.osdmap.get_pg_num(ms.info.pgid.pgid.m_pool), - curmap.get_pg_num(ms.info.pgid.pgid.m_pool), NULL)) { - cerr << "WARNING: Split occurred, some objects may be ignored" << std::endl; + // If we have managed to have a good OSDMap we can do these checks + if (old_pg_num) { + if (old_pgid.pgid.ps() >= old_pg_num) { + cerr << "FATAL: pgid invalid for original map epoch" << std::endl; + return -EFAULT; + } + if (pgid.pgid.ps() >= old_pg_num) { + cout << "NOTICE: Post split pgid specified" << std::endl; + } else { + spg_t parent(pgid); + if (parent.is_split(old_pg_num, new_pg_num, NULL)) { + cerr << "WARNING: Split occurred, some objects may be ignored" << std::endl; } } } + if (debug) { + cerr << "Import pgid " << ms.info.pgid << std::endl; + cerr << "Clearing past_intervals " << ms.past_intervals << std::endl; + cerr << "Zero same_interval_since " << ms.info.history.same_interval_since << std::endl; + } + + // Let osd recompute past_intervals and same_interval_since ms.past_intervals.clear(); - ms.info.history.same_interval_since = ms.map_epoch = sb.current_epoch; + ms.info.history.same_interval_since = 0; + + if (debug) + cerr << "Changing pg epoch " << ms.map_epoch << " to " << sb.current_epoch << std::endl; + + ms.map_epoch = sb.current_epoch; return 0; } -int do_import_rados(string pool) +int do_import_rados(string pool, bool no_overwrite) { bufferlist ebl; pg_info_t info; @@ -1607,12 +1913,12 @@ int do_import_rados(string pool) if (sh.magic != super_header::super_magic) { cerr << "Invalid magic number" << std::endl; - return EFAULT; + 
return -EFAULT; } if (sh.version > super_header::super_ver) { cerr << "Can't handle export format version=" << sh.version << std::endl; - return EINVAL; + return -EINVAL; } //First section must be TYPE_PG_BEGIN @@ -1621,7 +1927,8 @@ int do_import_rados(string pool) if (ret) return ret; if (type != TYPE_PG_BEGIN) { - return EFAULT; + cerr << "Invalid first section type " << type << std::endl; + return -EFAULT; } bufferlist::iterator ebliter = ebl.begin(); @@ -1631,7 +1938,7 @@ int do_import_rados(string pool) if (!pgid.is_no_shard()) { cerr << "Importing Erasure Coded shard is not supported" << std::endl; - exit(1); + myexit(1); } if (debug) { @@ -1643,7 +1950,7 @@ int do_import_rados(string pool) if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) { cerr << "Export has incompatible features set " << pgb.superblock.compat_features << std::endl; - return 1; + return -EINVAL; } #endif @@ -1692,7 +1999,7 @@ int do_import_rados(string pool) } switch(type) { case TYPE_OBJECT_BEGIN: - ret = get_object_rados(ioctx, ebl); + ret = get_object_rados(ioctx, ebl, no_overwrite); if (ret) return ret; break; case TYPE_PG_METADATA: @@ -1704,7 +2011,7 @@ int do_import_rados(string pool) done = true; break; default: - return EFAULT; + return -EFAULT; } } @@ -1715,13 +2022,53 @@ int do_import_rados(string pool) return 0; } -int do_import(ObjectStore *store, OSDSuperblock& sb) + +typedef map divergent_priors_t; + +// out: pg_log_t that only has entries that apply to import_pgid using curmap +// reject: Entries rejected from "in" are in the reject.log. Other fields not set. 
+void filter_divergent_priors(spg_t import_pgid, const OSDMap &curmap, + const string &hit_set_namespace, const divergent_priors_t &in, + divergent_priors_t &out, divergent_priors_t &reject) +{ + out.clear(); + reject.clear(); + + for (divergent_priors_t::const_iterator i = in.begin(); + i != in.end(); ++i) { + + // Reject divergent priors for temporary objects + if (i->second.is_temp()) { + reject.insert(*i); + continue; + } + + if (i->second.nspace != hit_set_namespace) { + object_t oid = i->second.oid; + object_locator_t loc(i->second); + pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc); + pg_t pgid = curmap.raw_pg_to_pg(raw_pgid); + + if (import_pgid.pgid == pgid) { + out.insert(*i); + } else { + reject.insert(*i); + } + } else { + out.insert(*i); + } + } +} + +int do_import(ObjectStore *store, OSDSuperblock& sb, bool force, string pgidstr) { bufferlist ebl; pg_info_t info; PGLog::IndexedLog log; + bool skipped_objects = false; - finish_remove_pgs(store); + if (!dry_run) + finish_remove_pgs(store); int ret = sh.read_super(); if (ret) @@ -1729,12 +2076,12 @@ int do_import(ObjectStore *store, OSDSuperblock& sb) if (sh.magic != super_header::super_magic) { cerr << "Invalid magic number" << std::endl; - return EFAULT; + return -EFAULT; } if (sh.version > super_header::super_ver) { cerr << "Can't handle export format version=" << sh.version << std::endl; - return EINVAL; + return -EINVAL; } //First section must be TYPE_PG_BEGIN @@ -1743,19 +2090,48 @@ int do_import(ObjectStore *store, OSDSuperblock& sb) if (ret) return ret; if (type != TYPE_PG_BEGIN) { - return EFAULT; + cerr << "Invalid first section type " << type << std::endl; + return -EFAULT; } bufferlist::iterator ebliter = ebl.begin(); pg_begin pgb; pgb.decode(ebliter); spg_t pgid = pgb.pgid; + spg_t orig_pgid = pgid; + + if (pgidstr.length()) { + spg_t user_pgid; + + bool ok = user_pgid.parse(pgidstr.c_str()); + // This succeeded in main() already + assert(ok); + if (pgid != user_pgid) { + if 
(pgid.pool() != user_pgid.pool()) { + cerr << "Can't specify a different pgid pool, must be " << pgid.pool() << std::endl; + return -EINVAL; + } + if (pgid.is_no_shard() && !user_pgid.is_no_shard()) { + cerr << "Can't specify a sharded pgid with a non-sharded export" << std::endl; + return -EINVAL; + } + // Get shard from export information if not specified + if (!pgid.is_no_shard() && user_pgid.is_no_shard()) { + user_pgid.shard = pgid.shard; + } + if (pgid.shard != user_pgid.shard) { + cerr << "Can't specify a different shard, must be " << pgid.shard << std::endl; + return -EINVAL; + } + pgid = user_pgid; + } + } if (!pgb.superblock.cluster_fsid.is_zero() && pgb.superblock.cluster_fsid != sb.cluster_fsid) { cerr << "Export came from different cluster with fsid " << pgb.superblock.cluster_fsid << std::endl; - return 1; + return -EINVAL; } if (debug) { @@ -1777,8 +2153,11 @@ int do_import(ObjectStore *store, OSDSuperblock& sb) cerr << "OSD requires sharding to be enabled" << std::endl; cerr << std::endl; cerr << "If you wish to import, first do 'ceph-objectstore-tool...--op set-allow-sharded-objects'" << std::endl; + return -EINVAL; } - return 11; // Assume no +EAGAIN gets to end of main() until we clean up error code handling + // Let them import if they specify the --force option + if (!force) + return 11; // Positive return means exit status } // Don't import if pool no longer exists @@ -1792,7 +2171,7 @@ int do_import(ObjectStore *store, OSDSuperblock& sb) if (!curmap.have_pg_pool(pgid.pgid.m_pool)) { cerr << "Pool " << pgid.pgid.m_pool << " no longer exists" << std::endl; // Special exit code for this error, used by test code - return 10; // Assume no +ECHILD gets to end of main() until we clean up error code handling + return 10; // Positive return means exit status } ghobject_t pgmeta_oid = pgid.make_pgmeta_oid(); @@ -1803,22 +2182,28 @@ int do_import(ObjectStore *store, OSDSuperblock& sb) coll_t coll(pgid); if (store->collection_exists(coll)) { cerr << 
"pgid " << pgid << " already exists" << std::endl; - return 1; + return -EEXIST; } - ObjectStore::Transaction *t = new ObjectStore::Transaction; - PG::_create(*t, pgid); - PG::_init(*t, pgid, NULL); + if (!dry_run) { + ObjectStore::Transaction *t = new ObjectStore::Transaction; + PG::_create(*t, pgid); + PG::_init(*t, pgid, NULL); - // mark this coll for removal until we're done - map values; - ::encode((char)1, values["_remove"]); - t->omap_setkeys(coll, pgid.make_pgmeta_oid(), values); + // mark this coll for removal until we're done + map values; + ::encode((char)1, values["_remove"]); + t->omap_setkeys(coll, pgid.make_pgmeta_oid(), values); - store->apply_transaction(*t); - delete t; + store->apply_transaction(*t); + delete t; + } - cout << "Importing pgid " << pgid << std::endl; + cout << "Importing pgid " << pgid; + if (orig_pgid != pgid) { + cout << " exported as " << orig_pgid; + } + cout << std::endl; bool done = false; bool found_metadata = false; @@ -1835,11 +2220,11 @@ int do_import(ObjectStore *store, OSDSuperblock& sb) } switch(type) { case TYPE_OBJECT_BEGIN: - ret = get_object(store, coll, ebl, curmap); + ret = get_object(store, coll, ebl, curmap, &skipped_objects); if (ret) return ret; break; case TYPE_PG_METADATA: - ret = get_pg_metadata(store, ebl, ms, sb, curmap); + ret = get_pg_metadata(store, ebl, ms, sb, curmap, pgid); if (ret) return ret; found_metadata = true; break; @@ -1847,51 +2232,79 @@ int do_import(ObjectStore *store, OSDSuperblock& sb) done = true; break; default: - return EFAULT; + cerr << "Unknown section type " << type << std::endl; + return -EFAULT; } } if (!found_metadata) { cerr << "Missing metadata section" << std::endl; - return EFAULT; - } + return -EFAULT; + } + + ObjectStore::Transaction t; + if (!dry_run) { + pg_log_t newlog, reject; + pg_log_t::filter_log(pgid, curmap, g_ceph_context->_conf->osd_hit_set_namespace, + ms.log, newlog, reject); + if (debug) { + for (list::iterator i = newlog.log.begin(); + i != 
newlog.log.end(); ++i) + cerr << "Keeping log entry " << *i << std::endl; + for (list::iterator i = reject.log.begin(); + i != reject.log.end(); ++i) + cerr << "Skipping log entry " << *i << std::endl; + } - pg_log_t newlog, reject; - pg_log_t::filter_log(pgid, curmap, g_ceph_context->_conf->osd_hit_set_namespace, - ms.log, newlog, reject); - if (debug) { - for (list::iterator i = newlog.log.begin(); - i != newlog.log.end(); ++i) - cerr << "Keeping log entry " << *i << std::endl; - for (list::iterator i = reject.log.begin(); - i != reject.log.end(); ++i) - cerr << "Skipping log entry " << *i << std::endl; - } + divergent_priors_t newdp, rejectdp; + filter_divergent_priors(pgid, curmap, g_ceph_context->_conf->osd_hit_set_namespace, + ms.divergent_priors, newdp, rejectdp); + ms.divergent_priors = newdp; + if (debug) { + for (divergent_priors_t::iterator i = newdp.begin(); + i != newdp.end(); ++i) + cerr << "Keeping divergent_prior " << *i << std::endl; + for (divergent_priors_t::iterator i = rejectdp.begin(); + i != rejectdp.end(); ++i) + cerr << "Skipping divergent_prior " << *i << std::endl; + } - t = new ObjectStore::Transaction; - ret = write_pg(*t, ms.map_epoch, ms.info, newlog, ms.past_intervals); - if (ret) return ret; + if (debug) { + pg_missing_t missing; + Formatter *formatter = Formatter::create("json-pretty"); + dump_log(formatter, cerr, newlog, missing, ms.divergent_priors); + delete formatter; + } + + // Just like a split invalidate stats since the object count is changed + if (skipped_objects) + ms.info.stats.stats_invalid = true; + + ret = write_pg(t, ms.map_epoch, ms.info, newlog, ms.past_intervals, ms.divergent_priors); + if (ret) return ret; + } // done, clear removal flag if (debug) cerr << "done, clearing removal flag" << std::endl; - set remove; - remove.insert("_remove"); - t->omap_rmkeys(coll, pgid.make_pgmeta_oid(), remove); - store->apply_transaction(*t); - delete t; + + if (!dry_run) { + set remove; + remove.insert("_remove"); + 
t.omap_rmkeys(coll, pgid.make_pgmeta_oid(), remove); + store->apply_transaction(t); + } return 0; } -int do_list(ObjectStore *store, string pgidstr, string object, Formatter *formatter, bool debug, bool human_readable) +int do_list(ObjectStore *store, string pgidstr, string object, + Formatter *formatter, bool debug, bool human_readable, bool head) { int r; - lookup_ghobject lookup(object); + lookup_ghobject lookup(object, head); if (pgidstr.length() > 0) { - spg_t pgid; - pgid.parse(pgidstr.c_str()); - r = action_on_all_objects_in_pg(store, coll_t(pgid), lookup, debug); + r = action_on_all_objects_in_pg(store, pgidstr, lookup, debug); } else { r = action_on_all_objects(store, lookup, debug); } @@ -1899,7 +2312,18 @@ int do_list(ObjectStore *store, string pgidstr, string object, Formatter *format return r; lookup.dump(formatter, human_readable); formatter->flush(cout); - cout << std::endl; + return 0; +} + +int do_meta(ObjectStore *store, string object, Formatter *formatter, bool debug, bool human_readable) +{ + int r; + lookup_ghobject lookup(object); + r = action_on_all_objects_in_exact_pg(store, META_COLL, lookup, debug); + if (r) + return r; + lookup.dump(formatter, human_readable); + formatter->flush(cout); return 0; } @@ -1916,16 +2340,18 @@ int do_remove_object(ObjectStore *store, coll_t coll, ghobject_t &ghobj) int r = store->stat(coll, ghobj, &st); if (r < 0) { - cerr << "remove: " << cpp_strerror(-r) << std::endl; + cerr << "remove: " << cpp_strerror(r) << std::endl; return r; } + cout << "remove " << ghobj << std::endl; + if (dry_run) + return 0; ObjectStore::Transaction *t = new ObjectStore::Transaction; OSDriver::OSTransaction _t(driver.get_transaction(t)); - cout << "remove " << ghobj << std::endl; r = mapper.remove_oid(ghobj.hobj, &_t); - if (r != 0 && r != -ENOENT) { - cerr << "remove_oid returned " << cpp_strerror(-r) << std::endl; + if (r < 0 && r != -ENOENT) { + cerr << "remove_oid returned " << cpp_strerror(r) << std::endl; return r; } @@ 
-1941,7 +2367,7 @@ int do_list_attrs(ObjectStore *store, coll_t coll, ghobject_t &ghobj) map aset; int r = store->getattrs(coll, ghobj, aset); if (r < 0) { - cerr << "getattrs: " << cpp_strerror(-r) << std::endl; + cerr << "getattrs: " << cpp_strerror(r) << std::endl; return r; } @@ -1983,8 +2409,8 @@ int do_get_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) int ret = store->stat(coll, ghobj, &st); if (ret < 0) { - cerr << "get-bytes: " << cpp_strerror(-ret) << std::endl; - return 1; + cerr << "get-bytes: " << cpp_strerror(ret) << std::endl; + return ret; } total = st.st_size; @@ -2014,7 +2440,7 @@ int do_get_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) ret = write(fd, rawdatabl.c_str(), ret); if (ret == -1) { perror("write"); - return 1; + return -errno; } } @@ -2029,8 +2455,10 @@ int do_set_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) if (debug) cerr << "Write " << ghobj << std::endl; - t->touch(coll, ghobj); - t->truncate(coll, ghobj, 0); + if (!dry_run) { + t->touch(coll, ghobj); + t->truncate(coll, ghobj, 0); + } uint64_t offset = 0; bufferlist rawdatabl; @@ -2038,8 +2466,8 @@ int do_set_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) rawdatabl.clear(); ssize_t bytes = rawdatabl.read_fd(fd, max_read); if (bytes < 0) { - cerr << "read_fd error " << cpp_strerror(-bytes) << std::endl; - return 1; + cerr << "read_fd error " << cpp_strerror(bytes) << std::endl; + return bytes; } if (bytes == 0) @@ -2047,13 +2475,15 @@ int do_set_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) if (debug) cerr << "\tdata: offset " << offset << " bytes " << bytes << std::endl; - t->write(coll, ghobj, offset, bytes, rawdatabl); + if (!dry_run) + t->write(coll, ghobj, offset, bytes, rawdatabl); offset += bytes; // XXX: Should we apply_transaction() every once in a while for very large files } while(true); - store->apply_transaction(*t); + if (!dry_run) + store->apply_transaction(*t); return 
0; } @@ -2063,7 +2493,7 @@ int do_get_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) int r = store->getattr(coll, ghobj, key.c_str(), bp); if (r < 0) { - cerr << "getattr: " << cpp_strerror(-r) << std::endl; + cerr << "getattr: " << cpp_strerror(r) << std::endl; return r; } @@ -2086,8 +2516,12 @@ int do_set_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key, if (debug) cerr << "Setattr " << ghobj << std::endl; - if (get_fd_data(fd, bl)) - return 1; + int ret = get_fd_data(fd, bl); + if (ret < 0) + return ret; + + if (dry_run) + return 0; t->touch(coll, ghobj); @@ -2105,6 +2539,9 @@ int do_rm_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) if (debug) cerr << "Rmattr " << ghobj << std::endl; + if (dry_run) + return 0; + t->rmattr(coll, ghobj, key); store->apply_transaction(*t); @@ -2120,7 +2557,7 @@ int do_get_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) int r = store->omap_get_values(coll, ghobj, keys, &out); if (r < 0) { - cerr << "omap_get_values: " << cpp_strerror(-r) << std::endl; + cerr << "omap_get_values: " << cpp_strerror(r) << std::endl; return r; } @@ -2152,11 +2589,15 @@ int do_set_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key, if (debug) cerr << "Set_omap " << ghobj << std::endl; - if (get_fd_data(fd, valbl)) - return 1; + int ret = get_fd_data(fd, valbl); + if (ret < 0) + return ret; attrset.insert(pair(key, valbl)); + if (dry_run) + return 0; + t->touch(coll, ghobj); t->omap_setkeys(coll, ghobj, attrset); @@ -2176,6 +2617,9 @@ int do_rm_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) if (debug) cerr << "Rm_omap " << ghobj << std::endl; + if (dry_run) + return 0; + t->omap_rmkeys(coll, ghobj, keys); store->apply_transaction(*t); @@ -2188,7 +2632,7 @@ int do_get_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj) int r = store->omap_get_header(coll, ghobj, &hdrbl, true); if (r < 0) { - cerr << "omap_get_header: " << 
cpp_strerror(-r) << std::endl; + cerr << "omap_get_header: " << cpp_strerror(r) << std::endl; return r; } @@ -2211,8 +2655,12 @@ int do_set_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) if (debug) cerr << "Omap_setheader " << ghobj << std::endl; - if (get_fd_data(fd, hdrbl)) - return 1; + int ret = get_fd_data(fd, hdrbl); + if (ret) + return ret; + + if (dry_run) + return 0; t->touch(coll, ghobj); @@ -2222,18 +2670,15 @@ int do_set_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) return 0; } -struct do_list_lost : public action_on_object_t { - virtual int call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) { - if (oi.is_lost()) - cout << coll << "/" << ghobj << " is lost" << std::endl; - return 0; - } -}; - struct do_fix_lost : public action_on_object_t { virtual int call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) { if (oi.is_lost()) { - cout << coll << "/" << ghobj << " is lost, fixing" << std::endl; + cout << coll << "/" << ghobj << " is lost"; + if (!dry_run) + cout << ", fixing"; + cout << std::endl; + if (dry_run) + return 0; oi.clear_flag(object_info_t::FLAG_LOST); bufferlist bl; ::encode(oi, bl); @@ -2251,6 +2696,334 @@ struct do_fix_lost : public action_on_object_t { } }; +int get_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, SnapSet &ss, bool silent = false) +{ + bufferlist attr; + int r = store->getattr(coll, ghobj, SS_ATTR, attr); + if (r < 0) { + if (!silent) + cerr << "Error getting snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + bufferlist::iterator bp = attr.begin(); + try { + ::decode(ss, bp); + } catch (...) 
{ + r = -EINVAL; + cerr << "Error decoding snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +int print_obj_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter) +{ + int r = 0; + formatter->open_object_section("obj"); + formatter->open_object_section("id"); + ghobj.dump(formatter); + formatter->close_section(); + + bufferlist attr; + int gr = store->getattr(coll, ghobj, OI_ATTR, attr); + if (gr < 0) { + r = gr; + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } else { + object_info_t oi; + bufferlist::iterator bp = attr.begin(); + try { + ::decode(oi, bp); + formatter->open_object_section("info"); + oi.dump(formatter); + formatter->close_section(); + } catch (...) { + r = -EINVAL; + cerr << "Error decoding attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } + } + struct stat st; + int sr = store->stat(coll, ghobj, &st, true); + if (sr < 0) { + r = sr; + cerr << "Error stat on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } else { + formatter->open_object_section("stat"); + formatter->dump_int("size", st.st_size); + formatter->dump_int("blksize", st.st_blksize); + formatter->dump_int("blocks", st.st_blocks); + formatter->dump_int("nlink", st.st_nlink); + formatter->close_section(); + } + + if (ghobj.hobj.has_snapset()) { + SnapSet ss; + int snr = get_snapset(store, coll, ghobj, ss); + if (snr < 0) { + r = snr; + } else { + formatter->open_object_section("SnapSet"); + ss.dump(formatter); + formatter->close_section(); + } + } + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + return r; +} + +int set_size(ObjectStore *store, coll_t coll, ghobject_t &ghobj, uint64_t setsize, Formatter* formatter) +{ + if (ghobj.hobj.is_snapdir()) { + cerr << "Can't set the size of a snapdir" << std::endl; + return -EINVAL; + } + bufferlist attr; + 
int r = store->getattr(coll, ghobj, OI_ATTR, attr); + if (r < 0) { + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + object_info_t oi; + bufferlist::iterator bp = attr.begin(); + try { + ::decode(oi, bp); + } catch (...) { + r = -EINVAL; + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + struct stat st; + r = store->stat(coll, ghobj, &st, true); + if (r < 0) { + cerr << "Error stat on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } + ghobject_t head(ghobj); + SnapSet ss; + bool found_head = true; + map::iterator csi; + bool is_snap = ghobj.hobj.is_snap(); + if (is_snap) { + head.hobj = head.hobj.get_head(); + r = get_snapset(store, coll, head, ss, true); + if (r < 0 && r != -ENOENT) { + // Requested get_snapset() silent, so if not -ENOENT show error + cerr << "Error getting snapset on : " << make_pair(coll, head) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + if (r == -ENOENT) { + head.hobj = head.hobj.get_snapdir(); + r = get_snapset(store, coll, head, ss); + if (r < 0) + return r; + found_head = false; + } else { + found_head = true; + } + csi = ss.clone_size.find(ghobj.hobj.snap); + if (csi == ss.clone_size.end()) { + cerr << "SnapSet is missing clone_size for snap " << ghobj.hobj.snap << std::endl; + return -EINVAL; + } + } + if ((uint64_t)st.st_size == setsize && oi.size == setsize + && (!is_snap || csi->second == setsize)) { + cout << "Size of object is already " << setsize << std::endl; + return 0; + } + cout << "Setting size to " << setsize << ", stat size " << st.st_size + << ", obj info size " << oi.size; + if (is_snap) { + cout << ", " << (found_head ? 
"head" : "snapdir") + << " clone_size " << csi->second; + csi->second = setsize; + } + cout << std::endl; + if (!dry_run) { + attr.clear(); + oi.size = setsize; + ::encode(oi, attr); + ObjectStore::Transaction t; + t.setattr(coll, ghobj, OI_ATTR, attr); + t.truncate(coll, ghobj, setsize); + if (is_snap) { + bufferlist snapattr; + snapattr.clear(); + ::encode(ss, snapattr); + t.setattr(coll, head, SS_ATTR, snapattr); + } + r = store->apply_transaction(t); + if (r < 0) { + cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + } + return 0; +} + +int clear_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, + string arg) +{ + SnapSet ss; + int ret = get_snapset(store, coll, ghobj, ss); + if (ret < 0) + return ret; + + // Use "head" to set head_exists incorrectly + if (arg == "corrupt" || arg == "head") + ss.head_exists = !ghobj.hobj.is_head(); + else if (ss.head_exists != ghobj.hobj.is_head()) { + cerr << "Correcting head_exists, set to " + << (ghobj.hobj.is_head() ? 
"true" : "false") << std::endl; + ss.head_exists = ghobj.hobj.is_head(); + } + // Use "corrupt" to clear entire SnapSet + // Use "seq" to just corrupt SnapSet.seq + if (arg == "corrupt" || arg == "seq") + ss.seq = 0; + // Use "snaps" to just clear SnapSet.snaps + if (arg == "corrupt" || arg == "snaps") + ss.snaps.clear(); + // By default just clear clone, clone_overlap and clone_size + if (arg == "corrupt") + arg = ""; + if (arg == "" || arg == "clones") + ss.clones.clear(); + if (arg == "" || arg == "clone_overlap") + ss.clone_overlap.clear(); + if (arg == "" || arg == "clone_size") + ss.clone_size.clear(); + // Break all clone sizes by adding 1 + if (arg == "size") { + for (map::iterator i = ss.clone_size.begin(); + i != ss.clone_size.end(); ++i) + ++(i->second); + } + + if (!dry_run) { + bufferlist bl; + ::encode(ss, bl); + ObjectStore::Transaction t; + t.setattr(coll, ghobj, SS_ATTR, bl); + int r = store->apply_transaction(t); + if (r < 0) { + cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + } + return 0; +} + +vector::iterator find(vector &v, snapid_t clid) +{ + return std::find(v.begin(), v.end(), clid); +} + +map >::iterator +find(map > &m, snapid_t clid) +{ + return m.find(clid); +} + +map::iterator find(map &m, + snapid_t clid) +{ + return m.find(clid); +} + +template +int remove_from(T &mv, string name, snapid_t cloneid, bool force) +{ + typename T::iterator i = find(mv, cloneid); + if (i != mv.end()) { + mv.erase(i); + } else { + cerr << "Clone " << cloneid << " doesn't exist in " << name; + if (force) { + cerr << " (ignored)" << std::endl; + return 0; + } + cerr << std::endl; + return -EINVAL; + } + return 0; +} + +int remove_clone(ObjectStore *store, coll_t coll, ghobject_t &ghobj, snapid_t cloneid, bool force) +{ + // XXX: Don't allow this if in a cache tier or former cache tier + // bool allow_incomplete_clones() const { + // return cache_mode != CACHEMODE_NONE || 
has_flag(FLAG_INCOMPLETE_CLONES); + + SnapSet snapset; + int ret = get_snapset(store, coll, ghobj, snapset); + if (ret < 0) + return ret; + + // Derived from trim_object() + // ...from snapset + vector::iterator p; + for (p = snapset.clones.begin(); p != snapset.clones.end(); ++p) + if (*p == cloneid) + break; + if (p == snapset.clones.end()) { + cerr << "Clone " << cloneid << " not present"; + return -ENOENT; + } + if (p != snapset.clones.begin()) { + // not the oldest... merge overlap into next older clone + vector::iterator n = p - 1; + hobject_t prev_coid = ghobj.hobj; + prev_coid.snap = *n; + //bool adjust_prev_bytes = is_present_clone(prev_coid); + + //if (adjust_prev_bytes) + // ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n); + + snapset.clone_overlap[*n].intersection_of( + snapset.clone_overlap[*p]); + + //if (adjust_prev_bytes) + // ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n); + } + + ret = remove_from(snapset.clones, "clones", cloneid, force); + if (ret) return ret; + ret = remove_from(snapset.clone_overlap, "clone_overlap", cloneid, force); + if (ret) return ret; + ret = remove_from(snapset.clone_size, "clone_size", cloneid, force); + if (ret) return ret; + + if (dry_run) + return 0; + + bufferlist bl; + ::encode(snapset, bl); + ObjectStore::Transaction t; + t.setattr(coll, ghobj, SS_ATTR, bl); + int r = store->apply_transaction(t); + if (r < 0) { + cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + cout << "Removal of clone " << cloneid << " complete" << std::endl; + cout << "Use pg repair after OSD restarted to correct stat information" << std::endl; + return 0; +} + void usage(po::options_description &desc) { cerr << std::endl; @@ -2266,6 +3039,9 @@ void usage(po::options_description &desc) cerr << "ceph-objectstore-tool ... list-attrs" << std::endl; cerr << "ceph-objectstore-tool ... list-omap" << std::endl; cerr << "ceph-objectstore-tool ... 
remove" << std::endl; + cerr << "ceph-objectstore-tool ... dump" << std::endl; + cerr << "ceph-objectstore-tool ... set-size" << std::endl; + cerr << "ceph-objectstore-tool ... remove-clone-metadata " << std::endl; cerr << std::endl; cerr << "ceph-objectstore-tool import-rados [file]" << std::endl; cerr << std::endl; @@ -2273,10 +3049,11 @@ void usage(po::options_description &desc) cerr << "by --op list." << std::endl; cerr << " can be an object name which will be looked up in all" << std::endl; cerr << "the OSD's PGs." << std::endl; + cerr << " can be the empty string ('') which with a provided pgid " << std::endl; + cerr << "specifies the pgmeta object" << std::endl; cerr << std::endl; cerr << "The optional [file] argument will read stdin or write stdout" << std::endl; cerr << "if not specified or if '-' specified." << std::endl; - exit(1); } bool ends_with(const string& check, const string& ending) @@ -2284,13 +3061,30 @@ bool ends_with(const string& check, const string& ending) return check.size() >= ending.size() && check.rfind(ending) == (check.size() - ending.size()); } +// Based on FileStore::dump_journal(), set-up enough to only dump +int mydump_journal(Formatter *f, string journalpath, bool m_journal_dio) +{ + int r; + + if (!journalpath.length()) + return -EINVAL; + + FileJournal *journal = new FileJournal(uuid_d(), NULL, NULL, journalpath.c_str(), m_journal_dio); + r = journal->_fdump(*f, false); + delete journal; + return r; +} + int main(int argc, char **argv) { string dpath, jpath, pgidstr, op, file, object, objcmd, arg1, arg2, type, format; spg_t pgid; + unsigned epoch = 0; ghobject_t ghobj; - bool human_readable; + bool human_readable, no_overwrite; + bool force; Formatter *formatter; + bool head; po::options_description desc("Allowed options"); desc.add_options() @@ -2302,21 +3096,28 @@ int main(int argc, char **argv) ("journal-path", po::value(&jpath), "path to journal, mandatory for filestore type") ("pgid", po::value(&pgidstr), - "PG id, 
mandatory except for import, list-lost, fix-lost, list-pgs, set-allow-sharded-objects") + "PG id, mandatory for info, log, remove, export, rm-past-intervals, mark-complete") ("op", po::value(&op), - "Arg is one of [info, log, remove, export, import, list, list-lost, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects]") + "Arg is one of [info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, " + "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete]") + ("epoch", po::value(&epoch), + "epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified") ("file", po::value(&file), - "path of file to export or import") + "path of file to export, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap") ("format", po::value(&format)->default_value("json-pretty"), "Output format which may be json, json-pretty, xml, xml-pretty") ("debug", "Enable diagnostic output to stderr") + ("force", "Ignore some types of errors and proceed with operation - USE WITH CAUTION: CORRUPTION POSSIBLE NOW OR IN THE FUTURE") ("skip-journal-replay", "Disable journal replay") ("skip-mount-omap", "Disable mounting of omap") + ("head", "Find head/snapdir when searching for objects by name") + ("dry-run", "Don't modify the objectstore") + ("no-overwrite", "For import-rados don't overwrite existing files") ; po::options_description positional("Positional options"); positional.add_options() - ("object", po::value(&object), "object name or ghobject in json") + ("object", po::value(&object), "'' for pgmeta_oid, object name or ghobject in json") ("objcmd", po::value(&objcmd), "command [(get|set)-bytes, (get|set|rm)-(attr|omap), (get|set)-omaphdr, list-attrs, list-omap, remove]") ("arg1", po::value(&arg1), "arg1 based on cmd") ("arg2", po::value(&arg2), "arg2 based on cmd") @@ -2340,11 +3141,12 @@ int main(int argc, char **argv) po::include_positional); } 
catch(po::error &e) { std::cerr << e.what() << std::endl; - return 1; + myexit(1); } if (vm.count("help")) { usage(desc); + myexit(1); } if (!vm.count("debug")) { @@ -2353,6 +3155,25 @@ int main(int argc, char **argv) debug = true; } + if (!vm.count("force")) { + force = false; + } else { + force = true; + } + + no_overwrite = false; + if (vm.count("no-overwrite")) + no_overwrite = true; + if (vm.count("dry-run")) + dry_run = true; + osflagbits_t flags = 0; + if (dry_run || vm.count("skip-journal-replay")) + flags |= SKIP_JOURNAL_REPLAY; + if (vm.count("skip-mount-omap")) + flags |= SKIP_MOUNT_OMAP; + + head = (vm.count("head") > 0); + vector ceph_options; env_to_vec(ceph_options); ceph_options.reserve(ceph_options.size() + ceph_option_strings.size()); @@ -2366,7 +3187,7 @@ int main(int argc, char **argv) if (object == "import-rados") { if (vm.count("objcmd") == 0) { cerr << "ceph-objectstore-tool import-rados [file]" << std::endl; - exit(1); + myexit(1); } string pool = objcmd; @@ -2381,67 +3202,73 @@ int main(int argc, char **argv) if (arg1 == "-") { if (isatty(STDIN_FILENO)) { cerr << "stdin is a tty and no file specified" << std::endl; - exit(1); + myexit(1); } file_fd = STDIN_FILENO; } else { file_fd = open(arg1.c_str(), O_RDONLY); if (file_fd < 0) { perror("open"); - return 1; + myexit(1); } } global_init(NULL, ceph_options, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0); common_init_finish(g_ceph_context); - int ret = do_import_rados(pool); + int ret = do_import_rados(pool, no_overwrite); if (ret == 0) cout << "Import successful" << std::endl; - return ret != 0; + myexit(ret != 0); } - if (!vm.count("data-path")) { - cerr << "Must provide --data-path" << std::endl; - usage(desc); - } if (!vm.count("type")) { type = "filestore"; } - if (type == "filestore" && !vm.count("journal-path")) { - cerr << "Must provide --journal-path" << std::endl; + if (!vm.count("data-path") && + !(op == "dump-journal" && type == "filestore")) { + cerr << "Must provide 
--data-path" << std::endl; usage(desc); + myexit(1); } - if (op != "list" && vm.count("object") && !vm.count("objcmd")) { - cerr << "Invalid syntax, missing command" << std::endl; + if (type == "filestore" && !vm.count("journal-path")) { + cerr << "Must provide --journal-path" << std::endl; usage(desc); + myexit(1); } - if (!vm.count("op") && !(vm.count("object") && vm.count("objcmd"))) { + if (!vm.count("op") && !vm.count("object")) { cerr << "Must provide --op or object command..." << std::endl; usage(desc); + myexit(1); } if (op != "list" && vm.count("op") && vm.count("object")) { cerr << "Can't specify both --op and object command syntax" << std::endl; usage(desc); + myexit(1); + } + if (op != "list" && vm.count("object") && !vm.count("objcmd")) { + cerr << "Invalid syntax, missing command" << std::endl; + usage(desc); + myexit(1); } outistty = isatty(STDOUT_FILENO); file_fd = fd_none; - if (op == "export") { + if ((op == "export" || op == "get-osdmap" || op == "get-inc-osdmap") && !dry_run) { if (!vm.count("file") || file == "-") { if (outistty) { cerr << "stdout is a tty and no --file filename specified" << std::endl; - exit(1); + myexit(1); } file_fd = STDOUT_FILENO; } else { file_fd = open(file.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666); } - } else if (op == "import") { + } else if (op == "import" || op == "set-osdmap" || op == "set-inc-osdmap") { if (!vm.count("file") || file == "-") { if (isatty(STDIN_FILENO)) { cerr << "stdin is a tty and no --file filename specified" << std::endl; - exit(1); + myexit(1); } file_fd = STDIN_FILENO; } else { @@ -2449,27 +3276,18 @@ int main(int argc, char **argv) } } - if (vm.count("file") && file_fd == fd_none) { - cerr << "--file option only applies to import or export" << std::endl; - return 1; + if (vm.count("file") && file_fd == fd_none && !dry_run) { + cerr << "--file option only applies to import, export, " + << "get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap" << std::endl; + myexit(1); } if (file_fd != 
fd_none && file_fd < 0) { - perror("open"); - return 1; + string err = string("file: ") + file; + perror(err.c_str()); + myexit(1); } - if (dpath.length() == 0) { - cerr << "Invalid params" << std::endl; - return 1; - } - - osflagbits_t flags = 0; - if (vm.count("skip-journal-replay")) - flags |= SKIP_JOURNAL_REPLAY; - if (vm.count("skip-mount-omap")) - flags |= SKIP_MOUNT_OMAP; - global_init( NULL, ceph_options, CEPH_ENTITY_TYPE_OSD, CODE_ENVIRONMENT_UTILITY_NODOUT, 0); @@ -2482,11 +3300,38 @@ int main(int argc, char **argv) } g_conf->apply_changes(NULL); + // Special list handling. Treating pretty_format as human readable, + // with one object per line and not an enclosing array. + human_readable = ends_with(format, "-pretty"); + if ((op == "list" || op == "meta-list") && human_readable) { + // Remove -pretty from end of format which we know is there + format = format.substr(0, format.size() - strlen("-pretty")); + } + + formatter = Formatter::create(format); + if (formatter == NULL) { + cerr << "unrecognized format: " << format << std::endl; + myexit(1); + } + + // Special handling for filestore journal, so we can dump it without mounting + if (op == "dump-journal" && type == "filestore") { + int ret = mydump_journal(formatter, jpath, g_conf->journal_dio); + if (ret < 0) { + cerr << "journal-path: " << jpath << ": " + << cpp_strerror(ret) << std::endl; + myexit(1); + } + formatter->flush(cout); + myexit(0); + } + //Verify that data-path really exists struct stat st; if (::stat(dpath.c_str(), &st) == -1) { - perror("data-path"); - exit(1); + string err = string("data-path: ") + dpath; + perror(err.c_str()); + myexit(1); } //Verify data data-path really is a filestore if (type == "filestore") { @@ -2511,30 +3356,34 @@ int main(int argc, char **argv) } } - if (op == "import" && pgidstr.length()) { - cerr << "--pgid option invalid with import" << std::endl; - return 1; + if (pgidstr.length() && !pgid.parse(pgidstr.c_str())) { + cerr << "Invalid pgid '" << pgidstr << 
"' specified" << std::endl; + myexit(1); } ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags); if (fs == NULL) { cerr << "Must provide --type (filestore, memstore, keyvaluestore)" << std::endl; - exit(1); + if (type == "keyvaluestore") { + cerr << "Add \"keyvaluestore\" to " + << "enable_experimental_unrecoverable_data_corrupting_features" + << std::endl; + } + myexit(1); } - int r = fs->mount(); - if (r < 0) { - if (r == -EBUSY) { + int ret = fs->mount(); + if (ret < 0) { + if (ret == -EBUSY) { cerr << "OSD has the store locked" << std::endl; } else { - cerr << "Mount failed with '" << cpp_strerror(-r) << "'" << std::endl; + cerr << "Mount failed with '" << cpp_strerror(ret) << "'" << std::endl; } - return 1; + myexit(1); } bool fs_sharded_objects = fs->get_allow_sharded_objects(); - int ret = 0; vector ls; vector::iterator it; CompatSet supported; @@ -2548,9 +3397,9 @@ int main(int argc, char **argv) bufferlist bl; OSDSuperblock superblock; bufferlist::iterator p; - r = fs->read(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, 0, bl); - if (r < 0) { - cerr << "Failure to read OSD superblock error= " << r << std::endl; + ret = fs->read(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, 0, bl); + if (ret < 0) { + cerr << "Failure to read OSD superblock: " << cpp_strerror(ret) << std::endl; goto out; } @@ -2576,30 +3425,33 @@ int main(int argc, char **argv) CompatSet unsupported = supported.unsupported(superblock.compat_features); cerr << "On-disk OSD incompatible features set " << unsupported << std::endl; - ret = EINVAL; + ret = -EINVAL; goto out; } - if (pgidstr.length() && !pgid.parse(pgidstr.c_str())) { - cerr << "Invalid pgid '" << pgidstr << "' specified" << std::endl; - return 1; - } - if (op != "list" && vm.count("object")) { + // Special case: Create pgmeta_oid if empty string specified + // This can't conflict with any actual object names. 
+ if (object == "") { + ghobj = pgid.make_pgmeta_oid(); + } else { json_spirit::Value v; try { if (!json_spirit::read(object, v)) { - lookup_ghobject lookup(object); + // Special: Need head/snapdir so set even if user didn't specify + if (vm.count("objcmd") && (objcmd == "remove-clone-metadata")) + head = true; + lookup_ghobject lookup(object, head); if (action_on_all_objects(fs, lookup, debug)) { throw std::runtime_error("Internal error"); } else { if (lookup.size() != 1) { stringstream ss; if (lookup.size() == 0) - ss << objcmd << ": " << cpp_strerror(ENOENT); + ss << "No object id '" << object << "' found or invalid JSON specified"; else - ss << "expected a single object named '" << object - << "' but got " << lookup.size() << " instead"; + ss << "Found " << lookup.size() << " objects with id '" << object + << "', please use a JSON spec from --op list instead"; throw std::runtime_error(ss.str()); } pair found = lookup.pop(); @@ -2610,44 +3462,52 @@ int main(int argc, char **argv) } else { stringstream ss; if (pgidstr.length() == 0 && v.type() != json_spirit::array_type) { - ss << "object '" << object - << "' must be a JSON array but is of type " - << v.type() << " instead"; + ss << "Without --pgid the object '" << object + << "' must be a JSON array"; throw std::runtime_error(ss.str()); } if (v.type() == json_spirit::array_type) { json_spirit::Array array = v.get_array(); + if (array.size() != 2) { + ss << "Object '" << object + << "' must be a JSON array with 2 elements"; + throw std::runtime_error(ss.str()); + } vector::iterator i = array.begin(); + //if (i == array.end() || i->type() != json_spirit::str_type) { if (i->type() != json_spirit::str_type) { - ss << "object '" << object - << "' must be a JSON array with the first element a string but " - << "found type " << v.type() << " instead"; + ss << "Object '" << object + << "' must be a JSON array with the first element a string"; throw std::runtime_error(ss.str()); } string object_pgidstr = i->get_str(); - 
spg_t object_pgid; - object_pgid.parse(object_pgidstr.c_str()); - if (pgidstr.length() > 0) { - if (object_pgid != pgid) { - ss << "object '" << object - << "' has a pgid different from the --pgid=" - << pgidstr << " option"; - throw std::runtime_error(ss.str()); + if (object_pgidstr != "meta") { + spg_t object_pgid; + object_pgid.parse(object_pgidstr.c_str()); + if (pgidstr.length() > 0) { + if (object_pgid != pgid) { + ss << "object '" << object + << "' has a pgid different from the --pgid=" + << pgidstr << " option"; + throw std::runtime_error(ss.str()); + } + } else { + pgidstr = object_pgidstr; + pgid = object_pgid; } - } else { - pgidstr = object_pgidstr; - pgid = object_pgid; - } + } else { + pgidstr = object_pgidstr; + } ++i; v = *i; } try { ghobj.decode(v); } catch (std::runtime_error& e) { - ss << "Decode object json error: " << e.what(); + ss << "Decode object JSON error: " << e.what(); throw std::runtime_error(ss.str()); } - if ((uint64_t)pgid.pgid.m_pool != (uint64_t)ghobj.hobj.pool) { + if (pgidstr != "meta" && (uint64_t)pgid.pgid.m_pool != (uint64_t)ghobj.hobj.pool) { cerr << "Object pool and pgid pool don't match" << std::endl; ret = 1; goto out; @@ -2658,13 +3518,18 @@ int main(int argc, char **argv) ret = 1; goto out; } + } } - if (op != "list" && op != "import" && op != "list-lost" && op != "fix-lost" - && op != "list-pgs" && op != "set-allow-sharded-objects" && - (pgidstr.length() == 0)) { + // The ops which require --pgid option are checked here and + // mentioned in the usage for --pgid. 
+ if ((op == "info" || op == "log" || op == "remove" || op == "export" + || op == "rm-past-intervals" || op == "mark-complete") && + pgidstr.length() == 0) { cerr << "Must provide pgid" << std::endl; usage(desc); + ret = 1; + goto out; } if (op == "set-allow-sharded-objects") { @@ -2721,20 +3586,20 @@ int main(int argc, char **argv) goto out; } - superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); - ObjectStore::Transaction t; - bl.clear(); - ::encode(superblock, bl); - t.write(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, bl.length(), bl); - r = fs->apply_transaction(t); - if (r < 0) { - cerr << "Error writing OSD superblock: " << cpp_strerror(r) << std::endl; - ret = 1; - goto out; - } - - fs->set_allow_sharded_objects(); + if (!dry_run) { + superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); + ObjectStore::Transaction t; + bl.clear(); + ::encode(superblock, bl); + t.write(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, bl.length(), bl); + ret = fs->apply_transaction(t); + if (ret < 0) { + cerr << "Error writing OSD superblock: " << cpp_strerror(ret) << std::endl; + goto out; + } + fs->set_allow_sharded_objects(); + } cout << "Enabled on-disk sharded objects" << std::endl; ret = 0; @@ -2753,84 +3618,147 @@ int main(int argc, char **argv) cerr << "Found incomplete transition to sharded objects" << std::endl; cerr << std::endl; cerr << "Use --op set-allow-sharded-objects to repair" << std::endl; - ret = EINVAL; + ret = -EINVAL; goto out; } if (op == "import") { try { - ret = do_import(fs, superblock); + ret = do_import(fs, superblock, force, pgidstr); } catch (const buffer::error &e) { cerr << "do_import threw exception error " << e.what() << std::endl; - ret = EFAULT; + ret = -EFAULT; } - if (ret == EFAULT) { + if (ret == -EFAULT) { cerr << "Corrupt input for import" << std::endl; } if (ret == 0) cout << "Import successful" << std::endl; goto out; + } else if (op == "dump-journal-mount") { + // Undocumented feature to dump journal 
with mounted fs + // This doesn't support the format option, but it uses the + // ObjectStore::dump_journal() and mounts to get replay to run. + ret = fs->dump_journal(cout); + if (ret) { + if (ret == -EOPNOTSUPP) { + cerr << "Object store type \"" << type << "\" doesn't support journal dump" << std::endl; + } else { + cerr << "Journal dump failed with error " << cpp_strerror(ret) << std::endl; + } + } + goto out; + } else if (op == "get-osdmap") { + bufferlist bl; + OSDMap osdmap; + if (epoch == 0) { + epoch = superblock.current_epoch; + } + ret = get_osdmap(fs, epoch, osdmap, bl); + if (ret) { + cerr << "Failed to get osdmap#" << epoch << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + ret = bl.write_fd(file_fd); + if (ret) { + cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl; + } else { + cout << "osdmap#" << epoch << " exported." << std::endl; + } + goto out; + } else if (op == "set-osdmap") { + bufferlist bl; + ret = get_fd_data(file_fd, bl); + if (ret < 0) { + cerr << "Failed to read osdmap " << cpp_strerror(ret) << std::endl; + } else { + ret = set_osdmap(fs, epoch, bl, force); + } + goto out; + } else if (op == "get-inc-osdmap") { + bufferlist bl; + if (epoch == 0) { + epoch = superblock.current_epoch; + } + ret = get_inc_osdmap(fs, epoch, bl); + if (ret < 0) { + cerr << "Failed to get incremental osdmap# " << epoch << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + ret = bl.write_fd(file_fd); + if (ret) { + cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl; + } else { + cout << "inc-osdmap#" << epoch << " exported." 
<< std::endl; + } + goto out; + } else if (op == "set-inc-osdmap") { + bufferlist bl; + ret = get_fd_data(file_fd, bl); + if (ret < 0) { + cerr << "Failed to read incremental osdmap " << cpp_strerror(ret) << std::endl; + goto out; + } else { + ret = set_inc_osdmap(fs, epoch, bl, force); + } + goto out; } log_oid = OSD::make_pg_log_oid(pgid); biginfo_oid = OSD::make_pg_biginfo_oid(pgid); if (op == "remove") { - finish_remove_pgs(fs); - int r = initiate_new_remove_pg(fs, pgid); - if (r) { + ret = initiate_new_remove_pg(fs, pgid); + if (ret < 0) { cerr << "PG '" << pgid << "' not found" << std::endl; - ret = 1; goto out; } - finish_remove_pgs(fs); cout << "Remove successful" << std::endl; goto out; } - if (op == "list-lost" || op == "fix-lost") { + if (op == "fix-lost") { boost::scoped_ptr action; - if (op == "list-lost") - action.reset(new do_list_lost()); - if (op == "fix-lost") - action.reset(new do_fix_lost()); + action.reset(new do_fix_lost()); if (pgidstr.length()) - ret = action_on_all_objects_in_pg(fs, coll_t(pgid), *action, debug); + ret = action_on_all_objects_in_exact_pg(fs, coll_t(pgid), *action, debug); else ret = action_on_all_objects(fs, *action, debug); goto out; } - // Special list handling. Treating pretty_format as human readable, - // with one object per line and not an enclosing array. 
- human_readable = ends_with(format, "-pretty"); - if (op == "list" && human_readable) { - // Remove -pretty from end of format which we know is there - format = format.substr(0, format.size() - strlen("-pretty")); + if (op == "list") { + ret = do_list(fs, pgidstr, object, formatter, debug, human_readable, head); + if (ret < 0) { + cerr << "do_list failed: " << cpp_strerror(ret) << std::endl; + } + goto out; } - formatter = Formatter::create(format); - if (formatter == NULL) { - cerr << "unrecognized format: " << format << std::endl; - ret = 1; + if (op == "dump-super") { + formatter->open_object_section("superblock"); + superblock.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; goto out; } - if (op == "list") { - r = do_list(fs, pgidstr, object, formatter, debug, human_readable); - if (r) { - cerr << "do_list failed with " << r << std::endl; - ret = 1; + if (op == "meta-list") { + ret = do_meta(fs, object, formatter, debug, human_readable); + if (ret < 0) { + cerr << "do_meta failed: " << cpp_strerror(ret) << std::endl; } goto out; } - r = fs->list_collections(ls); - if (r < 0) { - cerr << "failed to list pgs: " << cpp_strerror(-r) << std::endl; - ret = 1; + ret = fs->list_collections(ls); + if (ret < 0) { + cerr << "failed to list pgs: " << cpp_strerror(ret) << std::endl; goto out; } @@ -2842,6 +3770,13 @@ int main(int argc, char **argv) snapid_t snap; spg_t tmppgid; + if (pgidstr == "meta") { + if (it->to_str() == "meta") + break; + else + continue; + } + if (!it->is_pg(tmppgid, snap)) { continue; } @@ -2853,8 +3788,9 @@ int main(int argc, char **argv) if (op != "list-pgs" && tmppgid != pgid) { continue; } - if (snap != CEPH_NOSNAP && debug) { - cout << "skipping snapped dir " << *it + if (snap != CEPH_NOSNAP) { + if (debug) + cerr << "skipping snapped dir " << *it << " (pg " << pgid << " snap " << snap << ")" << std::endl; continue; } @@ -2872,6 +3808,16 @@ int main(int argc, char **argv) goto out; } + // If not an 
object command nor any of the ops handled below, then output this usage + // before complaining about a bad pgid + if (!vm.count("objcmd") && op != "export" && op != "info" && op != "log" && op != "rm-past-intervals" && op != "mark-complete") { + cerr << "Must provide --op (info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, " + "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)" + << std::endl; + usage(desc); + ret = 1; + goto out; + } epoch_t map_epoch; // The following code for export, info, log require omap or !skip-mount-omap if (it != ls.end()) { @@ -2881,25 +3827,15 @@ int main(int argc, char **argv) if (vm.count("objcmd")) { ret = 0; if (objcmd == "remove") { - int r = do_remove_object(fs, coll, ghobj); - if (r) { - ret = 1; - } + ret = do_remove_object(fs, coll, ghobj); goto out; } else if (objcmd == "list-attrs") { - int r = do_list_attrs(fs, coll, ghobj); - if (r) { - ret = 1; - } + ret = do_list_attrs(fs, coll, ghobj); goto out; } else if (objcmd == "list-omap") { - int r = do_list_omap(fs, coll, ghobj); - if (r) { - ret = 1; - } + ret = do_list_omap(fs, coll, ghobj); goto out; } else if (objcmd == "get-bytes" || objcmd == "set-bytes") { - int r; if (objcmd == "get-bytes") { int fd; if (vm.count("arg1") == 0 || arg1 == "-") { @@ -2912,7 +3848,7 @@ int main(int argc, char **argv) goto out; } } - r = do_get_bytes(fs, coll, ghobj, fd); + ret = do_get_bytes(fs, coll, ghobj, fd); if (fd != STDOUT_FILENO) close(fd); } else { @@ -2933,23 +3869,24 @@ int main(int argc, char **argv) goto out; } } - r = do_set_bytes(fs, coll, ghobj, fd); + ret = do_set_bytes(fs, coll, ghobj, fd); if (fd != STDIN_FILENO) close(fd); } - if (r) - ret = 1; goto out; } else if (objcmd == "get-attr") { - if (vm.count("arg1") == 0) + if (vm.count("arg1") == 0) { usage(desc); - r = do_get_attr(fs, coll, ghobj, arg1); - if (r) - ret = 1; + ret = 1; + goto out; + } + ret = 
do_get_attr(fs, coll, ghobj, arg1); goto out; } else if (objcmd == "set-attr") { - if (vm.count("arg1") == 0) + if (vm.count("arg1") == 0) { usage(desc); + ret = 1; + } int fd; if (vm.count("arg2") == 0 || arg2 == "-") { @@ -2968,30 +3905,32 @@ int main(int argc, char **argv) goto out; } } - r = do_set_attr(fs, coll, ghobj, arg1, fd); + ret = do_set_attr(fs, coll, ghobj, arg1, fd); if (fd != STDIN_FILENO) close(fd); - if (r) - ret = 1; goto out; } else if (objcmd == "rm-attr") { - if (vm.count("arg1") == 0) + if (vm.count("arg1") == 0) { usage(desc); - r = do_rm_attr(fs, coll, ghobj, arg1); - if (r) - ret = 1; + ret = 1; + goto out; + } + ret = do_rm_attr(fs, coll, ghobj, arg1); goto out; } else if (objcmd == "get-omap") { - if (vm.count("arg1") == 0) + if (vm.count("arg1") == 0) { usage(desc); - r = do_get_omap(fs, coll, ghobj, arg1); - if (r) - ret = 1; + ret = 1; + goto out; + } + ret = do_get_omap(fs, coll, ghobj, arg1); goto out; } else if (objcmd == "set-omap") { - if (vm.count("arg1") == 0) + if (vm.count("arg1") == 0) { usage(desc); - + ret = 1; + goto out; + } int fd; if (vm.count("arg2") == 0 || arg2 == "-") { // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it. 
@@ -3009,30 +3948,33 @@ int main(int argc, char **argv) goto out; } } - r = do_set_omap(fs, coll, ghobj, arg1, fd); + ret = do_set_omap(fs, coll, ghobj, arg1, fd); if (fd != STDIN_FILENO) close(fd); - if (r) - ret = 1; goto out; } else if (objcmd == "rm-omap") { - if (vm.count("arg1") == 0) + if (vm.count("arg1") == 0) { usage(desc); - r = do_rm_omap(fs, coll, ghobj, arg1); - if (r) - ret = 1; + ret = 1; + goto out; + } + ret = do_rm_omap(fs, coll, ghobj, arg1); goto out; } else if (objcmd == "get-omaphdr") { - if (vm.count("arg1")) + if (vm.count("arg1")) { usage(desc); - r = do_get_omaphdr(fs, coll, ghobj); - if (r) - ret = 1; + ret = 1; + goto out; + } + ret = do_get_omaphdr(fs, coll, ghobj); goto out; } else if (objcmd == "set-omaphdr") { // Extra arg - if (vm.count("arg2")) + if (vm.count("arg2")) { usage(desc); + ret = 1; + goto out; + } int fd; if (vm.count("arg1") == 0 || arg1 == "-") { // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it. @@ -3050,21 +3992,75 @@ int main(int argc, char **argv) goto out; } } - r = do_set_omaphdr(fs, coll, ghobj, fd); + ret = do_set_omaphdr(fs, coll, ghobj, fd); if (fd != STDIN_FILENO) close(fd); - if (r) + goto out; + } else if (objcmd == "dump") { + // There should not be any other arguments + if (vm.count("arg1") || vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + ret = print_obj_info(fs, coll, ghobj, formatter); + goto out; + } else if (objcmd == "set-size") { + // Extra arg + if (vm.count("arg1") == 0 || vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) { + cerr << "Invalid size '" << arg1 << "' specified" << std::endl; + ret = 1; + goto out; + } + uint64_t size = atoll(arg1.c_str()); + ret = set_size(fs, coll, ghobj, size, formatter); + goto out; + } else if (objcmd == "clear-snapset") { + // UNDOCUMENTED: For testing zap SnapSet + // IGNORE extra args since not in usage anyway + if (!ghobj.hobj.has_snapset()) { + cerr << 
"'" << objcmd << "' requires a head or snapdir object" << std::endl; ret = 1; + goto out; + } + ret = clear_snapset(fs, coll, ghobj, arg1); goto out; + } else if (objcmd == "remove-clone-metadata") { + // Extra arg + if (vm.count("arg1") == 0 || vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + if (!ghobj.hobj.has_snapset()) { + cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl; + ret = 1; + goto out; + } + if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) { + cerr << "Invalid cloneid '" << arg1 << "' specified" << std::endl; + ret = 1; + goto out; + } + snapid_t cloneid = atoi(arg1.c_str()); + ret = remove_clone(fs, coll, ghobj, cloneid, force); + goto out; } cerr << "Unknown object command '" << objcmd << "'" << std::endl; usage(desc); + ret = 1; + goto out; } bufferlist bl; map_epoch = 0; - r = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl); - if (r < 0) + ret = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl); + if (ret < 0) cerr << "peek_map_epoch returns an error" << std::endl; if (debug) @@ -3073,16 +4069,15 @@ int main(int argc, char **argv) pg_info_t info(pgid); map past_intervals; __u8 struct_ver; - r = PG::read_info(fs, pgid, coll, bl, info, past_intervals, + ret = PG::read_info(fs, pgid, coll, bl, info, past_intervals, struct_ver); - if (r < 0) { - cerr << "read_info error " << cpp_strerror(-r) << std::endl; - ret = 1; + if (ret < 0) { + cerr << "read_info error " << cpp_strerror(ret) << std::endl; goto out; } if (struct_ver < PG::compat_struct_v) { cerr << "PG is too old to upgrade, use older Ceph version" << std::endl; - ret = 1; + ret = -EFAULT; goto out; } if (debug) @@ -3101,20 +4096,13 @@ int main(int argc, char **argv) } else if (op == "log") { PGLog::IndexedLog log; pg_missing_t missing; - ret = get_log(fs, struct_ver, coll, pgid, info, log, missing); - if (ret > 0) + map divergent_priors; + ret = get_log(fs, struct_ver, coll, pgid, info, log, missing, + divergent_priors); + if (ret < 0) goto out; - 
formatter->open_object_section("log"); - log.dump(formatter); - formatter->close_section(); - formatter->flush(cout); - cout << std::endl; - formatter->open_object_section("missing"); - missing.dump(formatter); - formatter->close_section(); - formatter->flush(cout); - cout << std::endl; + dump_log(formatter, cout, log, missing, divergent_priors); } else if (op == "rm-past-intervals") { ObjectStore::Transaction tran; ObjectStore::Transaction *t = &tran; @@ -3123,37 +4111,77 @@ int main(int argc, char **argv) cerr << "Can't remove past-intervals, version mismatch " << (int)struct_ver << " (pg) != " << (int)PG::cur_struct_v << " (tool)" << std::endl; - ret = 1; + ret = -EFAULT; goto out; } cout << "Remove past-intervals " << past_intervals << std::endl; past_intervals.clear(); + if (dry_run) { + ret = 0; + goto out; + } ret = write_info(*t, map_epoch, info, past_intervals); if (ret == 0) { fs->apply_transaction(*t); cout << "Removal succeeded" << std::endl; } + } else if (op == "mark-complete") { + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + + if (struct_ver != PG::cur_struct_v) { + cerr << "Can't mark-complete, version mismatch " << (int)struct_ver + << " (pg) != " << (int)PG::cur_struct_v << " (tool)" + << std::endl; + ret = 1; + goto out; + } + + cout << "Marking complete " << std::endl; + + info.last_update = eversion_t(superblock.current_epoch, info.last_update.version + 1); + info.last_backfill = hobject_t::get_max(); + info.last_epoch_started = superblock.current_epoch; + info.history.last_epoch_started = superblock.current_epoch; + info.history.last_epoch_clean = superblock.current_epoch; + past_intervals.clear(); + + if (!dry_run) { + ret = write_info(*t, map_epoch, info, past_intervals); + if (ret != 0) + goto out; + fs->apply_transaction(*t); + } + cout << "Marking complete succeeded" << std::endl; } else { - cerr << "Must provide --op (info, log, remove, export, import, list, list-lost, fix-lost, list-pgs, rm-past-intervals)" - 
<< std::endl; - usage(desc); + assert(!"Should have already checked for valid --op"); } } else { cerr << "PG '" << pgid << "' not found" << std::endl; - ret = 1; + ret = -ENOENT; } out: - if (fs->umount() < 0) { - cerr << "umount failed" << std::endl; - return 1; + int r = fs->umount(); + if (r < 0) { + cerr << "umount failed: " << cpp_strerror(r) << std::endl; + // If no previous error, then use umount() error + if (ret == 0) + ret = r; + } + + if (dry_run) { + // Export output can go to stdout, so put this message on stderr + if (op == "export") + cerr << "dry-run: Nothing changed" << std::endl; + else + cout << "dry-run: Nothing changed" << std::endl; } - // Check for -errno accidentally getting here if (ret < 0) ret = 1; - return ret; + myexit(ret); } diff --git a/ceph/src/tools/crushtool.cc b/ceph/src/tools/crushtool.cc index d576b5ef..a6107650 100644 --- a/ceph/src/tools/crushtool.cc +++ b/ceph/src/tools/crushtool.cc @@ -570,9 +570,7 @@ int main(int argc, const char **argv) } } if (tree) { - ostringstream oss; - crush.dump_tree(&oss, NULL); - dout(1) << "\n" << oss.str() << dendl; + crush.dump_tree(&cout, NULL); } if (compile) { diff --git a/ceph/src/tools/rados/rados.cc b/ceph/src/tools/rados/rados.cc index a48d7302..01152df7 100644 --- a/ceph/src/tools/rados/rados.cc +++ b/ceph/src/tools/rados/rados.cc @@ -107,7 +107,7 @@ void usage(ostream& out) " setomapheader \n" " tmap-to-omap convert tmap keys/values to omap\n" " watch add watcher on this object\n" -" notify notify wather of this object with message\n" +" notify notify watcher of this object with message\n" " listwatchers list the watchers of this object\n" " set-alloc-hint \n" " set allocation hint for an object\n" @@ -1821,11 +1821,11 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts, } if (values.size() && values.begin()->first == key) { - cout << " (length " << values.begin()->second.length() << ") : "; if (!outfile.empty()) { cerr << "Writing to " << outfile << 
std::endl; dump_data(outfile, values.begin()->second); } else { + cout << "value (" << values.begin()->second.length() << " bytes) :\n"; values.begin()->second.hexdump(cout); cout << std::endl; } @@ -1875,7 +1875,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts, // dump key in hex if it contains nonprintable characters if (std::count_if(it->first.begin(), it->first.end(), (int (*)(int))isprint) < (int)it->first.length()) { - cout << "key: (" << it->first.length() << " bytes):\n"; + cout << "key (" << it->first.length() << " bytes):\n"; bufferlist keybl; keybl.append(it->first); keybl.hexdump(cout); @@ -1883,7 +1883,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts, cout << it->first; } cout << std::endl; - cout << "value: (" << it->second.length() << " bytes) :\n"; + cout << "value (" << it->second.length() << " bytes) :\n"; it->second.hexdump(cout); cout << std::endl; } diff --git a/debian/changelog b/debian/changelog index 9a2e0989..c3f3afb6 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,143 +1,112 @@ -ceph (0.94.6-1~u14.04+mos1) mos9.0; urgency=low - - * New upstream bugfix only release - * Added patche which makes ceph-disk work with udev generated symlinks - * Added a patch to fix rados bench crash - * Adjust packaging: - - disable make check, it fails anyway since some necessary files are - missing in the upstream tarball - - skip build dependencies necessary for tests only (valgrind, virtualenv) - - don't install ceph-deploy manual page to avoid file conflicts with - ceph-deploy package - - -- Alexey Sheplyakov Wed, 24 Feb 2016 16:48:44 +0300 - -ceph (0.94.5-0u~u14.04+mos1) mos8.0; urgency=medium - - * Rebuild for Ubuntu 14.04 - - -- Alexey Sheplyakov Thu, 12 Nov 2015 12:49:24 +0300 - -ceph (0.94.5-0ubuntu1) xenial; urgency=medium - - * New upstream release (LP: #1512292): - - d/p/*: Refresh. - - d/p/ceph-radosgw-init.patch: Dropped, included upstream. - - d/*.symbols: Refresh. 
- * d/p/modules.patch: Add jerasure_neon and shec erasure coding plugins - to generate unversioned so's for plugin loading (LP: #1507244). - * d/rules: Ensure that any remaining versioned so's are dropped from - the packaging - this is all test code (LP: #1507244). - - -- James Page Mon, 02 Nov 2015 14:47:31 +0000 - -ceph (0.94.3-0ubuntu2) wily; urgency=medium - - * d/ceph.install: Drop ceph-deploy manpage from packaging, provided - by ceph-deploy itself (LP: #1475910). - - -- James Page Mon, 07 Sep 2015 14:42:03 +0100 - -ceph (0.94.3-0ubuntu1) wily; urgency=medium - - [ James Page ] - * New upstream point release (LP: #1492227): - - d/p/remove-unused-variable-ceph-bug-11576.patch: - Dropped, included upstream. - - d/p*: Refreshed. - - [ Liam Young ] - * d/p/ceph-radosgw-init.patch: Cherry pick patch from upstream VCS to - ensure that restarts of the radosgw wait an appropriate amount of time - for the existing daemon to shutdown (LP: #1477225). - - -- James Page Mon, 07 Sep 2015 12:23:50 +0100 - -ceph (0.94.2-0ubuntu3) wily; urgency=medium - - * Fix compile failure with boost 1.58 (LP: #1483403): - - src/mon/OSDMonitor.cc: remove unused variable (Ceph issue #11576) - - -- Tiago Stürmer Daitx Mon, 10 Aug 2015 18:36:48 -0300 - -ceph (0.94.2-0ubuntu2) wily; urgency=medium - - * No change rebuild for boost1.58/libstdc++6. - - -- Dimitri John Ledkov Sun, 02 Aug 2015 13:25:26 +0100 - -ceph (0.94.2-0ubuntu1) wily; urgency=medium - - * New upstream point release (LP: #1465553): - - d/p/*: Refreshed. - - -- James Page Tue, 16 Jun 2015 09:53:23 +0100 - -ceph (0.94.1-0ubuntu1) vivid; urgency=high - - * New upstream stable point release (LP: #1443821): - - Includes critical fix for communication from pre-0.94 clients - during cluster upgrades. - - -- James Page Tue, 14 Apr 2015 11:46:12 +0100 - -ceph (0.94-0ubuntu1) vivid; urgency=low - - * New upstream stable release 'Hammer' (LP: #1423601): - - d/p/*: Refresh. 
+ceph (0.94.9-1~u14.04+mos1) mos9.0; urgency=low + + * Package 0.94.9 upstream release, most notable bugfixes: + - monitor crashes on a command without a prefix, + http://tracker.ceph.com/issues/16297 + https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2016-5009 + - pool quota alarm is not in effect, + http://tracker.ceph.com/issues/15478 + - OSD reports ENOTEMPTY and crashes, + http://tracker.ceph.com/issues/14766 + - improved reweight-by-utilization to consider the least used OSDs first, + http://tracker.ceph.com/issues/15770 + - no Last-Modified, Content-Size and X-Object-Manifest headers if no + segments in DLO manifest, + http://tracker.ceph.com/issues/15812 + * Backported fix of "Data corruption using RBD with caching enabled", + http://tracker.ceph.com/issues/17545 + * Make a mixed 0.94.{6,9} cluster work properly. + BIG RED WARNING: + as a result mixing MOS OSDs with upstream ones version 0.94.7 and + newer will break your cluster. You have been warned. + * debian/patches: + - drop hammer-rbd-snap-rollback-restore-the-link-to-parent.patch, + included upstream + - drop patches/rgw-handle-errors-properly-during-GET-on-Swift-s-DLO.patch, + included upstream + - added Remove-HITSET_GMT-related-code-so-0.94.-6-9-OSDs-mon.patch so + a mixed cluster consisting of 0.94.6 and 0.94.9 OSDs/monitors can + can work properly. 
+ * Added missing build dependency on libboost-random-dev + + -- Alexey Sheplyakov Thu, 06 Oct 2016 19:50:27 +0300 + +ceph (0.94.7-1) experimental; urgency=low + + * Package 0.94.7 upstream release, most notable changes: + - librbd: possible QEMU deadlock after creating image snapshots, + http://tracker.ceph.com/issues/14988 + - librbd: flattening an rbd image with active IO can lead to hang, + http://tracker.ceph.com/issues/14092 + - osd: fixed corruption when min_read_recency_for_promote > 1, + http://tracker.ceph.com/issues/15171 + - mon: implement reweight-by-utilization feature, + https://github.com/ceph/ceph/pull/8026 + See http://ceph.com/releases/v0-94-7-hammer-released for more details + * debian/rules: set consistent hardening flags for compiler and linker + * debian/ceph.install: skip ceph-deploy.8 to avoid file conflict with + ceph-deploy + * debian/patches: + - drop ObjBencher-seq_read_bench-fix-locking-errors.patch, + included upstream + - keep ceph-disk-fix-symlinks-handling.patch, the problem won't be + fixed in upstream 0.94.x + - add rbd #14512 fix (data loss on clone, snapshot, rollback) + - added rgw #15812 fix (`No Last-Modified, Content-Size and + X-Object-Manifest headers if no segments in DLO manifest') + + -- Alexey Sheplyakov Wed, 18 May 2016 17:05:28 +0300 + +ceph (0.94.6-1) experimental; urgency=medium + + * New upstream bugfix release + * Refresh "rbd lazy umount before unmap" patch + * Added upstream patches which make ceph-disk work with udev + generated symlinks + * Added a patch to make rados bench usable + + -- Alexey Sheplyakov Mon, 29 Feb 2016 10:22:57 +0300 + +ceph (0.94.5-1) experimental; urgency=medium + + * [2d330d6] New upstream release: + - [1e93090] Drop patch for CVE-2015-5245, included upstream. + - [20adc7d] Refresh all other patches. + * [9255e5d] Ensure any erasure coding test libraries and dangling symlinks + are not included in the ceph package. 
+ + -- James Page Mon, 09 Nov 2015 12:09:51 +0000 + +ceph (0.94.3-1) experimental; urgency=medium + + * [580fef] Imported Upstream version 0.94.3 (Closes: #777814, #795178) + * [536935] Add upstream patch to fix CVE-2015-5245 (Closes: #798567) + + -- Gaudenz Steinlin Fri, 18 Sep 2015 16:55:23 +0200 + +ceph (0.94.2-2) experimental; urgency=medium + + * Revert "Drop virtualenv BD, disable unit tests." + * Restore patches for test enablement. + * Display test-suite log output in the event of failures. + + -- James Page Mon, 20 Jul 2015 13:37:06 +0100 + +ceph (0.94.2-1) experimental; urgency=medium + + * Resync with Ubuntu, introducing Ceph Hammer stable release: - d/*.symbols: Update inline with upstream additions, use regex for ceph version symbol. - - -- James Page Wed, 08 Apr 2015 18:57:08 +0100 - -ceph (0.93-0ubuntu6) vivid; urgency=medium - - * d/control,rules,*.symbols: Disable lttng support until we can make - it play a bit nicer with libvirt and apparmor, drop associated - symbols (LP: #1432644). - - -- James Page Wed, 01 Apr 2015 10:37:03 +0100 - -ceph (0.93-0ubuntu5) vivid; urgency=medium - - * d/lib-systemd/system/ceph-create-keys.service: Automatically create - admin and bootstrap keys after ceph mon startup (LP: #1435450). - * d/p/vivid-does-systemd.patch: Ensure that disks prepared on vivid - or later use systemd for init (LP: #1435464). - * d/lib-systemd/system/*.service: Align nofile limits and restart config - with equivalent upstart configurations. - - -- James Page Tue, 24 Mar 2015 12:30:14 +0000 - -ceph (0.93-0ubuntu4) vivid; urgency=medium - - * d/p/fix-cycles-arch.patch: Skip initialization of cycles_per_sec - if rtdsc (or equivalent) is not supported (LP: #1432786). - - -- James Page Wed, 18 Mar 2015 14:44:39 +0000 - -ceph (0.93-0ubuntu3) vivid; urgency=medium - - * d/ceph{-common}.install,control: Move ceph_argparse.py down into - ceph-common package to fixup ceph cli usage/autopkgtest failure. 
- - -- James Page Sat, 14 Mar 2015 21:27:26 +0000 - -ceph (0.93-0ubuntu2) vivid; urgency=medium - - * d/p/fix-cycles-arch.patch: Expand highres cycles support to cover - PPC architectures, warn and default to return 0 for archs without - support, fixing FTBFS. - - -- James Page Fri, 13 Mar 2015 19:40:03 +0000 - -ceph (0.93-0ubuntu1) vivid; urgency=medium - - * New upstream release candidate for Hammer stable release (LP: #1423601). - - d/*.symbols: Refresh inline with upstream, removing common code - symbols which don't form part of the public API. - - d/p/*: Refresh and drop patches as required. - * Resync with upstream packaging changes and enable new features: + - d/lib-systemd/system/ceph-create-keys.service: Automatically create + admin and bootstrap keys after ceph mon startup. + - d/p/vivid-does-systemd.patch: Ensure that disks prepared on vivid + or later use systemd for init. + - d/lib-systemd/system/*.service: Align nofile limits and restart config + with equivalent upstart configurations. + - d/p/fix-cycles-arch.patch: Skip initialization of cycles_per_sec + if rtdsc (or equivalent) is not supported. + - d/ceph{-common}.install,control: Move ceph_argparse.py down into + ceph-common package to fixup ceph cli usage/autopkgtest failure. - d/control,ceph-common.install,librbd1.install: Move rbdnamer and associated udev rules into ceph-common package. - d/control,python-*: Split out rbd, rados and cephfs bindings into @@ -145,42 +114,71 @@ ceph (0.93-0ubuntu1) vivid; urgency=medium - d/control: Move python-flask dependency to ceph package, only required for REST API. - d/control: Use google-perftools on arm64. - - d/rules,control: Enable use of lttng for userspace tracing. - - -- James Page Fri, 13 Mar 2015 07:42:45 +0000 - -ceph (0.87-0ubuntu5) vivid; urgency=medium - - * d/p/fix-argparse-defaults.patch: Workaround behavioural change in - argparse set_defaults in python 2.7.9 (LP: #1413321). 
- * d/rules: Disable build and support for RocksDB over concerns around - performance > 1TB in size. - - -- James Page Thu, 22 Jan 2015 09:54:19 +0000 - -ceph (0.87-0ubuntu4) vivid; urgency=medium - - * d/p/ceph-osd-prestart-path.patch: Fixup path for ceph-osd upstart - configuration pre-start script. - - -- James Page Tue, 13 Jan 2015 12:33:49 +0000 - -ceph (0.87-0ubuntu3) vivid; urgency=medium - - * d/control: Re-order Recommends to prefer ntp over chrony for Ubuntu. + - d/control: Re-order Recommends to prefer ntp over chrony for Ubuntu. + - d/p/ceph-osd-prestart-path.patch: Fixup path for ceph-osd upstart + configuration pre-start script. + - d/p/fix-argparse-defaults.patch: Workaround behavioural change in + argparse set_defaults in python 2.7.9 + * New upstream point release: + - d/p/*: Refresh. + * d/p/use_system_jerasure.patch,d/control: Drop use of libjerasure + as the patch is intrusive and expensive to maintain; will revisit if + adopted upstream. - -- James Page Tue, 16 Dec 2014 14:59:31 +0000 + -- James Page Tue, 16 Jun 2015 11:31:05 +0100 -ceph (0.87-0ubuntu2) vivid; urgency=medium +ceph (0.87-2) experimental; urgency=low - * d/rules: Limit rocksdb support to x86 + armhf, fixing FTBFS on - unsupported and broken architectures. + * Team upload. - -- James Page Mon, 08 Dec 2014 12:36:51 +0000 + [ Gaudenz Steinlin ] + * README.Debian: added clarification about setting the hashpspool flag. + (Closes: #769596). -ceph (0.87-0ubuntu1) vivid; urgency=medium + [ James Page ] + * Added new "modules.patch" to mark new erasure coding libraries as + modules, wildcard install. [ Dmitry Smirnov ] + * Recommends: added "ntp" to list of time-daemon alternatives + (Closes: #767511). + * Introduced native systemd services (except "rbdmap"), (Closes: #769593). + * ceph-test: install forgotten files. + * Run post-build tests: + + updated "virtualenv-never-download.patch" to pass + "--system-site-packages" to virtualenv to prevent downloads. 
+ + added new patches to disable network-dependent and failing tests. + * Patchworks: + - bug-9341.patch + + bug-10036.patch (to show OSD affinity in "ceph osd tree"). + Thanks, Mykola Golub. + + bug-10059.patch + + 0latest-giant.patch (Last-Update: 2014-11-15). + + sleep-recover.patch + + tests-disable.patch (to disable tests that need cluster). + + tests-disable-ceph-disk.patch + + use_system_gtest.patch (commented) + as first attempt to build with system "libgtest-dev". + + use_system_jerasure.patch + * Build-Depends: + + libjerasure-dev (>= 2.0.0-2~) + + virtualenv + + valgrind [amd64 armhf i386 powerpc] + * rules: pass "--without-lttng" to explicitly disable "lttng" to avoid + auto-enable if found. + * rules: disabled bundled RocksDB: + RocksDB suppose to improve performance of keyvaluestore OSDs but the + latter slow down to nearly unusable state when filled over 1 TiB even with + RocksDB. Moreover KV backend is experimental and super dangerous -- I lost + cluster due to OSD poisoning caused by KV OSD which was plugged only + during limited time. LevelDB is good enough, for now I see no reason to + use RocksDB especially considering that it is not packaged separately. + * Removed myself from Uploaders. + + -- Dmitry Smirnov Wed, 01 Apr 2015 11:47:38 +1100 + +ceph (0.87-1) experimental; urgency=medium + * New major upstream release [October 2014]. + new "libradosstriper*" binary packages. * Patchworks (removed old patches, refreshed remaining ones). @@ -197,13 +195,31 @@ ceph (0.87-0ubuntu1) vivid; urgency=medium * Build with "--with-babeltrace". * Build and statically link bundled RocksDB. - [ James Page ] - * d/control,rules: Disable test suite execution and drop BD's on - virtualenv and valgrind for Ubuntu. - * d/p/modules.patch,d/ceph.install: Mark new erasure coding libraries - as modules, install via wildcard. 
+ -- Dmitry Smirnov Thu, 30 Oct 2014 12:43:49 +1100 + +ceph (0.80.9-2) unstable; urgency=medium + + * [70fc1d] Add NEWS entry about CRUSH issues fixed in 0.80.9 + * [f41bb6] Add NEWS entry about rbd backed filesystems and systemd + + -- Gaudenz Steinlin Tue, 05 May 2015 21:29:15 +0200 + +ceph (0.80.9-1) unstable; urgency=medium + + * [4b4e] Imported Upstream version 0.80.9 + * [7102] Remove patches firefly-latest and p2139 applied upstream + * [5869] Add myself to uploaders + + -- Gaudenz Steinlin Mon, 04 May 2015 08:49:37 +0200 + +ceph (0.80.7-2) unstable; urgency=medium + + * Team upload. + * Build-Depends: +libjerasure-dev (>= 2.0.0-2~) + * New patch to use system "jerasure" library instead of its bundled copy. + * Removed myself from Uploaders. - -- James Page Fri, 05 Dec 2014 13:40:46 +0000 + -- Dmitry Smirnov Thu, 11 Dec 2014 12:55:38 +1100 ceph (0.80.7-1) unstable; urgency=medium diff --git a/debian/control b/debian/control index cfc88a8b..7c35f759 100644 --- a/debian/control +++ b/debian/control @@ -1,9 +1,12 @@ Source: ceph Section: admin Priority: optional -Maintainer: MOS ceph team -XSBC-Original-Maintainer: Ceph Maintainer +Maintainer: Ceph Maintainers +Uploaders: Laszlo Boszormenyi (GCS) , + James Page Homepage: http://ceph.com/ +Vcs-Git: git://anonscm.debian.org/pkg-ceph/ceph.git +Vcs-Browser: http://anonscm.debian.org/gitweb/?p=pkg-ceph/ceph.git Build-Depends: debhelper (>= 9~), default-jdk, dh-autoreconf, @@ -19,6 +22,7 @@ Build-Depends: debhelper (>= 9~), libboost-program-options-dev (>= 1.54), libboost-system-dev (>= 1.54), libboost-thread-dev (>= 1.54), + libboost-random-dev (>= 1.54), libbabeltrace-ctf-dev, libbabeltrace-dev, libbz2-dev, diff --git a/debian/gbp.conf b/debian/gbp.conf index 458ab470..78f83742 100644 --- a/debian/gbp.conf +++ b/debian/gbp.conf @@ -3,5 +3,8 @@ debian-branch = hammer upstream-branch = upstream-hammer pristine-tar = True +[pq] +patch-numbers = False + [import-orig] filter = debian/* diff --git a/debian/gbp2mos.sh 
b/debian/gbp2mos.sh new file mode 100755 index 00000000..cd9200e1 --- /dev/null +++ b/debian/gbp2mos.sh @@ -0,0 +1,14 @@ +#!/bin/sh +set -e +# MOS packaging CI insists on keeping the source under $pkgname directory, +# and debianization files in debian. Moving around files manually is a bit +# error prone (it's easy to forget 'git add something'), hence this script. +MYDIR="${0%/*}" +cd ${MYDIR}/.. + +mkdir -p -m 755 ceph +git ls-files | grep -vE '^debian[/]' | xargs cp -a --parents --target-directory=ceph +git add ceph +git ls-files | grep -vE '^(debian|ceph)[/]' | xargs git rm -f -- +git commit -m 'Shuffle files for MOS CI' + diff --git a/debian/patches/ObjBencher-seq_read_bench-fix-locking-errors.patch b/debian/patches/ObjBencher-seq_read_bench-fix-locking-errors.patch deleted file mode 100644 index 430a92ea..00000000 --- a/debian/patches/ObjBencher-seq_read_bench-fix-locking-errors.patch +++ /dev/null @@ -1,50 +0,0 @@ -From: Alexey Sheplyakov -Date: Fri, 26 Feb 2016 15:01:11 +0300 -Subject: ObjBencher::seq_read_bench: fix locking errors - -- take a lock before completion_ret -- remove extraneous comparison: it's clearly misplaced (bad merge?) 
- and tries to unlock a Mutex twice in a row - -Fixes: #14873 - -Signed-off-by: Alexey Sheplyakov ---- - src/common/obj_bencher.cc | 11 ++--------- - 1 file changed, 2 insertions(+), 9 deletions(-) - -diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc -index db4fd8f..a196e83 100644 ---- a/src/common/obj_bencher.cc -+++ b/src/common/obj_bencher.cc -@@ -598,13 +598,13 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre - index[slot] = data.started; - lock.Unlock(); - completion_wait(slot); -+ lock.Lock(); - r = completion_ret(slot); - if (r < 0) { - cerr << "read got " << r << std::endl; - lock.Unlock(); - goto ERR; - } -- lock.Lock(); - total_latency += data.cur_latency; - if (data.cur_latency > data.max_latency) data.max_latency = data.cur_latency; - if (data.cur_latency < data.min_latency) data.min_latency = data.cur_latency; -@@ -624,14 +624,7 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre - lock.Lock(); - ++data.started; - ++data.in_flight; -- lock.Unlock(); -- if (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0) { -- cerr << name[slot] << " is not correct!" << std::endl; -- ++errors; -- } else { -- lock.Unlock(); -- } -- -+ lock.Unlock(); - name[slot] = newName; - } - diff --git a/debian/patches/ObjectCacher-fix-bh_read_finish-offset-logic.patch b/debian/patches/ObjectCacher-fix-bh_read_finish-offset-logic.patch new file mode 100644 index 00000000..6945c391 --- /dev/null +++ b/debian/patches/ObjectCacher-fix-bh_read_finish-offset-logic.patch @@ -0,0 +1,39 @@ +From: Greg Farnum +Date: Mon, 23 May 2016 15:14:21 -0700 +Subject: ObjectCacher: fix bh_read_finish offset logic + +If we have an incoming read split across multiple BufferHeads, we want to +line up the BufferHead's bl with the incoming OSDOp's bl at the right offset. We +were erroneously using this nonsense calculation (always equal to zero!) 
when +a much simpler comparison of the BufferHead's logical object offset to the +incoming OSDOp's logical offset will do the trick nicely. + +Fixes: http://tracker.ceph.com/issues/16002 + +Signed-off-by: Greg Farnum +(cherry picked from commit 9ec6e7f608608088d51e449c9d375844631dcdde) +--- + src/osdc/ObjectCacher.cc | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc +index b2c2572..cad168c 100644 +--- a/src/osdc/ObjectCacher.cc ++++ b/src/osdc/ObjectCacher.cc +@@ -787,7 +787,6 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, ceph_tid_t tid, + if (bh->error < 0) + err = bh->error; + +- loff_t oldpos = opos; + opos = bh->end(); + + if (r == -ENOENT) { +@@ -807,7 +806,7 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, ceph_tid_t tid, + mark_error(bh); + } else { + bh->bl.substr_of(bl, +- oldpos-bh->start(), ++ bh->start() - start, + bh->length()); + mark_clean(bh); + } diff --git a/debian/patches/Remove-HITSET_GMT-related-code-so-0.94.-6-9-OSDs-mon.patch b/debian/patches/Remove-HITSET_GMT-related-code-so-0.94.-6-9-OSDs-mon.patch new file mode 100644 index 00000000..4ae23d37 --- /dev/null +++ b/debian/patches/Remove-HITSET_GMT-related-code-so-0.94.-6-9-OSDs-mon.patch @@ -0,0 +1,466 @@ +From: Alexey Sheplyakov +Date: Thu, 6 Oct 2016 19:13:00 +0300 +Subject: Remove HITSET_GMT related code so 0.94.{6,9} OSDs/mons can coexist + +Revert + - "osd: do not let OSD_HITSET_GMT reuse the feature bit" + - "osd/osd_types: encode pg_pool_t the old way" + - "osd: Decode use_gmt_hitset with a unique version" + - "mon: disable gmt_hitset if not supported" + - "mon: print use_gmt_hitset in "ceph osd pool get" + - "mon: add "ceph osd pool set $pool use_gmt_hitset true" cmd" + - "osd: use GMT time for the object name of hitsets" + +This reverts commits + - 7aec079f8a1bbe75625c438a17bb87e45398568e + - f8d2abd2e41c5dd04977f85cc1d6e65853c9a1b2 + - 
370434136ef076c350db3db4fca6489f88f70453 + - 720a090eb67b3955b0cadb7633c5a28a934171a4 + - 64bca2a43b34b265621bad2ec1fb980217223847 + - 87df212cfca33efbbee6376f528cb7d4895d1dc0 + - 039240418060c9a49298dacc0478772334526dce + +Required to allow 0.94.6 OSDs and monitors to inter-operate with 0.94.9 ones. + +The commit 039240418060c9a49298dacc0478772334526dce, which fixes bug #9732, +breaks upgrade from 0.94.6 (which is shipped with MOS 8.x and 9.[01]) to +newer versions, see http://tracker.ceph.com/issues/17386 for more details. +Since MOS does not use cache pools, and having a wrong time zone would +cause multiple problems, revert the above-mentioned commit (along with +the ones trying to address the breakage it causes) so a mixed cluster +can work properly. +--- + src/common/config_opts.h | 1 - + src/include/ceph_features.h | 2 -- + src/mon/MonCommands.h | 2 +- + src/mon/OSDMonitor.cc | 38 -------------------- + src/osd/ReplicatedPG.cc | 27 ++++++-------- + src/osd/ReplicatedPG.h | 4 +-- + src/osd/osd_types.cc | 85 +++------------------------------------------ + src/osd/osd_types.h | 11 +++--- + 8 files changed, 20 insertions(+), 150 deletions(-) + +diff --git a/src/common/config_opts.h b/src/common/config_opts.h +index c55694e..e773300 100644 +--- a/src/common/config_opts.h ++++ b/src/common/config_opts.h +@@ -500,7 +500,6 @@ OPTION(osd_client_message_cap, OPT_U64, 100) // num client messages + OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd + OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd + OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host +-OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
+ OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset + OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET) + OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes +diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h +index 205e18f..781df1b 100644 +--- a/src/include/ceph_features.h ++++ b/src/include/ceph_features.h +@@ -64,7 +64,6 @@ + // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY + #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ + #define CEPH_FEATURE_MON_METADATA (1ULL<<50) +-#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54) + /* ... */ + #define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55) + +@@ -152,7 +151,6 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) { + CEPH_FEATURE_MDS_QUOTA | \ + CEPH_FEATURE_CRUSH_V4 | \ + CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY | \ +- CEPH_FEATURE_OSD_HITSET_GMT | \ + CEPH_FEATURE_HAMMER_0_94_4 | \ + 0ULL) + +diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h +index 3f0dae8..a66cc55 100644 +--- a/src/mon/MonCommands.h ++++ b/src/mon/MonCommands.h +@@ -634,7 +634,7 @@ COMMAND("osd pool get " \ + "get pool parameter ", "osd", "r", "cli,rest") + COMMAND("osd pool set " \ + "name=pool,type=CephPoolname " \ +- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \ ++ 
"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \ + "name=val,type=CephString " \ + "name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \ + "set pool parameter to ", "osd", "rw", "cli,rest") +diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc +index a006dbd..afaea9f 100644 +--- a/src/mon/OSDMonitor.cc ++++ b/src/mon/OSDMonitor.cc +@@ -16,7 +16,6 @@ + * + */ + +-#include + #include + + #include "OSDMonitor.h" +@@ -1648,9 +1647,6 @@ void OSDMonitor::take_all_failures(list& ls) + failure_info.clear(); + } + +-static bool uses_gmt_hitset(const std::pair& pool) { +- return pool.second.use_gmt_hitset; +-} + + // boot -- + +@@ -1720,19 +1716,6 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m) + } + } + +- if (std::find_if(osdmap.get_pools().begin(), +- osdmap.get_pools().end(), +- uses_gmt_hitset) != osdmap.get_pools().end()) { +- assert(osdmap.get_num_up_osds() == 0 || +- osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT); +- if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) { +- dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at " +- << m->get_orig_source_inst() +- << " doesn't announce support -- ignore" << dendl; +- goto ignore; +- } +- } +- + // already booted? 
+ if (osdmap.is_up(from) && + osdmap.get_inst(from) == m->get_orig_source_inst()) { +@@ -3174,7 +3157,6 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) + if (!p->is_tier() && + (var == "hit_set_type" || var == "hit_set_period" || + var == "hit_set_count" || var == "hit_set_fpp" || +- var == "use_gmt_hitset" || + var == "target_max_objects" || var == "target_max_bytes" || + var == "cache_target_full_ratio" || + var == "cache_target_dirty_ratio" || +@@ -3227,8 +3209,6 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) + BloomHitSet::Params *bloomp = static_cast(p->hit_set_params.impl.get()); + f->dump_float("hit_set_fpp", bloomp->get_fpp()); + } +- } else if (var == "use_gmt_hitset") { +- f->dump_bool("use_gmt_hitset", p->use_gmt_hitset); + } else if (var == "target_max_objects") { + f->dump_unsigned("target_max_objects", p->target_max_objects); + } else if (var == "target_max_bytes") { +@@ -3286,8 +3266,6 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) + } + BloomHitSet::Params *bloomp = static_cast(p->hit_set_params.impl.get()); + ss << "hit_set_fpp: " << bloomp->get_fpp(); +- } else if (var == "use_gmt_hitset") { +- ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n"; + } else if (var == "target_max_objects") { + ss << "target_max_objects: " << p->target_max_objects; + } else if (var == "target_max_bytes") { +@@ -4170,11 +4148,6 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, + pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE); + if (g_conf->osd_pool_default_flag_nosizechange) + pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE); +- if (g_conf->osd_pool_use_gmt_hitset && +- (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) +- pi->use_gmt_hitset = true; +- else +- pi->use_gmt_hitset = false; + + pi->size = size; + pi->min_size = min_size; +@@ -4518,17 +4491,6 @@ int OSDMonitor::prepare_command_pool_set(map &cmdmap, + } + BloomHitSet::Params *bloomp = static_cast(p.hit_set_params.impl.get()); + bloomp->set_fpp(f); +- } else if (var == 
"use_gmt_hitset") { +- if (val == "true" || (interr.empty() && n == 1)) { +- if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) { +- ss << "not all OSDs support GMT hit set."; +- return -EINVAL; +- } +- p.use_gmt_hitset = true; +- } else { +- ss << "expecting value 'true' or '1'"; +- return -EINVAL; +- } + } else if (var == "debug_fake_ec_pool") { + if (val == "true" || (interr.empty() && n == 1)) { + p.flags |= pg_pool_t::FLAG_DEBUG_FAKE_EC_POOL; +diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc +index 1676a3e..429f9de 100644 +--- a/src/osd/ReplicatedPG.cc ++++ b/src/osd/ReplicatedPG.cc +@@ -1135,7 +1135,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op) + p != info.hit_set.history.end(); + ++p) { + if (stamp >= p->begin && stamp <= p->end) { +- oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); ++ oid = get_hit_set_archive_object(p->begin, p->end); + break; + } + } +@@ -10177,19 +10177,10 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp) + return hoid; + } + +-hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, +- utime_t end, +- bool using_gmt) ++hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end) + { + ostringstream ss; +- ss << "hit_set_" << info.pgid.pgid << "_archive_"; +- if (using_gmt) { +- start.gmtime(ss) << "_"; +- end.gmtime(ss); +- } else { +- start.localtime(ss) << "_"; +- end.localtime(ss); +- } ++ ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end; + hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "", + info.pgid.ps(), info.pgid.pool(), + cct->_conf->osd_hit_set_namespace); +@@ -10326,7 +10317,7 @@ void ReplicatedPG::hit_set_persist() + for (list::iterator p = info.hit_set.history.begin(); + p != info.hit_set.history.end(); + ++p) { +- hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); ++ hobject_t aoid = get_hit_set_archive_object(p->begin, p->end); + + // Once we hit a degraded object just skip 
further trim + if (is_degraded_or_backfilling_object(aoid)) +@@ -10335,8 +10326,10 @@ void ReplicatedPG::hit_set_persist() + return; + } + +- oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset); ++ oid = get_hit_set_archive_object(start, now); + // If the current object is degraded we skip this persist request ++ if (is_degraded_or_backfilling_object(oid)) ++ return; + if (scrubber.write_blocked_by_scrub(oid)) + return; + +@@ -10427,7 +10420,7 @@ void ReplicatedPG::hit_set_persist() + + updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info); + hit_set_create(); +- updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset); ++ updated_hit_set_hist.current_info = pg_hit_set_info_t(); + updated_hit_set_hist.current_last_stamp = utime_t(); + + // fabricate an object_info_t and SnapSet +@@ -10490,7 +10483,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max) + for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) { + list::iterator p = updated_hit_set_hist.history.begin(); + assert(p != updated_hit_set_hist.history.end()); +- hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); ++ hobject_t oid = get_hit_set_archive_object(p->begin, p->end); + + assert(!is_degraded_or_backfilling_object(oid)); + +@@ -10775,7 +10768,7 @@ void ReplicatedPG::agent_load_hit_sets() + continue; + } + +- hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); ++ hobject_t oid = get_hit_set_archive_object(p->begin, p->end); + if (is_unreadable_object(oid)) { + dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl; + break; +diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h +index 0894be6..c8ed4fc 100644 +--- a/src/osd/ReplicatedPG.h ++++ b/src/osd/ReplicatedPG.h +@@ -903,9 +903,7 @@ protected: + void hit_set_in_memory_trim(); ///< discard old in memory HitSets + + hobject_t get_hit_set_current_object(utime_t stamp); +- hobject_t 
get_hit_set_archive_object(utime_t start, +- utime_t end, +- bool using_gmt); ++ hobject_t get_hit_set_archive_object(utime_t start, utime_t end); + + // agent + boost::scoped_ptr agent_state; +diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc +index b13925c..f0126bc 100644 +--- a/src/osd/osd_types.cc ++++ b/src/osd/osd_types.cc +@@ -926,7 +926,6 @@ void pg_pool_t::dump(Formatter *f) const + f->close_section(); // hit_set_params + f->dump_unsigned("hit_set_period", hit_set_period); + f->dump_unsigned("hit_set_count", hit_set_count); +- f->dump_bool("use_gmt_hitset", use_gmt_hitset); + f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote); + f->dump_unsigned("stripe_width", get_stripe_width()); + f->dump_unsigned("expected_num_objects", expected_num_objects); +@@ -1239,60 +1238,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const + return; + } + +- if ((features & CEPH_FEATURE_OSD_HITSET_GMT) == 0) { +- // CEPH_FEATURE_OSD_HITSET_GMT requires pg_pool_t v21 which has +- // use_gmt_hitset, and two fields added before v21. it's backward +- // compatible, but re-encoding the same osdmap with different ceph +- // versions causes CRC mismatch at the OSD side, the tracker#12410 +- // prevents the monitor from sending the single full map requested +- // by OSD. so we need a way to encode pg_pool_t the same old way. +- ENCODE_START(17, 5, bl); +- ::encode(type, bl); +- ::encode(size, bl); +- ::encode(crush_ruleset, bl); +- ::encode(object_hash, bl); +- ::encode(pg_num, bl); +- ::encode(pgp_num, bl); +- __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. 
+- ::encode(lpg_num, bl); +- ::encode(lpgp_num, bl); +- ::encode(last_change, bl); +- ::encode(snap_seq, bl); +- ::encode(snap_epoch, bl); +- ::encode(snaps, bl, features); +- ::encode(removed_snaps, bl); +- ::encode(auid, bl); +- ::encode(flags, bl); +- ::encode(crash_replay_interval, bl); +- ::encode(min_size, bl); +- ::encode(quota_max_bytes, bl); +- ::encode(quota_max_objects, bl); +- ::encode(tiers, bl); +- ::encode(tier_of, bl); +- __u8 c = cache_mode; +- ::encode(c, bl); +- ::encode(read_tier, bl); +- ::encode(write_tier, bl); +- ::encode(properties, bl); +- ::encode(hit_set_params, bl); +- ::encode(hit_set_period, bl); +- ::encode(hit_set_count, bl); +- ::encode(stripe_width, bl); +- ::encode(target_max_bytes, bl); +- ::encode(target_max_objects, bl); +- ::encode(cache_target_dirty_ratio_micro, bl); +- ::encode(cache_target_full_ratio_micro, bl); +- ::encode(cache_min_flush_age, bl); +- ::encode(cache_min_evict_age, bl); +- ::encode(erasure_code_profile, bl); +- ::encode(last_force_op_resend, bl); +- ::encode(min_read_recency_for_promote, bl); +- ::encode(expected_num_objects, bl); +- ENCODE_FINISH(bl); +- return; +- } +- +- ENCODE_START(21, 5, bl); ++ ENCODE_START(17, 5, bl); + ::encode(type, bl); + ::encode(size, bl); + ::encode(crush_ruleset, bl); +@@ -1334,15 +1280,12 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const + ::encode(last_force_op_resend, bl); + ::encode(min_read_recency_for_promote, bl); + ::encode(expected_num_objects, bl); +- ::encode(uint32_t(.6 * 1e6), bl); +- ::encode(uint32_t(1), bl); +- ::encode(use_gmt_hitset, bl); + ENCODE_FINISH(bl); + } + + void pg_pool_t::decode(bufferlist::iterator& bl) + { +- DECODE_START_LEGACY_COMPAT_LEN(21, 5, 5, bl); ++ DECODE_START_LEGACY_COMPAT_LEN(17, 5, 5, bl); + ::decode(type, bl); + ::decode(size, bl); + ::decode(crush_ruleset, bl); +@@ -1454,19 +1397,6 @@ void pg_pool_t::decode(bufferlist::iterator& bl) + } else { + expected_num_objects = 0; + } +- if (struct_v >= 19) { +- uint32_t 
dummy; +- ::decode(dummy, bl); +- } +- if (struct_v >= 20) { +- uint32_t dummy; +- ::decode(dummy, bl); +- } +- if (struct_v >= 21) { +- ::decode(use_gmt_hitset, bl); +- } else { +- use_gmt_hitset = false; +- } + DECODE_FINISH(bl); + calc_pg_masks(); + } +@@ -3866,25 +3796,19 @@ void pg_create_t::generate_test_instances(list& o) + + void pg_hit_set_info_t::encode(bufferlist& bl) const + { +- ENCODE_START(2, 1, bl); ++ ENCODE_START(1, 1, bl); + ::encode(begin, bl); + ::encode(end, bl); + ::encode(version, bl); +- ::encode(using_gmt, bl); + ENCODE_FINISH(bl); + } + + void pg_hit_set_info_t::decode(bufferlist::iterator& p) + { +- DECODE_START(2, p); ++ DECODE_START(1, p); + ::decode(begin, p); + ::decode(end, p); + ::decode(version, p); +- if (struct_v >= 2) { +- ::decode(using_gmt, p); +- } else { +- using_gmt = false; +- } + DECODE_FINISH(p); + } + +@@ -3893,7 +3817,6 @@ void pg_hit_set_info_t::dump(Formatter *f) const + f->dump_stream("begin") << begin; + f->dump_stream("end") << end; + f->dump_stream("version") << version; +- f->dump_stream("using_gmt") << using_gmt; + } + + void pg_hit_set_info_t::generate_test_instances(list& ls) +diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h +index 92f6163..7557494 100644 +--- a/src/osd/osd_types.h ++++ b/src/osd/osd_types.h +@@ -1035,7 +1035,6 @@ public: + HitSet::Params hit_set_params; ///< The HitSet params to use on this pool + uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds) + uint32_t hit_set_count; ///< number of periods to retain +- bool use_gmt_hitset; ///< use gmt to name the hitset archive object + uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote + + uint32_t stripe_width; ///< erasure coded stripe size in bytes +@@ -1064,7 +1063,6 @@ public: + hit_set_params(), + hit_set_period(0), + hit_set_count(0), +- use_gmt_hitset(true), + min_read_recency_for_promote(0), + stripe_width(0), + expected_num_objects(0) +@@ -1602,11 +1600,10 @@ 
WRITE_CLASS_ENCODER_FEATURES(pool_stat_t) + struct pg_hit_set_info_t { + utime_t begin, end; ///< time interval + eversion_t version; ///< version this HitSet object was written +- bool using_gmt; ///< use gmt for creating the hit_set archive object name +- pg_hit_set_info_t(bool using_gmt = true) +- : using_gmt(using_gmt) {} +- pg_hit_set_info_t(utime_t b, bool using_gmt) +- : begin(b), using_gmt(using_gmt) {} ++ ++ pg_hit_set_info_t() {} ++ pg_hit_set_info_t(utime_t b) ++ : begin(b) {} + + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); diff --git a/debian/patches/ceph-disk-fix-symlinks-handling.patch b/debian/patches/ceph-disk-fix-symlinks-handling.patch index 341dc023..7c6d0e50 100644 --- a/debian/patches/ceph-disk-fix-symlinks-handling.patch +++ b/debian/patches/ceph-disk-fix-symlinks-handling.patch @@ -22,7 +22,7 @@ Signed-off-by: Alexey Sheplyakov 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk -index 7620ff8..49bc978 100755 +index 0525945..0e12e04 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -88,6 +88,7 @@ DMCRYPT_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-5ec00ceff2be' diff --git a/debian/patches/series b/debian/patches/series index 91f94c4b..27b43707 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -21,4 +21,5 @@ vivid-does-systemd.patch ## MOS ceph-disk-fix-symlinks-handling.patch -ObjBencher-seq_read_bench-fix-locking-errors.patch +ObjectCacher-fix-bh_read_finish-offset-logic.patch +Remove-HITSET_GMT-related-code-so-0.94.-6-9-OSDs-mon.patch diff --git a/debian/pkgbuild.sh b/debian/pkgbuild.sh new file mode 100755 index 00000000..edb24030 --- /dev/null +++ b/debian/pkgbuild.sh @@ -0,0 +1,16 @@ +#!/bin/sh +set -e +repo=/srv/data/Public/repos/ceph +dist=trusty +ceph_release=hammer +export_dir="../build-pkg-ceph-${ceph_release}-${dist}" +if [ ! 
-d "$export_dir" ]; then mkdir -p "$export_dir"; fi + +exec gbp buildpackage \ + --git-ignore-new \ + --git-pristine-tar \ + --git-pristine-tar-commit \ + --git-export-dir="$export_dir" \ + --git-cleaner='git clean -dfx' \ + --git-builder="sbuild -v --dist=${dist} --post-build-commands \"reprepro -Vb${repo} --ignore=wrongdistribution --ignore=missingfile include ${ceph_release}-${dist} %SBUILD_CHANGES\"" \ + $@ diff --git a/debian/rules b/debian/rules index 301b56e4..d62ba6ee 100755 --- a/debian/rules +++ b/debian/rules @@ -7,6 +7,8 @@ export DEB_LDFLAGS_MAINT_APPEND= -Wl,--as-needed # Enable hardening export DEB_BUILD_MAINT_OPTIONS = hardening=+all +DPKG_EXPORT_BUILDFLAGS = 1 +include /usr/share/dpkg/buildflags.mk export DEB_HOST_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)