From: Sage Weil Date: Wed, 10 Dec 2014 00:00:56 +0000 (-0800) Subject: osd: use -1 for deep scrub digest seed on new OSDs X-Git-Tag: v0.92~111^2~15 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7d73f41f0d6ae2bf971c72b0b8aca6579e1c0d62;p=ceph.git osd: use -1 for deep scrub digest seed on new OSDs 0 is a weak initial value for a CRC since it doesn't change with a sequence of 0 bytes (which are relatively common). -1 is better. Use -1 when everyone in the acting set supports it. Signed-off-by: Sage Weil --- diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index 006a88adc155..151e547780eb 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -55,6 +55,7 @@ #define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 (1ULL<<44) #define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45) #define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46) +#define CEPH_FEATURE_OSD_OBJECT_DIGEST (1ULL<<46) /* overlap with fadvise */ #define CEPH_FEATURE_MDS_QUOTA (1ULL<<47) #define CEPH_FEATURE_RESERVED2 (1ULL<<61) /* slow down, we are almost out... */ @@ -134,7 +135,8 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) { CEPH_FEATURE_OSD_POOLRESEND | \ CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 | \ CEPH_FEATURE_OSD_SET_ALLOC_HINT | \ - CEPH_FEATURE_OSD_FADVISE_FLAGS | \ + CEPH_FEATURE_OSD_FADVISE_FLAGS | \ + CEPH_FEATURE_OSD_OBJECT_DIGEST | \ CEPH_FEATURE_MDS_QUOTA | \ 0ULL) diff --git a/src/messages/MOSDRepScrub.h b/src/messages/MOSDRepScrub.h index 52a03b849109..5d0a6041c2fc 100644 --- a/src/messages/MOSDRepScrub.h +++ b/src/messages/MOSDRepScrub.h @@ -24,7 +24,7 @@ struct MOSDRepScrub : public Message { - static const int HEAD_VERSION = 5; + static const int HEAD_VERSION = 6; static const int COMPAT_VERSION = 2; spg_t pgid; // PG to scrub @@ -35,10 +35,13 @@ struct MOSDRepScrub : public Message { hobject_t start; // lower bound of scrub, inclusive hobject_t end; // upper bound of scrub, exclusive bool deep; // true if scrub should be deep + uint32_t seed; // seed value for digest calculation - MOSDRepScrub() : Message(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION), + MOSDRepScrub() + : Message(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION), chunky(false), - deep(false) { } + deep(false), + seed(0) { } MOSDRepScrub(spg_t pgid, eversion_t scrub_from, eversion_t scrub_to, epoch_t map_epoch) @@ -48,10 +51,11 @@ struct MOSDRepScrub : public Message { scrub_to(scrub_to), map_epoch(map_epoch), chunky(false), - deep(false) { } + deep(false), + seed(0) { } MOSDRepScrub(spg_t pgid, eversion_t scrub_to, epoch_t map_epoch, - hobject_t start, hobject_t end, bool deep) + hobject_t start, hobject_t end, bool deep, uint32_t seed) : Message(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION), pgid(pgid), scrub_to(scrub_to), @@ -59,7 +63,8 @@ struct MOSDRepScrub : public Message { chunky(true), start(start), end(end), - deep(deep) { } + deep(deep), + seed(seed) { } private: @@ -73,6 +78,7 @@ public: << ",epoch:" << map_epoch << ",start:" << start << ",end:" << end << ",chunky:" << chunky << ",deep:" << deep + << ",seed:" << seed << ",version:" << header.version; out << ")"; } @@ -87,6 +93,7 @@ public: ::encode(end, payload); ::encode(deep, payload); ::encode(pgid.shard, payload); + ::encode(seed, payload); } void decode_payload() { bufferlist::iterator p = payload.begin(); @@ -114,6 +121,11 @@ public: } else { pgid.shard = shard_id_t::NO_SHARD; } + if (header.version >= 6) { + ::decode(seed, p); + } else { + seed = 0; + } } }; diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 4031b4c671ed..9a5a6a1a0e3b 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -1746,9 +1746,10 @@ void ECBackend::rollback_append( void ECBackend::be_deep_scrub( const hobject_t &poid, + uint32_t seed, ScrubMap::object &o, ThreadPool::TPHandle &handle) { - bufferhash h(-1); + bufferhash h(-1); // we always used -1 int r; uint64_t stride = cct->_conf->osd_deep_scrub_stride; if (stride % sinfo.get_chunk_size()) diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index 147e3e85d1db..d13d8bb48575 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -469,6 +469,7 @@ public: void be_deep_scrub( const hobject_t &obj, + uint32_t seed, ScrubMap::object &o, ThreadPool::TPHandle &handle); uint64_t be_get_ondisk_size(uint64_t logical_size) { diff --git a/src/osd/PG.cc b/src/osd/PG.cc index eea7ffd323df..71be9c8a3ed0 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -3288,14 +3288,15 @@ void PG::_request_scrub_map_classic(pg_shard_t replica, eversion_t version) void PG::_request_scrub_map( pg_shard_t replica, eversion_t version, hobject_t start, hobject_t end, - bool deep) + bool deep, uint32_t seed) { assert(replica != pg_whoami); - dout(10) << "scrub requesting scrubmap from osd." << replica << dendl; + dout(10) << "scrub requesting scrubmap from osd." << replica + << " deep " << (int)deep << " seed " << seed << dendl; MOSDRepScrub *repscrubop = new MOSDRepScrub( spg_t(info.pgid.pgid, replica.shard), version, get_osdmap()->get_epoch(), - start, end, deep); + start, end, deep, seed); osd->send_message_osd_cluster( replica.osd, repscrubop, get_osdmap()->get_epoch()); } @@ -3561,10 +3562,11 @@ void PG::_scan_snaps(ScrubMap &smap) */ int PG::build_scrub_map_chunk( ScrubMap &map, - hobject_t start, hobject_t end, bool deep, + hobject_t start, hobject_t end, bool deep, uint32_t seed, ThreadPool::TPHandle &handle) { - dout(10) << __func__ << " [" << start << "," << end << ")" << dendl; + dout(10) << __func__ << " [" << start << "," << end << ") " + << " seed " << seed << dendl; map.valid_through = info.last_update; @@ -3583,7 +3585,7 @@ int PG::build_scrub_map_chunk( } - get_pgbackend()->be_scan_list(map, ls, deep, handle); + get_pgbackend()->be_scan_list(map, ls, deep, seed, handle); _scan_rollback_obs(rollback_obs, handle); _scan_snaps(map); @@ -3612,7 +3614,7 @@ void PG::build_scrub_map(ScrubMap &map, ThreadPool::TPHandle &handle) vector ls; osd->store->collection_list(coll, ls); - get_pgbackend()->be_scan_list(map, ls, false, handle); + get_pgbackend()->be_scan_list(map, ls, false, 0, handle); lock(); _scan_snaps(map); @@ -3658,7 +3660,7 @@ void PG::build_inc_scrub_map( } } - get_pgbackend()->be_scan_list(map, ls, false, handle); + get_pgbackend()->be_scan_list(map, ls, false, 0, handle); } void PG::repair_object( @@ -3723,7 +3725,7 @@ void PG::replica_scrub( } build_scrub_map_chunk( - map, msg->start, msg->end, msg->deep, + map, msg->start, msg->end, msg->deep, msg->seed, handle); vector scrub(1); @@ -3926,6 +3928,12 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle) oss << info.pgid.pgid << " " << mode << " starts" << std::endl; osd->clog->info(oss); } + + if (peer_features & CEPH_FEATURE_OSD_OBJECT_DIGEST) + scrubber.seed = -1; // better, and enables oi digest checks + else + scrubber.seed = 0; // compat + break; case PG::Scrubber::NEW_CHUNK: @@ -4007,7 +4015,8 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle) ++i) { if (*i == pg_whoami) continue; _request_scrub_map(*i, scrubber.subset_last_update, - scrubber.start, scrubber.end, scrubber.deep); + scrubber.start, scrubber.end, scrubber.deep, + scrubber.seed); scrubber.waiting_on_whom.insert(*i); ++scrubber.waiting_on; } @@ -4041,7 +4050,7 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle) // build my own scrub map ret = build_scrub_map_chunk(scrubber.primary_scrubmap, scrubber.start, scrubber.end, - scrubber.deep, + scrubber.deep, scrubber.seed, handle); if (ret < 0) { dout(5) << "error building scrub map: " << ret << ", aborting" << dendl; diff --git a/src/osd/PG.h b/src/osd/PG.h index 21fed9c2f785..25ee5cd6c977 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1030,7 +1030,8 @@ public: active_rep_scrub(0), must_scrub(false), must_deep_scrub(false), must_repair(false), state(INACTIVE), - deep(false) + deep(false), + seed(0) { } @@ -1081,6 +1082,7 @@ public: // deep scrub bool deep; + uint32_t seed; list callbacks; void add_callback(Context *context) { @@ -1151,6 +1153,7 @@ public: deep_errors = 0; fixed = 0; deep = false; + seed = 0; run_callbacks(); inconsistent.clear(); missing.clear(); @@ -1183,10 +1186,11 @@ public: ThreadPool::TPHandle &handle); void _request_scrub_map_classic(pg_shard_t replica, eversion_t version); void _request_scrub_map(pg_shard_t replica, eversion_t version, - hobject_t start, hobject_t end, bool deep); + hobject_t start, hobject_t end, bool deep, + uint32_t seed); int build_scrub_map_chunk( ScrubMap &map, - hobject_t start, hobject_t end, bool deep, + hobject_t start, hobject_t end, bool deep, uint32_t seed, ThreadPool::TPHandle &handle); void build_scrub_map(ScrubMap &map, ThreadPool::TPHandle &handle); void build_inc_scrub_map( diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc index 4c4787cc5058..60cc40c9afb1 100644 --- a/src/osd/PGBackend.cc +++ b/src/osd/PGBackend.cc @@ -320,7 +320,7 @@ PGBackend *PGBackend::build_pg_backend( * pg lock may or may not be held */ void PGBackend::be_scan_list( - ScrubMap &map, const vector &ls, bool deep, + ScrubMap &map, const vector &ls, bool deep, uint32_t seed, ThreadPool::TPHandle &handle) { dout(10) << __func__ << " scanning " << ls.size() << " objects" @@ -351,7 +351,7 @@ void PGBackend::be_scan_list( // calculate the CRC32 on deep scrubs if (deep) { - be_deep_scrub(*p, o, handle); + be_deep_scrub(*p, seed, o, handle); } dout(25) << __func__ << " " << poid << dendl; diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index 3b9f25323560..c829b9468a61 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -585,7 +585,7 @@ virtual bool scrub_supported() { return false; } void be_scan_list( - ScrubMap &map, const vector &ls, bool deep, + ScrubMap &map, const vector &ls, bool deep, uint32_t seed, ThreadPool::TPHandle &handle); enum scrub_error_type be_compare_scrub_objects( const ScrubMap::object &auth, @@ -607,6 +607,7 @@ uint64_t logical_size) { assert(0); return 0; } virtual void be_deep_scrub( const hobject_t &poid, + uint32_t seed, ScrubMap::object &o, ThreadPool::TPHandle &handle) { assert(0); } diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index e19ad4b0465e..e646b2e4ff15 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -693,10 +693,12 @@ void ReplicatedBackend::sub_op_modify_reply(OpRequestRef op) void ReplicatedBackend::be_deep_scrub( const hobject_t &poid, + uint32_t seed, ScrubMap::object &o, ThreadPool::TPHandle &handle) { - bufferhash h, oh; + dout(10) << __func__ << " " << poid << " seed " << seed << dendl; + bufferhash h(seed), oh(seed); bufferlist bl, hdrbl; int r; __u64 pos = 0; @@ -726,12 +728,19 @@ void ReplicatedBackend::be_deep_scrub( ghobject_t( poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), &hdrbl, true); - if (r == 0) { + // NOTE: bobtail to giant, we would crc the head as (len, head). + // that changes at the same time we start using a non-zero seed. + if (r == 0 && hdrbl.length()) { dout(25) << "CRC header " << string(hdrbl.c_str(), hdrbl.length()) << dendl; - ::encode(hdrbl, bl); - oh << bl; - bl.clear(); + if (seed == 0) { + // legacy + bufferlist bl; + ::encode(hdrbl, bl); + oh << bl; + } else { + oh << hdrbl; + } } else if (r == -EIO) { dout(25) << __func__ << " " << poid << " got " << r << " on omap header read, read_error" << dendl; diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h index 67a4a1f7ffbf..927ebb87a270 100644 --- a/src/osd/ReplicatedBackend.h +++ b/src/osd/ReplicatedBackend.h @@ -413,6 +413,7 @@ private: void be_deep_scrub( const hobject_t &obj, + uint32_t seed, ScrubMap::object &o, ThreadPool::TPHandle &handle); uint64_t be_get_ondisk_size(uint64_t logical_size) { return logical_size; }