From: Samuel Just
Date: Wed, 28 Sep 2016 20:24:56 +0000 (-0700)
Subject: src/osd: relax the requirement that we scrub a whole hash value
X-Git-Tag: v10.2.4~78^2~2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=16f9d95204f3eca6908ae241e5bacc48b006cb23;p=ceph.git

src/osd: relax the requirement that we scrub a whole hash value

Previously, we needed to scrub all objects and clones in a single hash
value mainly to ensure that _scrub had access to all clones of a single
object at the same time. Instead, just avoid letting head or snapdir be
a boundary (see the comment in the commit for details).

Signed-off-by: Samuel Just
(cherry picked from commit 27bdc8ce6d0a7d8ae47f29540f281ba417e16b4c)
---
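The boundary rule is easier to see on a toy model of the listing order. Within one object the scrub listing sorts versions by snap id, clones first and head/snapdir last, so a half-open chunk [start, end) whose end is the head leaves the head for the next chunk while keeping its newest clone in this one; that is the window described in the comment added below. The sketch is illustrative only and not part of the patch: the constants mirror the usual CEPH_NOSNAP/CEPH_SNAPDIR encoding, everything else is a hypothetical stand-in.

// Toy model (not part of the patch): ordering of one object's versions and
// the effect of the object boundary (snap 0) used as a chunk end.
#include <cassert>
#include <cstdint>

static const uint64_t NOSNAP  = (uint64_t)(-2); // head    (CEPH_NOSNAP)
static const uint64_t SNAPDIR = (uint64_t)(-1); // snapdir (CEPH_SNAPDIR)

int main() {
  uint64_t clones[] = {1, 4, 7};   // hypothetical clone snap ids, oldest first
  uint64_t boundary = 0;           // snap value get_object_boundary() assigns

  // Clones sort before head, head before snapdir, and the object boundary
  // before everything, so ending a chunk at the boundary excludes the whole
  // object, while ending it at head would split head from its newest clone.
  for (uint64_t c : clones)
    assert(boundary < c && c < NOSNAP);
  assert(NOSNAP < SNAPDIR);
  return 0;
}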
diff --git a/src/common/hobject.h b/src/common/hobject.h
index 77f2081b720..9be84c5000a 100644
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -138,6 +138,14 @@ public:
     return ret;
   }
 
+  hobject_t get_object_boundary() const {
+    if (is_max())
+      return *this;
+    hobject_t ret = *this;
+    ret.snap = 0;
+    return ret;
+  }
+
   /// @return head version of this hobject_t
   hobject_t get_head() const {
     hobject_t ret(*this);
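get_object_boundary() maps every version of an object (clones, head, snapdir) to the same key with snap == 0, which sorts before all of them, so it can serve as an exclusive chunk end that excludes the object entirely. The PG.cc hunk below uses it together with hobject_t::has_snapset() to pull a chunk end back off a head or snapdir. Here is an illustrative sketch of that trimming step (not part of the patch), modeled on a simplified (name, snap) key; Key, is_head_or_snapdir and trim_chunk_end are hypothetical stand-ins for hobject_t, has_snapset() and the inline code in PG::chunky_scrub.

// Illustrative sketch (not part of the patch): chunk-end trimming on toy keys.
#include <cassert>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

using Key = std::pair<std::string, uint64_t>;   // (object name, snap)
static const uint64_t NOSNAP  = (uint64_t)(-2); // head
static const uint64_t SNAPDIR = (uint64_t)(-1); // snapdir

static bool is_head_or_snapdir(const Key &k) {
  return k.second == NOSNAP || k.second == SNAPDIR;
}

// objects: sorted listing of the chunk; candidate_end: the backend's proposed
// (exclusive) end. If the end is a head/snapdir, back it up over that object's
// clones at the tail of the listing, or all the way to the object boundary
// (snap 0), so head and its clones always land in the same chunk.
Key trim_chunk_end(std::vector<Key> objects, Key candidate_end) {
  if (objects.empty())
    return candidate_end;                       // listing exhausted; nothing to protect
  Key back = objects.back();
  while (is_head_or_snapdir(candidate_end) &&
         candidate_end.first == back.first) {   // end is head/snapdir of the last object
    candidate_end = back;                       // step back onto the previous version
    objects.pop_back();
    assert(!objects.empty());                   // clones precede their head in the listing
    back = objects.back();
  }
  if (is_head_or_snapdir(candidate_end))
    candidate_end.second = 0;                   // ~ get_object_boundary(): exclude the object
  return candidate_end;
}

The walk-back stops as soon as the end is no longer that object's head or snapdir, so it never crosses into a different object and the chunk shrinks by at most a few entries.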
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 69331a5243a..597f456fadd 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -4118,54 +4118,54 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         scrubber.received_maps.clear();
 
         {
-          hobject_t candidate_end;
-
-          // get the start and end of our scrub chunk
-          //
-          // start and end need to lie on a hash boundary. We test for this by
-          // requesting a list and searching backward from the end looking for a
-          // boundary. If there's no boundary, we request a list after the first
-          // list, and so forth.
-
-          bool boundary_found = false;
+          /* get the start and end of our scrub chunk
+           *
+           * Our scrub chunk has an important restriction we're going to need to
+           * respect. We can't let head or snapdir be start or end.
+           * Using a half-open interval means that if end == head|snapdir,
+           * we'd scrub/lock head and the clone right next to head in different
+           * chunks which would allow us to miss clones created between
+           * scrubbing that chunk and scrubbing the chunk including head.
+           * This isn't true for any of the other clones since clones can
+           * only be created "just to the left of" head. There is one exception
+           * to this: promotion of clones which always happens to the left of the
+           * left-most clone, but promote_object checks the scrubber in that
+           * case, so it should be ok. Also, it's ok to "miss" clones at the
+           * left end of the range if we are a tier because they may legitimately
+           * not exist (see _scrub).
+           */
+          unsigned min = MAX(3, cct->_conf->osd_scrub_chunk_min);
           hobject_t start = scrubber.start;
-          unsigned loop = 0;
-          while (!boundary_found) {
-            vector<hobject_t> objects;
-            ret = get_pgbackend()->objects_list_partial(
-              start,
-              cct->_conf->osd_scrub_chunk_min,
-              cct->_conf->osd_scrub_chunk_max,
-              &objects,
-              &candidate_end);
-            assert(ret >= 0);
-
-            // in case we don't find a boundary: start again at the end
-            start = candidate_end;
-
-            // special case: reached end of file store, implicitly a boundary
-            if (objects.empty()) {
-              break;
-            }
-
-            // search backward from the end looking for a boundary
-            objects.push_back(candidate_end);
-            while (!boundary_found && objects.size() > 1) {
-              hobject_t end = objects.back().get_boundary();
-              objects.pop_back();
-
-              if (objects.back().get_hash() != end.get_hash()) {
-                candidate_end = end;
-                boundary_found = true;
-              }
-            }
-
-            // reset handle once in a while, the search maybe takes long.
-            if (++loop >= g_conf->osd_loop_before_reset_tphandle) {
-              handle.reset_tp_timeout();
-              loop = 0;
-            }
-          }
+          hobject_t candidate_end;
+          vector<hobject_t> objects;
+          ret = get_pgbackend()->objects_list_partial(
+            start,
+            min,
+            MAX(min, cct->_conf->osd_scrub_chunk_max),
+            &objects,
+            &candidate_end);
+          assert(ret >= 0);
+
+          if (!objects.empty()) {
+            hobject_t back = objects.back();
+            while (candidate_end.has_snapset() &&
+                   candidate_end.get_head() == back.get_head()) {
+              candidate_end = back;
+              objects.pop_back();
+              if (objects.empty()) {
+                assert(0 ==
+                       "Somehow we got more than 2 objects which"
+                       "have the same head but are not clones");
+              }
+              back = objects.back();
+            }
+            if (candidate_end.has_snapset()) {
+              assert(candidate_end.get_head() != back.get_head());
+              candidate_end = candidate_end.get_object_boundary();
+            }
+          } else {
+            assert(candidate_end.is_max());
+          }
 
           if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
             // we'll be requeued by whatever made us unavailable for scrub
@@ -4190,7 +4190,8 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
           }
         }
 
-        // ask replicas to wait until last_update_applied >= scrubber.subset_last_update and then scan
+        // ask replicas to wait until
+        // last_update_applied >= scrubber.subset_last_update and then scan
         scrubber.waiting_on_whom.insert(pg_whoami);
         ++scrubber.waiting_on;
 
@@ -4337,7 +4338,7 @@ void PG::scrub_compare_maps()
   dout(10) << __func__ << " has maps, analyzing" << dendl;
 
   // construct authoritative scrub map for type specific scrubbing
-  ScrubMap authmap(scrubber.primary_scrubmap);
+  scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
   map<hobject_t, pair<uint32_t, uint32_t>, hobject_t::BitwiseComparator> missing_digest;
 
   if (acting.size() > 1) {
@@ -4399,13 +4400,34 @@ void PG::scrub_compare_maps()
     for (map<hobject_t, list<pg_shard_t>, hobject_t::BitwiseComparator>::iterator i = authoritative.begin();
          i != authoritative.end();
          ++i) {
-      authmap.objects.erase(i->first);
-      authmap.objects.insert(*(maps[i->second.back()]->objects.find(i->first)));
+      scrubber.cleaned_meta_map.objects.erase(i->first);
+      scrubber.cleaned_meta_map.objects.insert(
+        *(maps[i->second.back()]->objects.find(i->first))
+        );
+    }
+  }
+
+  ScrubMap for_meta_scrub;
+  if (scrubber.end.is_max() ||
+      scrubber.cleaned_meta_map.objects.empty()) {
+    scrubber.cleaned_meta_map.swap(for_meta_scrub);
+  } else {
+    auto iter = scrubber.cleaned_meta_map.objects.end();
+    --iter; // not empty, see if clause
+    auto begin = scrubber.cleaned_meta_map.objects.begin();
+    while (iter != begin) {
+      auto next = iter--;
+      if (next->first.get_head() != iter->first.get_head()) {
+        ++iter;
+        break;
+      }
     }
+    for_meta_scrub.objects.insert(begin, iter);
+    scrubber.cleaned_meta_map.objects.erase(begin, iter);
   }
 
   // ok, do the pg-type specific scrubbing
-  _scrub(authmap, missing_digest);
+  _scrub(for_meta_scrub, missing_digest);
   if (!scrubber.store->empty()) {
     if (state_test(PG_STATE_REPAIR)) {
       dout(10) << __func__ << ": discarding scrub results" << dendl;
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 10a81f8336f..cda845f68f1 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1169,6 +1169,9 @@ public:
     // Map from object with errors to good peers
     map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >, hobject_t::BitwiseComparator> authoritative;
 
+    // Cleaned map pending snap metadata scrub
+    ScrubMap cleaned_meta_map;
+
     // digest updates which we are waiting on
     int num_digest_updates_pending;
 
@@ -1267,6 +1270,7 @@ public:
       missing.clear();
      authoritative.clear();
       num_digest_updates_pending = 0;
+      cleaned_meta_map = ScrubMap();
     }
 
     void create_results(const hobject_t& obj);
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 6887fa6c937..3ebff3dfb79 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -4061,6 +4061,14 @@ struct ScrubMap {
   eversion_t incr_since;
 
   void merge_incr(const ScrubMap &l);
+  void insert(const ScrubMap &r) {
+    objects.insert(r.objects.begin(), r.objects.end());
+  }
+  void swap(ScrubMap &r) {
+    ::swap(objects, r.objects);
+    ::swap(valid_through, r.valid_through);
+    ::swap(incr_since, r.incr_since);
+  }
 
   void encode(bufferlist& bl) const;
   void decode(bufferlist::iterator& bl, int64_t pool=-1);
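With chunk boundaries no longer aligned to hash values, an object's snap metadata may straddle two chunks, so scrub_compare_maps now accumulates the cleaned maps in scrubber.cleaned_meta_map (using the new ScrubMap::insert/swap helpers) and hands _scrub only the objects known to be complete, keeping the trailing run that shares the last head for the next chunk. An illustrative sketch of that hand-off (not part of the patch), modeled on a std::map keyed by a (name, snap) pair so "same head" becomes "same name"; Key, Entry, Objects and split_for_meta_scrub are hypothetical stand-ins.

// Illustrative sketch (not part of the patch): splitting the cleaned map.
#include <cstdint>
#include <map>
#include <string>
#include <utility>

using Key     = std::pair<std::string, uint64_t>; // (object name, snap)
using Entry   = int;                              // stand-in for ScrubMap::object
using Objects = std::map<Key, Entry>;             // stand-in for ScrubMap::objects

// Move every complete object from `cleaned` into `for_meta_scrub`, but keep
// the trailing run of entries sharing the last object's name: that object may
// continue into the next chunk, so its metadata scrub is deferred until then.
void split_for_meta_scrub(Objects &cleaned, Objects &for_meta_scrub,
                          bool chunk_end_is_max) {
  if (chunk_end_is_max || cleaned.empty()) {
    for_meta_scrub.swap(cleaned);                 // last chunk: nothing can be split
    return;
  }
  auto iter = cleaned.end();
  --iter;                                         // last entry; its object may be incomplete
  auto begin = cleaned.begin();
  while (iter != begin) {
    auto next = iter--;
    if (next->first.first != iter->first.first) {
      ++iter;                                     // [iter, end) is the trailing object
      break;
    }
  }
  for_meta_scrub.insert(begin, iter);             // complete objects go to _scrub now
  cleaned.erase(begin, iter);                     // the rest waits for the next chunk
}

On the final chunk (scrubber.end.is_max()) everything is handed over at once, and the Scrubber reset clears the carried state (cleaned_meta_map = ScrubMap() in the PG.h hunk) so nothing leaks into the next scrub.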