src/osd: relax the requirement that we scrub a whole hash value
author    Samuel Just <sjust@redhat.com>
          Wed, 28 Sep 2016 20:24:56 +0000 (13:24 -0700)
committer David Zafman <dzafman@redhat.com>
          Tue, 4 Oct 2016 04:43:34 +0000 (21:43 -0700)
Previously, we needed to scrub all objects and their clones in a
single hash value, mainly to ensure that _scrub had access to all
clones of a single object at the same time.  Instead, just avoid
letting head or snapdir be a chunk boundary (see the comment in the
commit for details).

Signed-off-by: Samuel Just <sjust@redhat.com>
(cherry picked from commit 27bdc8ce6d0a7d8ae47f29540f281ba417e16b4c)
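
In rough terms, the new end-selection rule can be sketched as the
following self-contained program (simplified stand-ins for hobject_t,
has_snapset() and the backend listing; snapdir is ignored and the names
are illustrative, not the real Ceph API):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

static const uint64_t SNAP_HEAD = UINT64_MAX; // stand-in for CEPH_NOSNAP

struct Obj {
  std::string name; // object name; clones of one object share it
  uint64_t snap;    // clone id, or SNAP_HEAD for the head object

  bool is_head() const { return snap == SNAP_HEAD; }
  // Analogue of the new hobject_t::get_object_boundary(): snap = 0
  // sorts before every clone and before head of the same object.
  Obj get_object_boundary() const { return Obj{name, 0}; }
  bool same_object(const Obj &o) const { return name == o.name; }
};

// Trim the backend's candidate end so a half-open chunk [start, end)
// never separates an object's head from its clones: while the candidate
// end is a head belonging to the same object as the last listed entry,
// pull it back onto that entry; if it is still a head afterwards, snap
// it down to the object boundary.
Obj pick_chunk_end(std::vector<Obj> objects, Obj candidate_end) {
  assert(!objects.empty());
  Obj back = objects.back();
  while (candidate_end.is_head() && candidate_end.same_object(back)) {
    candidate_end = back;
    objects.pop_back();
    assert(!objects.empty() && "chunk was all clones of a single object");
    back = objects.back();
  }
  if (candidate_end.is_head())
    candidate_end = candidate_end.get_object_boundary();
  return candidate_end;
}

int main() {
  // Listing: two clones of "a", two clones of "b"; the backend proposes
  // b's head as the end of the chunk.
  std::vector<Obj> listing = {{"a", 1}, {"a", 2}, {"b", 1}, {"b", 2}};
  Obj end = pick_chunk_end(listing, Obj{"b", SNAP_HEAD});
  // The end is pulled back to clone b:2, so b's head and all of b's
  // remaining clones are scrubbed together in the next chunk.
  std::cout << end.name << ":" << end.snap << "\n"; // prints "b:2"
}

The half-open interval convention is what forces the trimming: leaving
end equal to a head would put that head and its newest clone in
different chunks.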

src/common/hobject.h
src/osd/PG.cc
src/osd/PG.h
src/osd/osd_types.h

diff --git a/src/common/hobject.h b/src/common/hobject.h
index 77f2081b7208e7480117f6e14b7de292fad8e140..9be84c5000ab57b76b6eaf9dc990abc554644bd7 100644
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -138,6 +138,14 @@ public:
     return ret;
   }
 
+  hobject_t get_object_boundary() const {
+    if (is_max())
+      return *this;
+    hobject_t ret = *this;
+    ret.snap = 0;
+    return ret;
+  }
+
   /// @return head version of this hobject_t
   hobject_t get_head() const {
     hobject_t ret(*this);
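
What makes the new helper safe as a chunk end is the sort order: clone
snap ids are nonzero and head/snapdir use the largest snap values, so
snap = 0 yields the smallest hobject_t that still names the object. A
hedged illustration with a reduced ordering (the real comparison is
hobject_t's bitwise comparator, which also folds in pool, namespace and
key):

#include <cassert>
#include <cstdint>
#include <tuple>

static const uint64_t NOSNAP = UINT64_MAX;      // stand-in for CEPH_NOSNAP
static const uint64_t SNAPDIR = UINT64_MAX - 1; // stand-in for CEPH_SNAPDIR

struct MiniHObj {
  uint32_t hash; // placement hash
  uint64_t name; // object name, reduced to an id for brevity
  uint64_t snap; // clone id, or NOSNAP/SNAPDIR

  // Simplified ordering: hash, then name, then snap, so an object's
  // clones sort before its snapdir and head.
  bool operator<(const MiniHObj &r) const {
    return std::tie(hash, name, snap) < std::tie(r.hash, r.name, r.snap);
  }
  MiniHObj get_object_boundary() const { return MiniHObj{hash, name, 0}; }
};

int main() {
  MiniHObj clone{0x1234, 42, 1};
  MiniHObj snapdir{0x1234, 42, SNAPDIR};
  MiniHObj head{0x1234, 42, NOSNAP};
  MiniHObj boundary = head.get_object_boundary();
  // snap = 0 sorts before every clone and before snapdir/head of the
  // same object, so a half-open chunk ending at `boundary` either
  // includes all of object 42 or none of it.
  assert(boundary < clone && boundary < snapdir && snapdir < head);
  return 0;
}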
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 69331a5243ac2c90aed83b7b6f5c302edb079c31..597f456fadd50abd8948f30b158be40fad00cf1b 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -4118,54 +4118,54 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         scrubber.received_maps.clear();
 
         {
-         hobject_t candidate_end;
-
-          // get the start and end of our scrub chunk
-          //
-          // start and end need to lie on a hash boundary. We test for this by
-          // requesting a list and searching backward from the end looking for a
-          // boundary. If there's no boundary, we request a list after the first
-          // list, and so forth.
-
-          bool boundary_found = false;
+          /* get the start and end of our scrub chunk
+           *
+           * Our scrub chunk has an important restriction we're going to need to
+           * respect. We can't let head or snapdir be start or end.
+           * Using a half-open interval means that if end == head|snapdir,
+           * we'd scrub/lock head and the clone right next to head in different
+           * chunks which would allow us to miss clones created between
+           * scrubbing that chunk and scrubbing the chunk including head.
+           * This isn't true for any of the other clones since clones can
+           * only be created "just to the left of" head.  There is one exception
+           * to this: promotion of clones which always happens to the left of the
+           * left-most clone, but promote_object checks the scrubber in that
+           * case, so it should be ok.  Also, it's ok to "miss" clones at the
+           * left end of the range if we are a tier because they may legitimately
+           * not exist (see _scrub).
+           */
+         unsigned min = MAX(3, cct->_conf->osd_scrub_chunk_min);
           hobject_t start = scrubber.start;
-          unsigned loop = 0;
-          while (!boundary_found) {
-            vector<hobject_t> objects;
-            ret = get_pgbackend()->objects_list_partial(
-             start,
-             cct->_conf->osd_scrub_chunk_min,
-             cct->_conf->osd_scrub_chunk_max,
-             &objects,
-             &candidate_end);
-            assert(ret >= 0);
-
-            // in case we don't find a boundary: start again at the end
-            start = candidate_end;
-
-            // special case: reached end of file store, implicitly a boundary
-            if (objects.empty()) {
-              break;
-            }
-
-            // search backward from the end looking for a boundary
-            objects.push_back(candidate_end);
-            while (!boundary_found && objects.size() > 1) {
-              hobject_t end = objects.back().get_boundary();
-              objects.pop_back();
-
-              if (objects.back().get_hash() != end.get_hash()) {
-                candidate_end = end;
-                boundary_found = true;
-              }
-            }
-
-            // reset handle once in a while, the search maybe takes long.
-            if (++loop >= g_conf->osd_loop_before_reset_tphandle) {
-              handle.reset_tp_timeout();
-              loop = 0;
-            }
-          }
+         hobject_t candidate_end;
+         vector<hobject_t> objects;
+         ret = get_pgbackend()->objects_list_partial(
+           start,
+           min,
+           MAX(min, cct->_conf->osd_scrub_chunk_max),
+           &objects,
+           &candidate_end);
+         assert(ret >= 0);
+
+         if (!objects.empty()) {
+           hobject_t back = objects.back();
+           while (candidate_end.has_snapset() &&
+                     candidate_end.get_head() == back.get_head()) {
+             candidate_end = back;
+             objects.pop_back();
+             if (objects.empty()) {
+               assert(0 ==
+                      "Somehow we got more than 2 objects which "
+                      "have the same head but are not clones");
+             }
+             back = objects.back();
+           }
+           if (candidate_end.has_snapset()) {
+             assert(candidate_end.get_head() != back.get_head());
+             candidate_end = candidate_end.get_object_boundary();
+           }
+         } else {
+           assert(candidate_end.is_max());
+         }
 
          if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
            // we'll be requeued by whatever made us unavailable for scrub
@@ -4190,7 +4190,8 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
           }
         }
 
-        // ask replicas to wait until last_update_applied >= scrubber.subset_last_update and then scan
+        // ask replicas to wait until
+        // last_update_applied >= scrubber.subset_last_update and then scan
         scrubber.waiting_on_whom.insert(pg_whoami);
         ++scrubber.waiting_on;
 
@@ -4337,7 +4338,7 @@ void PG::scrub_compare_maps()
   dout(10) << __func__ << " has maps, analyzing" << dendl;
 
   // construct authoritative scrub map for type specific scrubbing
-  ScrubMap authmap(scrubber.primary_scrubmap);
+  scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
   map<hobject_t, pair<uint32_t, uint32_t>, hobject_t::BitwiseComparator> missing_digest;
 
   if (acting.size() > 1) {
@@ -4399,13 +4400,34 @@ void PG::scrub_compare_maps()
     for (map<hobject_t, list<pg_shard_t>, hobject_t::BitwiseComparator>::iterator i = authoritative.begin();
         i != authoritative.end();
         ++i) {
-      authmap.objects.erase(i->first);
-      authmap.objects.insert(*(maps[i->second.back()]->objects.find(i->first)));
+      scrubber.cleaned_meta_map.objects.erase(i->first);
+      scrubber.cleaned_meta_map.objects.insert(
+       *(maps[i->second.back()]->objects.find(i->first))
+       );
+    }
+  }
+
+  ScrubMap for_meta_scrub;
+  if (scrubber.end.is_max() ||
+      scrubber.cleaned_meta_map.objects.empty()) {
+    scrubber.cleaned_meta_map.swap(for_meta_scrub);
+  } else {
+    auto iter = scrubber.cleaned_meta_map.objects.end();
+    --iter; // not empty, see if clause
+    auto begin = scrubber.cleaned_meta_map.objects.begin();
+    while (iter != begin) {
+      auto next = iter--;
+      if (next->first.get_head() != iter->first.get_head()) {
+       ++iter;
+       break;
+      }
     }
+    for_meta_scrub.objects.insert(begin, iter);
+    scrubber.cleaned_meta_map.objects.erase(begin, iter);
   }
 
   // ok, do the pg-type specific scrubbing
-  _scrub(authmap, missing_digest);
+  _scrub(for_meta_scrub, missing_digest);
   if (!scrubber.store->empty()) {
     if (state_test(PG_STATE_REPAIR)) {
       dout(10) << __func__ << ": discarding scrub results" << dendl;
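
The carry-over added to scrub_compare_maps splits the accumulated map at
the last head boundary: entries before the trailing run that shares the
final object's head go to _scrub now, while that trailing run waits in
cleaned_meta_map for the chunk that completes the object (when the chunk
ends at max, the code above simply swaps the whole map out instead). A
standalone sketch of the split, assuming a reduced key type in place of
hobject_t:

#include <iostream>
#include <map>
#include <string>

// Reduced key: entries belonging to one object share `head`.
struct Key {
  std::string head; // name of the head object this entry belongs to
  int snap;         // clone id (illustrative)
  bool operator<(const Key &r) const {
    return head != r.head ? head < r.head : snap < r.snap;
  }
};

int main() {
  std::map<Key, int> cleaned = {
      {{"a", 1}, 0}, {{"a", 2}, 0},  // complete object "a"
      {{"b", 1}, 0}, {{"b", 2}, 0}}; // possibly incomplete tail "b"

  // Walk back from the end to the first entry whose head differs from
  // its successor's; [begin, iter) is complete, [iter, end) is the tail.
  auto iter = cleaned.end();
  --iter; // map known non-empty here
  auto begin = cleaned.begin();
  while (iter != begin) {
    auto next = iter--;
    if (next->first.head != iter->first.head) {
      ++iter;
      break;
    }
  }
  std::map<Key, int> for_meta_scrub(begin, iter); // handed to _scrub
  cleaned.erase(begin, iter);                     // tail carried over
  std::cout << for_meta_scrub.size() << " scrubbed now, "
            << cleaned.size() << " carried over\n"; // 2 now, 2 carried
}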
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 10a81f8336f662af47db0c07a11f79ffc2d0bd79..cda845f68f1750e7821804962894b5f11034c9cc 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1169,6 +1169,9 @@ public:
     // Map from object with errors to good peers
     map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >, hobject_t::BitwiseComparator> authoritative;
 
+    // Cleaned map pending snap metadata scrub
+    ScrubMap cleaned_meta_map;
+
     // digest updates which we are waiting on
     int num_digest_updates_pending;
 
@@ -1267,6 +1270,7 @@ public:
       missing.clear();
       authoritative.clear();
       num_digest_updates_pending = 0;
+      cleaned_meta_map = ScrubMap();
     }
 
     void create_results(const hobject_t& obj);
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 6887fa6c937080ae55e611cfb4624d39315be78f..3ebff3dfb7955b67710ace7d6d49c71dc5714ad0 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -4061,6 +4061,14 @@ struct ScrubMap {
   eversion_t incr_since;
 
   void merge_incr(const ScrubMap &l);
+  void insert(const ScrubMap &r) {
+    objects.insert(r.objects.begin(), r.objects.end());
+  }
+  void swap(ScrubMap &r) {
+    ::swap(objects, r.objects);
+    ::swap(valid_through, r.valid_through);
+    ::swap(incr_since, r.incr_since);
+  }
 
   void encode(bufferlist& bl) const;
   void decode(bufferlist::iterator& bl, int64_t pool=-1);
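
Taken together, the two new helpers give the scrubber cheap
accumulate-and-drain semantics: insert() merges one chunk's map into
cleaned_meta_map, and swap() hands everything to the final _scrub pass
in O(1). A minimal sketch with a reduced type (the real ScrubMap also
swaps valid_through and incr_since, as above):

#include <map>
#include <string>
#include <utility>

// Reduced stand-in for ScrubMap: just the object map, no versions.
struct MiniScrubMap {
  std::map<std::string, int> objects;

  // Mirror of ScrubMap::insert(): merge another map's objects.  Note
  // that std::map::insert keeps the existing entry on a key collision,
  // which is why the authoritative fix-ups in scrub_compare_maps erase
  // before re-inserting.
  void insert(const MiniScrubMap &r) {
    objects.insert(r.objects.begin(), r.objects.end());
  }
  // Mirror of ScrubMap::swap(): O(1) exchange of contents.
  void swap(MiniScrubMap &r) { std::swap(objects, r.objects); }
};

int main() {
  MiniScrubMap cleaned, chunk, out;
  chunk.objects = {{"a", 1}};
  cleaned.insert(chunk); // accumulate this chunk's results
  cleaned.swap(out);     // drain everything for _scrub at a max boundary
  return out.objects.size() == 1 ? 0 : 1;
}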