]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd: change build_push_op and submit_push_data base recovery strategy
authorlishuhao <lishuhao@unitedstack.com>
Sun, 14 Apr 2019 11:41:55 +0000 (19:41 +0800)
committerletterwuyu <lishuhao@unitedstack.com>
Mon, 6 May 2019 12:17:32 +0000 (20:17 +0800)
Signed-off-by: Ning Yao <yaoning@unitedstack.com>
Signed-off-by: lishuhao <lishuhao@unitedstack.com>
src/messages/MOSDPGLog.h
src/osd/PGBackend.h
src/osd/PrimaryLogPG.h
src/osd/ReplicatedBackend.cc
src/osd/ReplicatedBackend.h
src/osd/osd_types.cc
src/osd/osd_types.h

index 041e4bd0739b159122b73b92167567149997c94b..de3a50e0c0924b79805f9dd8b34f85fdf6bcde1b 100644 (file)
@@ -96,7 +96,7 @@ public:
     encode(epoch, payload);
     encode(info, payload);
     encode(log, payload);
-    encode(missing, payload);
+    encode(missing, payload, features);
     if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
       // pre-nautilus OSDs do not set last_peering_reset properly
       encode(epoch, payload);
index d8fca3277e05f52084c6f0c6df38743ca8637e24..3dfc5a918d45a7303efab6f7b50c9e92da50ef1f 100644 (file)
@@ -270,7 +270,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
 
      virtual spg_t primary_spg_t() const = 0;
      virtual pg_shard_t primary_shard() const = 0;
-
+     virtual uint64_t min_peer_features() const = 0;
      virtual hobject_t get_temp_recovery_object(const hobject_t& target,
                                                eversion_t version) = 0;
 
index f445df1f0ec81ebd7624a8d2af5adbdf57b63727..5942be6c765073f90ff1736a529349aa799e22ee 100644 (file)
@@ -511,7 +511,9 @@ public:
   pg_shard_t primary_shard() const override {
     return get_primary();
   }
-
+  uint64_t min_peer_features() const override {
+    return recovery_state.get_min_peer_features();
+  }
   void send_message_osd_cluster(
     int peer, Message *m, epoch_t from_epoch) override;
   void send_message_osd_cluster(
index 8063e0a9e132a139c1c4941ea631206582694413..5956750bbd70bb3d4eb43a5982fdd5a4ff92a824 100644 (file)
@@ -1143,6 +1143,14 @@ void ReplicatedBackend::calc_head_subsets(
   if (size)
     data_subset.insert(0, size);
 
+  if (HAVE_FEATURE(parent->min_peer_features(), SERVER_NAUTILUS)) {
+    const auto it = missing.get_items().find(head);
+    assert(it != missing.get_items().end());
+    data_subset.intersection_of(it->second.clean_regions.get_dirty_regions());
+    dout(10) << "calc_head_subsets " << head
+             << " data_subset " << data_subset << dendl;
+  }
+
   if (get_parent()->get_pool().allow_incomplete_clones()) {
     dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
     return;
@@ -1156,11 +1164,11 @@ void ReplicatedBackend::calc_head_subsets(
 
   interval_set<uint64_t> cloning;
   interval_set<uint64_t> prev;
+  hobject_t c = head;
   if (size)
     prev.insert(0, size);
 
   for (int j=snapset.clones.size()-1; j>=0; j--) {
-    hobject_t c = head;
     c.snap = snapset.clones[j];
     prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]);
     if (!missing.is_missing(c) &&
@@ -1168,23 +1176,29 @@ void ReplicatedBackend::calc_head_subsets(
        get_parent()->try_lock_for_read(c, manager)) {
       dout(10) << "calc_head_subsets " << head << " has prev " << c
               << " overlap " << prev << dendl;
-      clone_subsets[c] = prev;
-      cloning.union_of(prev);
+      cloning = prev;
       break;
     }
     dout(10) << "calc_head_subsets " << head << " does not have prev " << c
             << " overlap " << prev << dendl;
   }
 
+  cloning.intersection_of(data_subset);
+  if (cloning.empty()) {
+    dout(10) << "skipping clone, nothing needs to clone" << dendl;
+    return;
+  }
 
   if (cloning.num_intervals() > cct->_conf->osd_recover_clone_overlap_limit) {
     dout(10) << "skipping clone, too many holes" << dendl;
     get_parent()->release_locks(manager);
     clone_subsets.clear();
     cloning.clear();
+    return;
   }
 
   // what's left for us to push?
+  clone_subsets[c] = cloning;
   data_subset.subtract(cloning);
 
   dout(10) << "calc_head_subsets " << head
@@ -1287,9 +1301,9 @@ void ReplicatedBackend::prepare_pull(
   ObjectContextRef headctx,
   RPGHandle *h)
 {
-  ceph_assert(get_parent()->get_local_missing().get_items().count(soid));
-  eversion_t _v = get_parent()->get_local_missing().get_items().find(
-    soid)->second.need;
+  const auto missing_iter = get_parent()->get_local_missing().get_items().find(soid);
+  ceph_assert(missing_iter != get_parent()->get_local_missing().get_items().end());
+  eversion_t _v = missing_iter->second.need;
   ceph_assert(_v == v);
   const map<hobject_t, set<pg_shard_t>> &missing_loc(
     get_parent()->get_missing_loc_shards());
@@ -1363,11 +1377,15 @@ void ReplicatedBackend::prepare_pull(
 
     ceph_assert(ssc->snapset.clone_size.count(soid.snap));
     recovery_info.size = ssc->snapset.clone_size[soid.snap];
+    recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
   } else {
     // pulling head or unversioned object.
     // always pull the whole thing.
     recovery_info.copy_subset.insert(0, (uint64_t)-1);
+    if (HAVE_FEATURE(parent->min_peer_features(), SERVER_NAUTILUS))
+      recovery_info.copy_subset.intersection_of(missing_iter->second.clean_regions.get_dirty_regions());
     recovery_info.size = ((uint64_t)-1);
+    recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
   }
 
   h->pulls[fromshard].push_back(PullOp());
@@ -1378,7 +1396,8 @@ void ReplicatedBackend::prepare_pull(
   op.recovery_info.soid = soid;
   op.recovery_info.version = v;
   op.recovery_progress.data_complete = false;
-  op.recovery_progress.omap_complete = false;
+  op.recovery_progress.omap_complete = !missing_iter->second.clean_regions.omap_is_dirty() 
+                                && HAVE_FEATURE(parent->min_peer_features(), SERVER_NAUTILUS);
   op.recovery_progress.data_recovered_to = 0;
   op.recovery_progress.first = true;
 
@@ -1491,6 +1510,9 @@ int ReplicatedBackend::prep_push(
   ObcLockManager &&lock_manager)
 {
   get_parent()->begin_peer_recover(peer, soid);
+  const auto pmissing_iter = get_parent()->get_shard_missing().find(peer);
+  const auto missing_iter = pmissing_iter->second.get_items().find(soid);
+  assert(missing_iter != pmissing_iter->second.get_items().end());
   // take note.
   PushInfo &pi = pushing[soid][peer];
   pi.obc = obc;
@@ -1501,6 +1523,9 @@ int ReplicatedBackend::prep_push(
   pi.recovery_info.oi = obc->obs.oi;
   pi.recovery_info.ss = pop->recovery_info.ss;
   pi.recovery_info.version = version;
+  pi.recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
+  pi.recovery_progress.omap_complete = !missing_iter->second.clean_regions.omap_is_dirty() &&
+    HAVE_FEATURE(parent->min_peer_features(), SERVER_NAUTILUS);
   pi.lock_manager = std::move(lock_manager);
 
   ObjectRecoveryProgress new_progress;
@@ -1519,7 +1544,9 @@ void ReplicatedBackend::submit_push_data(
   const ObjectRecoveryInfo &recovery_info,
   bool first,
   bool complete,
+  bool clear_omap,
   bool cache_dont_need,
+  interval_set<uint64_t> &data_zeros,
   const interval_set<uint64_t> &intervals_included,
   bufferlist data_included,
   bufferlist omap_header,
@@ -1541,25 +1568,43 @@ void ReplicatedBackend::submit_push_data(
   }
 
   if (first) {
-    t->remove(coll, ghobject_t(target_oid));
-    t->touch(coll, ghobject_t(target_oid));
+    if (!complete) {
+      t->remove(coll, ghobject_t(target_oid));
+      t->touch(coll, ghobject_t(target_oid));
+      bufferlist bv = attrs.at(OI_ATTR);
+      object_info_t oi(bv);
+      t->set_alloc_hint(coll, ghobject_t(target_oid),
+                       oi.expected_object_size,
+                       oi.expected_write_size,
+                       oi.alloc_hint_flags);
+      } else {
+        if (!recovery_info.object_exist) {
+         t->remove(coll, ghobject_t(target_oid));
+          t->touch(coll, ghobject_t(target_oid));
+          bufferlist bv = attrs.at(OI_ATTR);
+          object_info_t oi(bv);
+          t->set_alloc_hint(coll, ghobject_t(target_oid),
+                            oi.expected_object_size,
+                            oi.expected_write_size,
+                            oi.alloc_hint_flags);
+        }
+        //remove xattr and update later if overwrite on original object
+        t->rmattrs(coll, ghobject_t(target_oid));
+        //if need update omap, clear the previous content first
+        if (clear_omap)
+          t->omap_clear(coll, ghobject_t(target_oid));
+      }
+
     t->truncate(coll, ghobject_t(target_oid), recovery_info.size);
-    if (omap_header.length()) 
+    if (omap_header.length())
       t->omap_setheader(coll, ghobject_t(target_oid), omap_header);
 
-    bufferlist bv = attrs.at(OI_ATTR);
-    object_info_t oi(bv);
-    t->set_alloc_hint(coll, ghobject_t(target_oid),
-                     oi.expected_object_size,
-                     oi.expected_write_size,
-                     oi.alloc_hint_flags);
+    struct stat st;
+    int r = store->stat(ch, ghobject_t(recovery_info.soid), &st);
     if (get_parent()->pg_is_remote_backfilling()) {
-      struct stat st;
       uint64_t size = 0;
-      int r = store->stat(ch, ghobject_t(recovery_info.soid), &st);
-      if (r == 0) {
+      if (r == 0)
         size = st.st_size;
-      }
       // Don't need to do anything if object is still the same size
       if (size != recovery_info.oi.size) {
         get_parent()->pg_add_local_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size);
@@ -1571,11 +1616,46 @@ void ReplicatedBackend::submit_push_data(
                << dendl;
       }
     }
+    if (!complete) {
+      //clone overlap content in local object
+      if (recovery_info.object_exist) {
+        assert(r == 0);
+        uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size);
+        interval_set<uint64_t> local_intervals_included, local_intervals_excluded;
+        if (local_size) {
+          local_intervals_included.insert(0, local_size);
+          local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset);
+          local_intervals_included.subtract(local_intervals_excluded);
+        }
+       for (interval_set<uint64_t>::const_iterator q = local_intervals_included.begin();
+          q != local_intervals_included.end();
+         ++q) {
+         dout(15) << " clone_range " << recovery_info.soid << " "
+                  << q.get_start() << "~" << q.get_len() << dendl;
+         t->clone_range(coll, ghobject_t(recovery_info.soid), ghobject_t(target_oid),
+             q.get_start(), q.get_len(), q.get_start());
+        }
+      }
+    }
   }
   uint64_t off = 0;
   uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
   if (cache_dont_need)
     fadvise_flags |= CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+  // Punch zeros for data, if fiemap indicates nothing but it is marked dirty
+  if (data_zeros.size() > 0) {
+    data_zeros.intersection_of(recovery_info.copy_subset);
+    assert(intervals_included.subset_of(data_zeros));
+    data_zeros.subtract(intervals_included);
+
+    dout(20) << __func__ <<" recovering object " << recovery_info.soid
+             << " copy_subset: " << recovery_info.copy_subset
+             << " intervals_inlcuded: " << intervals_included
+             << " data_zeros: " << data_zeros << dendl;
+
+    for (auto p = data_zeros.begin(); p != data_zeros.end(); ++p)
+      t->zero(coll, ghobject_t(target_oid), p.get_start(), p.get_len());
+  }
   for (interval_set<uint64_t>::const_iterator p = intervals_included.begin();
        p != intervals_included.end();
        ++p) {
@@ -1594,14 +1674,15 @@ void ReplicatedBackend::submit_push_data(
   if (complete) {
     if (!first) {
       dout(10) << __func__ << ": Removing oid "
-              << target_oid << " from the temp collection" << dendl;
+               << target_oid << " from the temp collection" << dendl;
       clear_temp_obj(target_oid);
       t->remove(coll, ghobject_t(recovery_info.soid));
       t->collection_move_rename(coll, ghobject_t(target_oid),
-                               coll, ghobject_t(recovery_info.soid));
+                                coll, ghobject_t(recovery_info.soid));
     }
 
     submit_push_complete(recovery_info, t);
+
   }
 }
 
@@ -1666,7 +1747,7 @@ bool ReplicatedBackend::handle_pull_response(
 
   const hobject_t &hoid = pop.soid;
   ceph_assert((data_included.empty() && data.length() == 0) ||
-        (!data_included.empty() && data.length() > 0));
+         (!data_included.empty() && data.length() > 0));
 
   auto piter = pulling.find(hoid);
   if (piter == pulling.end()) {
@@ -1719,18 +1800,28 @@ bool ReplicatedBackend::handle_pull_response(
   pi.recovery_progress = pop.after_progress;
 
   dout(10) << "new recovery_info " << pi.recovery_info
-          << ", new progress " << pi.recovery_progress
-          << dendl;
-
+           << ", new progress " << pi.recovery_progress
+           << dendl;
+  interval_set<uint64_t> data_zeros;
+  uint64_t z_offset = pop.before_progress.data_recovered_to;
+  uint64_t z_length = pop.after_progress.data_recovered_to - pop.before_progress.data_recovered_to;
+  if(z_length)
+    data_zeros.insert(z_offset, z_length);
   bool complete = pi.is_complete();
-
-  submit_push_data(pi.recovery_info, first,
-                  complete, pi.cache_dont_need,
-                  data_included, data,
-                  pop.omap_header,
-                  pop.attrset,
-                  pop.omap_entries,
-                  t);
+  bool clear_omap = !pop.before_progress.omap_complete;
+
+  submit_push_data(pi.recovery_info,
+                  first,
+                  complete,
+                  clear_omap,
+                  pi.cache_dont_need,
+                  data_zeros,
+                  data_included,
+                  data,
+                  pop.omap_header,
+                  pop.attrset,
+                  pop.omap_entries,
+                  t);
 
   pi.stat.num_keys_recovered += pop.omap_entries.size();
   pi.stat.num_bytes_recovered += data.length();
@@ -1769,12 +1860,20 @@ void ReplicatedBackend::handle_push(
   bool first = pop.before_progress.first;
   bool complete = pop.after_progress.data_complete &&
     pop.after_progress.omap_complete;
-
+  bool clear_omap = !pop.before_progress.omap_complete;
+  interval_set<uint64_t> data_zeros;
+  uint64_t z_offset = pop.before_progress.data_recovered_to;
+  uint64_t z_length = pop.after_progress.data_recovered_to - pop.before_progress.data_recovered_to;
+  if(z_length)
+    data_zeros.insert(z_offset, z_length);
   response->soid = pop.recovery_info.soid;
+
   submit_push_data(pop.recovery_info,
                   first,
                   complete,
+                  clear_omap,
                   true, // must be replicate
+                  data_zeros,
                   pop.data_included,
                   data,
                   pop.omap_header,
@@ -2025,6 +2124,9 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
       if (get_parent()->pg_is_repair())
         stat->num_objects_repaired++;
     }
+  } else if (progress.first && progress.omap_complete) {
+    // If omap is not changed, we need recovery omap when recovery cannot be completed once
+    new_progress.omap_complete = false;
   }
 
   if (stat) {
@@ -2138,10 +2240,14 @@ void ReplicatedBackend::handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply)
     if (progress.first && recovery_info.size == ((uint64_t)-1)) {
       // Adjust size and copy_subset
       recovery_info.size = st.st_size;
-      recovery_info.copy_subset.clear();
-      if (st.st_size)
-        recovery_info.copy_subset.insert(0, st.st_size);
-      ceph_assert(recovery_info.clone_subset.empty());
+      if (st.st_size) {
+        interval_set<uint64_t> object_range;
+        object_range.insert(0, st.st_size);
+        recovery_info.copy_subset.intersection_of(object_range);
+      } else {
+        recovery_info.copy_subset.clear();
+      }
+      assert(recovery_info.clone_subset.empty());
     }
 
     r = build_push_op(recovery_info, progress, 0, reply);
index 8f447495a4ed106d7b09136c2694c22675fc1023..6e89f8ca8755943504db78fdf3f5e3cdc962da36 100644 (file)
@@ -268,7 +268,9 @@ private:
   void submit_push_data(const ObjectRecoveryInfo &recovery_info,
                        bool first,
                        bool complete,
+                       bool clear_omap,
                        bool cache_dont_need,
+                       interval_set<uint64_t> &data_zeros,
                        const interval_set<uint64_t> &intervals_included,
                        bufferlist data_included,
                        bufferlist omap_header,
@@ -335,7 +337,7 @@ private:
       ceph_tid_t tid, Context *on_commit,
       OpRequestRef op, eversion_t v)
       : RefCountedObject(nullptr, 0),
-       tid(tid), on_commit(on_commit),
+      tid(tid), on_commit(on_commit),
        op(op), v(v) {}
     bool done() const {
       return waiting_for_commit.empty();
index 58b6f931568d09652d3b2e35e4f07dd4b4a02b2c..a0ad552c0efc3461b0e2c625dcc1b203a8c4834f 100644 (file)
@@ -5891,7 +5891,7 @@ void ObjectRecoveryProgress::dump(Formatter *f) const
 
 void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const
 {
-  ENCODE_START(2, 1, bl);
+  ENCODE_START(3, 1, bl);
   encode(soid, bl);
   encode(version, bl);
   encode(size, bl);
@@ -5899,13 +5899,14 @@ void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const
   encode(ss, bl);
   encode(copy_subset, bl);
   encode(clone_subset, bl);
+  encode(object_exist, bl);
   ENCODE_FINISH(bl);
 }
 
 void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl,
                                int64_t pool)
 {
-  DECODE_START(2, bl);
+  DECODE_START(3, bl);
   decode(soid, bl);
   decode(version, bl);
   decode(size, bl);
@@ -5913,8 +5914,9 @@ void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl,
   decode(ss, bl);
   decode(copy_subset, bl);
   decode(clone_subset, bl);
+  if (struct_v > 2)
+    decode(object_exist, bl);
   DECODE_FINISH(bl);
-
   if (struct_v < 2) {
     if (!soid.is_max() && soid.pool == -1)
       soid.pool = pool;
@@ -5936,6 +5938,7 @@ void ObjectRecoveryInfo::generate_test_instances(
   o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
   o.back()->version = eversion_t(0,0);
   o.back()->size = 100;
+  o.back()->object_exist = false;
 }
 
 
@@ -5956,6 +5959,7 @@ void ObjectRecoveryInfo::dump(Formatter *f) const
   }
   f->dump_stream("copy_subset") << copy_subset;
   f->dump_stream("clone_subset") << clone_subset;
+  f->dump_stream("object_exist") << object_exist;
 }
 
 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
@@ -5971,6 +5975,7 @@ ostream &ObjectRecoveryInfo::print(ostream &out) const
             << ", copy_subset: " << copy_subset
             << ", clone_subset: " << clone_subset
             << ", snapset: " << ss
+            << ", object_exist: " << object_exist
             << ")";
 }
 
index 57054e55ea5be15d58ae5d58b8534e772e2ae80d..8f42b4c3ecc8f2ee6b8c1a02dd5ebbc1869bf05a 100644 (file)
@@ -5421,8 +5421,9 @@ struct ObjectRecoveryInfo {
   SnapSet ss;   // only populated if soid is_snap()
   interval_set<uint64_t> copy_subset;
   std::map<hobject_t, interval_set<uint64_t>> clone_subset;
+  bool object_exist;
 
-  ObjectRecoveryInfo() : size(0) { }
+  ObjectRecoveryInfo() : size(0), object_exist(true) { }
 
   static void generate_test_instances(std::list<ObjectRecoveryInfo*>& o);
   void encode(ceph::buffer::list &bl, uint64_t features) const;