git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: use a temporary object for recovery
author: Sage Weil <sage@redhat.com>
author date: Tue, 30 Dec 2014 18:16:10 +0000 (10:16 -0800)
committer: Sage Weil <sage@redhat.com>
commit date: Fri, 19 Jun 2015 00:02:45 +0000 (17:02 -0700)
Currently we recover objects directly into position by deleting and then
overwriting the target object.  This means that we lose the object if we
are recovering in multiple steps and we fail partway through.

This is also the last user of collection_move(), which we would like to
deprecate.

Instead, generate a unique temp object name (the combination of pgid,
object version, interval, and snap is unique), and recover to that.  Use
the existing temp object cleanup machinery to throw out a partial recovery result.

Signed-off-by: Sage Weil <sage@redhat.com>
src/osd/ECBackend.cc
src/osd/PGBackend.h
src/osd/ReplicatedBackend.cc
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h

index b04776c15c384fa07d9fb642fcf609fd5a17b59c..7f2db01746b3b3a6876b27b02403b251acd6be91 100644 (file)
@@ -245,18 +245,25 @@ void ECBackend::handle_recovery_push(
 {
   bool oneshot = op.before_progress.first && op.after_progress.data_complete;
   coll_t tcoll = oneshot ? coll : get_temp_coll(m->t);
+  ghobject_t tobj;
+  if (oneshot) {
+    tobj = ghobject_t(op.soid, ghobject_t::NO_GEN,
+                     get_parent()->whoami_shard().shard);
+  } else {
+    tobj = ghobject_t(get_parent()->get_temp_recovery_object(op.version,
+                                                            op.soid.snap),
+                     ghobject_t::NO_GEN,
+                     get_parent()->whoami_shard().shard);
+    if (op.before_progress.first) {
+      dout(10) << __func__ << ": Adding oid "
+              << tobj.hobj << " in the temp collection" << dendl;
+      add_temp_obj(tobj.hobj);
+    }
+  }
+
   if (op.before_progress.first) {
-    get_parent()->on_local_recover_start(
-      op.soid,
-      m->t);
-    m->t->remove(
-      get_temp_coll(m->t),
-      ghobject_t(
-       op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
-    m->t->touch(
-      tcoll,
-      ghobject_t(
-       op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+    m->t->remove(tcoll, tobj);
+    m->t->touch(tcoll, tobj);
   }
 
   if (!op.data_included.empty()) {
@@ -266,8 +273,7 @@ void ECBackend::handle_recovery_push(
 
     m->t->write(
       tcoll,
-      ghobject_t(
-       op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+      tobj,
       start,
       op.data.length(),
       op.data);
@@ -276,22 +282,22 @@ void ECBackend::handle_recovery_push(
   }
 
   if (op.before_progress.first) {
-    if (!oneshot)
-      add_temp_obj(op.soid);
     assert(op.attrset.count(string("_")));
     m->t->setattrs(
       tcoll,
-      ghobject_t(
-       op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+      tobj,
       op.attrset);
   }
 
   if (op.after_progress.data_complete && !oneshot) {
-    clear_temp_obj(op.soid);
-    m->t->collection_move(
-      coll,
-      tcoll,
-      ghobject_t(
+    dout(10) << __func__ << ": Removing oid "
+            << tobj.hobj << " from the temp collection" << dendl;
+    clear_temp_obj(tobj.hobj);
+    m->t->remove(coll, ghobject_t(
+       op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+    m->t->collection_move_rename(
+      tcoll, tobj,
+      coll, ghobject_t(
        op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
   }
   if (op.after_progress.data_complete) {
index aa9b6ca5cec52f30a03ddad628a4bdf5b02d3c45..6cc2dd47b44ec39dcf30c37ef2c95f9f30dcdfaf 100644 (file)
@@ -56,9 +56,6 @@
    public:
      /// Recovery
 
-     virtual void on_local_recover_start(
-       const hobject_t &oid,
-       ObjectStore::Transaction *t) = 0;
      /**
       * Called with the transaction recovering oid
       */
      virtual uint64_t min_peer_features() const = 0;
 
      virtual bool transaction_use_tbl() = 0;
+     virtual hobject_t get_temp_recovery_object(eversion_t version,
+                                               snapid_t snap) = 0;
 
      virtual void send_message_osd_cluster(
        int peer, Message *m, epoch_t from_epoch) = 0;
index 19c3ee401be187d9134f38a3d10a2a262ac30f70..202121562b31b27c191af66824a7492bd6c46c5c 100644 (file)
@@ -1696,21 +1696,26 @@ void ReplicatedBackend::submit_push_data(
   ObjectStore::Transaction *t)
 {
   coll_t target_coll;
+  hobject_t target_oid;
   if (first && complete) {
     target_coll = coll;
+    target_oid = recovery_info.soid;
   } else {
-    dout(10) << __func__ << ": Creating oid "
-            << recovery_info.soid << " in the temp collection" << dendl;
-    add_temp_obj(recovery_info.soid);
     target_coll = get_temp_coll(t);
+    target_oid = get_parent()->get_temp_recovery_object(recovery_info.version,
+                                                       recovery_info.soid.snap);
+    if (first) {
+      dout(10) << __func__ << ": Adding oid "
+              << target_oid << " in the temp collection" << dendl;
+      add_temp_obj(target_oid);
+    }
   }
 
   if (first) {
-    get_parent()->on_local_recover_start(recovery_info.soid, t);
-    t->remove(get_temp_coll(t), recovery_info.soid);
-    t->touch(target_coll, recovery_info.soid);
-    t->truncate(target_coll, recovery_info.soid, recovery_info.size);
-    t->omap_setheader(target_coll, recovery_info.soid, omap_header);
+    t->remove(target_coll, target_oid);
+    t->touch(target_coll, target_oid);
+    t->truncate(target_coll, target_oid, recovery_info.size);
+    t->omap_setheader(target_coll, target_oid, omap_header);
   }
   uint64_t off = 0;
   for (interval_set<uint64_t>::const_iterator p = intervals_included.begin();
@@ -1718,22 +1723,21 @@ void ReplicatedBackend::submit_push_data(
        ++p) {
     bufferlist bit;
     bit.substr_of(data_included, off, p.get_len());
-    t->write(target_coll, recovery_info.soid,
+    t->write(target_coll, target_oid,
             p.get_start(), p.get_len(), bit);
     off += p.get_len();
   }
 
-  t->omap_setkeys(target_coll, recovery_info.soid,
-                 omap_entries);
-  t->setattrs(target_coll, recovery_info.soid,
-             attrs);
+  t->omap_setkeys(target_coll, target_oid, omap_entries);
+  t->setattrs(target_coll, target_oid, attrs);
 
   if (complete) {
     if (!first) {
       dout(10) << __func__ << ": Removing oid "
-              << recovery_info.soid << " from the temp collection" << dendl;
-      clear_temp_obj(recovery_info.soid);
-      t->collection_move(coll, target_coll, recovery_info.soid);
+              << target_oid << " from the temp collection" << dendl;
+      clear_temp_obj(target_oid);
+      t->remove(coll, recovery_info.soid);
+      t->collection_move_rename(target_coll, target_oid, coll, recovery_info.soid);
     }
 
     submit_push_complete(recovery_info, t);
index a443899a572addbf7acff4cd80ebdf0e4836ea89..75d14f995586f2985e4c1fecfe67f38c4b2a7645 100644 (file)
@@ -175,15 +175,6 @@ public:
 // ======================
 // PGBackend::Listener
 
-
-void ReplicatedPG::on_local_recover_start(
-  const hobject_t &oid,
-  ObjectStore::Transaction *t)
-{
-  pg_log.revise_have(oid, eversion_t());
-  remove_snap_mapped_object(*t, oid);
-}
-
 void ReplicatedPG::on_local_recover(
   const hobject_t &hoid,
   const object_stat_sum_t &stat_diff,
@@ -193,7 +184,9 @@ void ReplicatedPG::on_local_recover(
   )
 {
   dout(10) << __func__ << ": " << hoid << dendl;
+
   ObjectRecoveryInfo recovery_info(_recovery_info);
+  clear_object_snap_mapping(t, hoid);
   if (recovery_info.soid.snap < CEPH_NOSNAP) {
     assert(recovery_info.oi.snaps.size());
     OSDriver::OSTransaction _t(osdriver.get_transaction(t));
@@ -5657,6 +5650,19 @@ hobject_t ReplicatedPG::generate_temp_object()
   return hoid;
 }
 
+hobject_t ReplicatedPG::get_temp_recovery_object(eversion_t version, snapid_t snap)
+{
+  ostringstream ss;
+  ss << "temp_recovering_" << info.pgid  // (note this includes the shardid)
+     << "_" << version
+     << "_" << info.history.same_interval_since
+     << "_" << snap;
+  // pgid + version + interval + snapid is unique, and short
+  hobject_t hoid = hobject_t::make_temp(ss.str());
+  dout(20) << __func__ << " " << hoid << dendl;
+  return hoid;
+}
+
 int ReplicatedPG::prepare_transaction(OpContext *ctx)
 {
   assert(!ctx->ops.empty());
@@ -8327,6 +8333,7 @@ void ReplicatedPG::send_remove_op(
   osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
 }
 
+
 void ReplicatedPG::finish_degraded_object(const hobject_t& oid)
 {
   dout(10) << "finish_degraded_object " << oid << dendl;
index 5bc36e49bac35fe790fd07546c34bb12c8b318ed..94ef080dbb8f3b3014642983b599a47347be14d2 100644 (file)
@@ -267,9 +267,6 @@ public:
   }
 
   /// Listener methods
-  void on_local_recover_start(
-    const hobject_t &oid,
-    ObjectStore::Transaction *t);
   void on_local_recover(
     const hobject_t &oid,
     const object_stat_sum_t &stat_diff,
@@ -1430,6 +1427,8 @@ private:
   uint64_t temp_seq; ///< last id for naming temp objects
   coll_t get_temp_coll(ObjectStore::Transaction *t);
   hobject_t generate_temp_object();  ///< generate a new temp object name
+  /// generate a new temp object name (for recovery)
+  hobject_t get_temp_recovery_object(eversion_t version, snapid_t snap);
 public:
   void get_colls(list<coll_t> *out) {
     out->push_back(coll);