]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: recover degraded objects _before_ modifying it
authorSage Weil <sage@newdream.net>
Mon, 19 Jul 2010 21:44:09 +0000 (14:44 -0700)
committerSage Weil <sage@newdream.net>
Mon, 19 Jul 2010 21:44:09 +0000 (14:44 -0700)
This will slow down writes to degraded objects because we will wait for it
to recover before applying the write.  OTOH it will be robust in the case
of large objects.  We can optimize the small object update (and overwrite)
cases later.

Signed-off-by: Sage Weil <sage@newdream.net>
src/osd/OSD.cc
src/osd/PG.cc
src/osd/PG.h
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h

index d61f7a74bfa8e09b092446d5b5e874b3bd18cb88..3494c79afcfde000afa50c93d28c7604e04a1714 100644 (file)
@@ -4411,14 +4411,20 @@ void OSD::handle_op(MOSDOp *op)
     }
   }
 
-  // missing object?
   if ((op->get_flags() & CEPH_OSD_FLAG_PGOP) == 0) {
+    // missing object?
     sobject_t head(op->get_oid(), CEPH_NOSNAP);
     if (pg->is_missing_object(head)) {
       pg->wait_for_missing_object(head, op);
       pg->unlock();
       return;
     }
+
+    if (op->may_write() && pg->is_degraded_object(head)) {
+      pg->wait_for_degraded_object(head, op);
+      pg->unlock();
+      return;
+    }
   }
 
   
index 0032c41afe160ea6bb0f4a72a5022c826d91fe58..f4932758519a10fdd8695034e0eacbeaa8910d5e 100644 (file)
@@ -2408,6 +2408,14 @@ bool PG::block_if_wrlocked(MOSDOp* op, object_info_t& oi)
   return false; //the object wasn't locked, so the operation can be handled right away
 }
 
+void PG::take_object_waiters(hash_map<sobject_t, list<Message*> >& m)
+{
+  for (hash_map<sobject_t, list<Message*> >::iterator it = m.begin();
+       it != m.end();
+       it++)
+    osd->take_waiters(it->second);
+  m.clear();
+}
 
 
 // ==========================================================================================
index 6157aee0f43d00f7a9d332195fa90a4f0297a78c..c0948c0d3ee00eef2edac79b4ddc651558210f66 100644 (file)
@@ -764,8 +764,10 @@ public:
   // pg waiters
   list<class Message*>            waiting_for_active;
   hash_map<sobject_t, 
-           list<class Message*> > waiting_for_missing_object;   
+           list<class Message*> > waiting_for_missing_object, waiting_for_degraded_object;   
   map<eversion_t,class MOSDOp*>   replay_queue;
+
+  void take_object_waiters(hash_map<sobject_t, list<Message*> >& m);
   
   hash_map<sobject_t, list<Message*> > waiting_for_wr_unlock; 
 
@@ -981,6 +983,9 @@ public:
   virtual bool is_missing_object(const sobject_t& oid) = 0;
   virtual void wait_for_missing_object(const sobject_t& oid, Message *op) = 0;
 
+  virtual bool is_degraded_object(const sobject_t& oid) = 0;
+  virtual void wait_for_degraded_object(const sobject_t& oid, Message *op) = 0;
+
   virtual void on_osd_failure(int osd) = 0;
   virtual void on_role_change() = 0;
   virtual void on_change() = 0;
index 2d155342f29dd73703c9fe904dd7ab4c674190d0..43bccbb2729b725e8970df4696bc330f902be965 100644 (file)
@@ -78,7 +78,6 @@ bool ReplicatedPG::is_missing_object(const sobject_t& soid)
 {
   return missing.missing.count(soid);
 }
 
 void ReplicatedPG::wait_for_missing_object(const sobject_t& soid, Message *m)
 {
@@ -103,6 +102,39 @@ void ReplicatedPG::wait_for_missing_object(const sobject_t& soid, Message *m)
   waiting_for_missing_object[soid].push_back(m);
 }
 
+bool ReplicatedPG::is_degraded_object(const sobject_t& soid)
+{
+  if (missing.missing.count(soid))
+    return true;
+  for (unsigned i = 1; i < acting.size(); i++) {
+    int peer = acting[i];
+    if (peer_missing.count(peer) &&
+       peer_missing[peer].missing.count(soid))
+      return true;
+  }
+  return false;
+}
+
+void ReplicatedPG::wait_for_degraded_object(const sobject_t& soid, Message *m)
+{
+  assert(is_degraded_object(soid));
+
+  // we don't have it (yet).
+  if (pushing.count(soid)) {
+    dout(7) << "degraded "
+           << soid 
+           << ", already pushing"
+           << dendl;
+  } else {
+    dout(7) << "degraded " 
+           << soid 
+           << ", pushing"
+           << dendl;
+    recover_object_replicas(soid);
+  }
+  waiting_for_degraded_object[soid].push_back(m);
+}
+
 
 // ==========================================================
 
@@ -2259,16 +2291,6 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now,
 
   for (unsigned i=1; i<acting.size(); i++) {
     int peer = acting[i];
-
-    if (peer_missing.count(peer) &&
-        peer_missing[peer].is_missing(soid)) {
-      // push it before this update. 
-      // FIXME, this is probably extra much work (eg if we're about to overwrite)
-      repop->obc->ondisk_read_lock();
-      push_to_replica(soid, peer);
-      start_recovery_op(soid);
-      repop->obc->ondisk_read_unlock();
-    }
     
     // forward the write/update/whatever
     MOSDSubOp *wr = new MOSDSubOp(repop->ctx->reqid, info.pgid, soid,
@@ -3278,6 +3300,10 @@ void ReplicatedPG::sub_op_push_reply(MOSDSubOpReply *reply)
     if (pushing[soid].empty()) {
       dout(10) << "pushed " << soid << " to all replicas" << dendl;
       finish_recovery_op(soid);
+      if (waiting_for_degraded_object.count(soid)) {
+       osd->take_waiters(waiting_for_degraded_object[soid]);
+       waiting_for_degraded_object.erase(soid);
+      }
     } else {
       dout(10) << "pushed " << soid << ", still waiting for push ack from " 
               << pushing[soid] << dendl;
@@ -3702,14 +3728,12 @@ void ReplicatedPG::on_role_change()
   dout(10) << "on_role_change" << dendl;
 
   // take object waiters
-  for (hash_map<sobject_t, list<Message*> >::iterator it = waiting_for_missing_object.begin();
-       it != waiting_for_missing_object.end();
-       it++)
-    osd->take_waiters(it->second);
-  waiting_for_missing_object.clear();
+  take_object_waiters(waiting_for_missing_object);
+  take_object_waiters(waiting_for_degraded_object);
 }
 
 
+
 // clear state.  called on recovery completion AND cancellation.
 void ReplicatedPG::_clear_recovery_state()
 {
@@ -3871,6 +3895,38 @@ int ReplicatedPG::recover_primary(int max)
   return started;
 }
 
+int ReplicatedPG::recover_object_replicas(const sobject_t& soid)
+{
+  int started = 0;
+
+  dout(10) << "recover_object_replicas " << soid << dendl;
+
+  ObjectContext *obc = lookup_object_context(soid);
+  if (obc) {
+    dout(10) << " ondisk_read_lock for " << soid << dendl;
+    obc->ondisk_read_lock();
+  }
+  
+  start_recovery_op(soid);
+  started++;
+
+  // who needs it?  
+  for (unsigned i=1; i<acting.size(); i++) {
+    int peer = acting[i];
+    if (peer_missing.count(peer) &&
+       peer_missing[peer].is_missing(soid)) 
+      push_to_replica(soid, peer);
+  }
+  
+  if (obc) {
+    dout(10) << " ondisk_read_unlock on " << soid << dendl;
+    obc->ondisk_read_unlock();
+    put_object_context(obc);
+  }
+
+  return started;
+}
+
 int ReplicatedPG::recover_replicas(int max)
 {
   int started = 0;
@@ -3891,30 +3947,7 @@ int ReplicatedPG::recover_replicas(int max)
     sobject_t soid = peer_missing[peer].rmissing.begin()->second;
     eversion_t v = peer_missing[peer].rmissing.begin()->first;
 
-    ObjectContext *obc = lookup_object_context(soid);
-    if (obc) {
-      dout(10) << " ondisk_read_lock for " << soid << dendl;
-      obc->ondisk_read_lock();
-    }
-
-    start_recovery_op(soid);
-    started++;
-
-    push_to_replica(soid, peer);
-
-    // do other peers need it too?
-    for (i++; i<acting.size(); i++) {
-      int peer = acting[i];
-      if (peer_missing.count(peer) &&
-          peer_missing[peer].is_missing(soid)) 
-       push_to_replica(soid, peer);
-    }
-
-    if (obc) {
-      dout(10) << " ondisk_read_unlock on " << soid << dendl;
-      obc->ondisk_read_unlock();
-      put_object_context(obc);
-    }
+    started += recover_object_replicas(soid);
 
     if (started >= max)
       return started;
index fda5c31b487357e770f99a8edd78732f7652928f..29eda2721564180b79ba36e17fefd79bc21c1e97 100644 (file)
@@ -458,6 +458,7 @@ protected:
   // push
   map<sobject_t, set<int> > pushing;
 
+  int recover_object_replicas(const sobject_t& soid);
   void calc_head_subsets(SnapSet& snapset, const sobject_t& head,
                         Missing& missing,
                         interval_set<uint64_t>& data_subset,
@@ -594,6 +595,9 @@ public:
   bool is_missing_object(const sobject_t& oid);
   void wait_for_missing_object(const sobject_t& oid, Message *op);
 
+  bool is_degraded_object(const sobject_t& oid);
+  void wait_for_degraded_object(const sobject_t& oid, Message *op);
+
   void on_osd_failure(int o);
   void on_acker_change();
   void on_role_change();