]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: trim pg logs on recovery completion
authorSage Weil <sage@newdream.net>
Thu, 18 Jun 2009 18:40:55 +0000 (11:40 -0700)
committerSage Weil <sage@newdream.net>
Thu, 18 Jun 2009 18:40:55 +0000 (11:40 -0700)
When replica finds itself fully up to date (last_complete ==
last_update) it tells the primary.  Primary checks the same.
If the primary find the min_last_complete_ondisk has changed,
it sends out a trim command.

This will let us drop huge pg logs out of memory after a recovery
without waiting for IO and the usual piggybacked trimming logic
to kick in.

src/messages/MOSDPGTrim.h
src/osd/OSD.cc
src/osd/PG.cc
src/osd/PG.h
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h

index c3139f691ddbe2da2828cf1e03f32974493a3198..e6aaaf67a11be59268e1fe4cab81c6227ddfab70 100644 (file)
@@ -12,7 +12,6 @@
  * 
  */
 
-
 #ifndef __MOSDPGTRIM_H
 #define __MOSDPGTRIM_H
 
index 75b2e382ef7f85a8743d18aab2d855030f710266..93a74aaba4849530eb0dbfc9988547d36dfe0b0c 100644 (file)
@@ -3076,14 +3076,23 @@ void OSD::handle_pg_trim(MOSDPGTrim *m)
       goto out;
     }
     assert(pg);
-    assert(from == pg->acting[0]);
 
-    ObjectStore::Transaction t;
-    pg->trim(t, m->trim_to);
-    pg->write_info(t);
+    if (pg->is_primary()) {
+      // peer is informing us of their last_complete_ondisk
+      dout(10) << *pg << " replica osd" << from << " lcod " << m->trim_to << dendl;
+      pg->peer_last_complete_ondisk[from] = m->trim_to;
+      if (pg->calc_min_last_complete_ondisk()) {
+       dout(10) << *pg << " min lcod now " << pg->min_last_complete_ondisk << dendl;
+       pg->trim_peers();
+      }
+    } else {
+      // primary is instructing us to trim
+      ObjectStore::Transaction t;
+      pg->trim(t, m->trim_to);
+      pg->write_info(t);
+      store->apply_transaction(t);
+    }
     pg->unlock();
-
-    store->apply_transaction(t);
   }
 
  out:
index 5719ec1c825301ee660c82bf7088773b4ea30d1c..d65a1880ee6f12fff89a9a705cba0da72da185b1 100644 (file)
@@ -25,6 +25,7 @@
 #include "messages/MOSDPGLog.h"
 #include "messages/MOSDPGRemove.h"
 #include "messages/MOSDPGInfo.h"
+#include "messages/MOSDPGTrim.h"
 
 #include "messages/MOSDSubOp.h"
 #include "messages/MOSDSubOpReply.h"
@@ -1742,6 +1743,14 @@ void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v)
     t.zero(0, log_oid, 0, ondisklog.bottom & ~4095);
 }
 
+void PG::trim_peers()
+{
+  dout(10) << "trim_peers" << dendl;
+  for (unsigned i=1; i<acting.size(); i++)
+    osd->messenger->send_message(new MOSDPGTrim(osd->osdmap->get_epoch(), info.pgid, min_last_complete_ondisk),
+                                osd->osdmap->get_inst(acting[i]));
+}
+
 
 void PG::add_log_entry(Log::Entry& e, bufferlist& log_bl)
 {
index 9cfb7ed6905a084e43d84224f9bcb877ca7532c6..336f4ac9a8a7168b9f5afa4e177a6dd1ddeb164c 100644 (file)
@@ -403,6 +403,7 @@ public:
     void trim(ObjectStore::Transaction &t, eversion_t s);
     void trim_write_ahead(eversion_t last_update);
 
+
     ostream& print(ostream& out) const;
   };
   
@@ -734,6 +735,8 @@ public:
       if (a < min)
        min = a;
     }
+    if (min == min_last_complete_ondisk)
+      return false;
     min_last_complete_ondisk = min;
     return true;
   }
@@ -857,6 +860,7 @@ public:
   void read_log(ObjectStore *store);
   void trim(ObjectStore::Transaction& t, eversion_t v);
   void trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v);
+  void trim_peers();
 
   void read_state(ObjectStore *store);
   coll_t make_snap_collection(ObjectStore::Transaction& t, snapid_t sn);
index a71bf2b9fbf6d84a5a67b50a59dc666b18a0b613..033471fdc2b4663637b3948a4036b2c8ed629e5d 100644 (file)
@@ -2567,6 +2567,20 @@ void ReplicatedPG::_committed(epoch_t same_since, eversion_t last_complete)
   if (same_since == info.history.same_since) {
     dout(10) << "_committed last_complete " << last_complete << " now ondisk" << dendl;
     last_complete_ondisk = last_complete;
+
+    if (last_complete_ondisk == info.last_update) {
+      if (is_replica()) {
+       // we are fully up to date.  tell the primary!
+       osd->messenger->send_message(new MOSDPGTrim(osd->osdmap->get_epoch(), info.pgid,
+                                                   last_complete_ondisk),
+                                    osd->osdmap->get_inst(get_primary()));
+      } else if (is_primary()) {
+       // we are the primary.  tell replicas to trim?
+       if (calc_min_last_complete_ondisk())
+         trim_peers();
+      }
+    }
+
   } else {
     dout(10) << "_committed pg has changed, not touching last_complete_ondisk" << dendl;
   }
@@ -2993,7 +3007,6 @@ int ReplicatedPG::recover_primary(int max)
   if (is_all_uptodate()) {
     dout(-7) << "recover_primary complete" << dendl;
     finish_recovery();
-    trim_replicas();
   } else {
     dout(-10) << "recover_primary primary now complete, starting peer recovery" << dendl;
   }
@@ -3040,7 +3053,6 @@ int ReplicatedPG::recover_replicas(int max)
 
   if (is_all_uptodate()) {
     finish_recovery();
-    trim_replicas();
   } else {
     dout(10) << "recover_replicas not all uptodate, acting " << acting << ", uptodate " << uptodate_set << dendl;
   }
@@ -3049,21 +3061,6 @@ int ReplicatedPG::recover_replicas(int max)
 }
 
 
-void ReplicatedPG::trim_replicas()
-{
-  dout(10) << "trim_replicas" << dendl;
-
-  return;  // hmm FIXME
-
-
-  // trim myself
-  eversion_t trim_to;
-
-  for (unsigned i=1; i<acting.size(); i++)
-    osd->messenger->send_message(new MOSDPGTrim(osd->osdmap->get_epoch(), info.pgid, trim_to),
-                                osd->osdmap->get_inst(acting[i]));
-}
-
 
 /** clean_up_local
  * remove any objects that we're storing but shouldn't.
index 8c0df019e8f89e88c39e8b51113d5589287109a9..7f6b6e35473fea05e0b2abdb3b5cf504575a4a9b 100644 (file)
@@ -418,7 +418,6 @@ protected:
   void finish_recovery_op();
   int recover_primary(int max);
   int recover_replicas(int max);
-  void trim_replicas();
 
   void sub_op_modify(MOSDSubOp *op);
   void sub_op_modify_ondisk(MOSDSubOp *op, int ackerosd, eversion_t last_complete);