From: Sage Weil Date: Thu, 18 Jun 2009 18:40:55 +0000 (-0700) Subject: osd: trim pg logs on recovery completion X-Git-Tag: v0.9~32 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=fffc9f38557f68735dcc174c2e8dcb8b52955f72;p=ceph.git osd: trim pg logs on recovery completion When replica finds itself fully up to date (last_complete == last_update) it tells the primary. Primary checks the same. If the primary find the min_last_complete_ondisk has changed, it sends out a trim command. This will let us drop huge pg logs out of memory after a recovery without waiting for IO and the usual piggybacked trimming logic to kick in. --- diff --git a/src/messages/MOSDPGTrim.h b/src/messages/MOSDPGTrim.h index c3139f691ddb..e6aaaf67a11b 100644 --- a/src/messages/MOSDPGTrim.h +++ b/src/messages/MOSDPGTrim.h @@ -12,7 +12,6 @@ * */ - #ifndef __MOSDPGTRIM_H #define __MOSDPGTRIM_H diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 75b2e382ef7f..93a74aaba484 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -3076,14 +3076,23 @@ void OSD::handle_pg_trim(MOSDPGTrim *m) goto out; } assert(pg); - assert(from == pg->acting[0]); - ObjectStore::Transaction t; - pg->trim(t, m->trim_to); - pg->write_info(t); + if (pg->is_primary()) { + // peer is informing us of their last_complete_ondisk + dout(10) << *pg << " replica osd" << from << " lcod " << m->trim_to << dendl; + pg->peer_last_complete_ondisk[from] = m->trim_to; + if (pg->calc_min_last_complete_ondisk()) { + dout(10) << *pg << " min lcod now " << pg->min_last_complete_ondisk << dendl; + pg->trim_peers(); + } + } else { + // primary is instructing us to trim + ObjectStore::Transaction t; + pg->trim(t, m->trim_to); + pg->write_info(t); + store->apply_transaction(t); + } pg->unlock(); - - store->apply_transaction(t); } out: diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 5719ec1c8253..d65a1880ee6f 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -25,6 +25,7 @@ #include "messages/MOSDPGLog.h" #include "messages/MOSDPGRemove.h" #include "messages/MOSDPGInfo.h" +#include "messages/MOSDPGTrim.h" #include "messages/MOSDSubOp.h" #include "messages/MOSDSubOpReply.h" @@ -1742,6 +1743,14 @@ void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v) t.zero(0, log_oid, 0, ondisklog.bottom & ~4095); } +void PG::trim_peers() +{ + dout(10) << "trim_peers" << dendl; + for (unsigned i=1; imessenger->send_message(new MOSDPGTrim(osd->osdmap->get_epoch(), info.pgid, min_last_complete_ondisk), + osd->osdmap->get_inst(acting[i])); +} + void PG::add_log_entry(Log::Entry& e, bufferlist& log_bl) { diff --git a/src/osd/PG.h b/src/osd/PG.h index 9cfb7ed6905a..336f4ac9a8a7 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -403,6 +403,7 @@ public: void trim(ObjectStore::Transaction &t, eversion_t s); void trim_write_ahead(eversion_t last_update); + ostream& print(ostream& out) const; }; @@ -734,6 +735,8 @@ public: if (a < min) min = a; } + if (min == min_last_complete_ondisk) + return false; min_last_complete_ondisk = min; return true; } @@ -857,6 +860,7 @@ public: void read_log(ObjectStore *store); void trim(ObjectStore::Transaction& t, eversion_t v); void trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v); + void trim_peers(); void read_state(ObjectStore *store); coll_t make_snap_collection(ObjectStore::Transaction& t, snapid_t sn); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index a71bf2b9fbf6..033471fdc2b4 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -2567,6 +2567,20 @@ void ReplicatedPG::_committed(epoch_t same_since, eversion_t last_complete) if (same_since == info.history.same_since) { dout(10) << "_committed last_complete " << last_complete << " now ondisk" << dendl; last_complete_ondisk = last_complete; + + if (last_complete_ondisk == info.last_update) { + if (is_replica()) { + // we are fully up to date. tell the primary! + osd->messenger->send_message(new MOSDPGTrim(osd->osdmap->get_epoch(), info.pgid, + last_complete_ondisk), + osd->osdmap->get_inst(get_primary())); + } else if (is_primary()) { + // we are the primary. tell replicas to trim? + if (calc_min_last_complete_ondisk()) + trim_peers(); + } + } + } else { dout(10) << "_committed pg has changed, not touching last_complete_ondisk" << dendl; } @@ -2993,7 +3007,6 @@ int ReplicatedPG::recover_primary(int max) if (is_all_uptodate()) { dout(-7) << "recover_primary complete" << dendl; finish_recovery(); - trim_replicas(); } else { dout(-10) << "recover_primary primary now complete, starting peer recovery" << dendl; } @@ -3040,7 +3053,6 @@ int ReplicatedPG::recover_replicas(int max) if (is_all_uptodate()) { finish_recovery(); - trim_replicas(); } else { dout(10) << "recover_replicas not all uptodate, acting " << acting << ", uptodate " << uptodate_set << dendl; } @@ -3049,21 +3061,6 @@ int ReplicatedPG::recover_replicas(int max) } -void ReplicatedPG::trim_replicas() -{ - dout(10) << "trim_replicas" << dendl; - - return; // hmm FIXME - - - // trim myself - eversion_t trim_to; - - for (unsigned i=1; imessenger->send_message(new MOSDPGTrim(osd->osdmap->get_epoch(), info.pgid, trim_to), - osd->osdmap->get_inst(acting[i])); -} - /** clean_up_local * remove any objects that we're storing but shouldn't. diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 8c0df019e8f8..7f6b6e35473f 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -418,7 +418,6 @@ protected: void finish_recovery_op(); int recover_primary(int max); int recover_replicas(int max); - void trim_replicas(); void sub_op_modify(MOSDSubOp *op); void sub_op_modify_ondisk(MOSDSubOp *op, int ackerosd, eversion_t last_complete);