]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: always share past_intervals
authorSage Weil <sage.weil@dreamhost.com>
Sat, 28 Apr 2012 22:49:40 +0000 (15:49 -0700)
committerSage Weil <sage.weil@dreamhost.com>
Sat, 28 Apr 2012 22:49:40 +0000 (15:49 -0700)
Share past intervals when starting up new replicas.  This can happen via
an MOSDPGInfo or an MOSDPGLog message.

Fix up get_or_create_pg() so the past_intervals arg is required (and a ref,
like the other args). Fix doxygen comment.

Now the only time generate_past_intervals() should do any work is when
upgrading old clusters, during pg creation, and (possibly) during pg
split (when that is fully implemented).

Signed-off-by: Sage Weil <sage.weil@dreamhost.com>
src/messages/MOSDPGInfo.h
src/messages/MOSDPGLog.h
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h
src/osd/ReplicatedPG.cc

index a79a6e66c7a06fa7c4e81c7d52e6b4a280335d60..d39b05b679ba5b68de864c85a8b5f6de55dc7d9e 100644 (file)
 #include "osd/osd_types.h"
 
 class MOSDPGInfo : public Message {
+  static const int HEAD_VERSION = 2;
+  static const int COMPAT_VERSION = 1;
+
   epoch_t epoch;
 
 public:
-  vector<pg_info_t> pg_info;
+  vector<pair<pg_info_t,pg_interval_map_t> > pg_list;
 
   epoch_t get_epoch() { return epoch; }
 
-  MOSDPGInfo() : Message(MSG_OSD_PG_INFO) {}
-  MOSDPGInfo(version_t mv) :
-    Message(MSG_OSD_PG_INFO),
-    epoch(mv) { }
+  MOSDPGInfo()
+    : Message(MSG_OSD_PG_INFO, HEAD_VERSION, COMPAT_VERSION) {}
+  MOSDPGInfo(version_t mv)
+    : Message(MSG_OSD_PG_INFO, HEAD_VERSION, COMPAT_VERSION),
+      epoch(mv) { }
 private:
   ~MOSDPGInfo() {}
 
 public:
   const char *get_type_name() const { return "pg_info"; }
   void print(ostream& out) const {
-    out << "pg_info(" << pg_info.size() << " pgs e" << epoch << ")";
+    out << "pg_info(" << pg_list.size() << " pgs e" << epoch << ":";
+
+    for (vector<pair<pg_info_t,pg_interval_map_t> >::const_iterator i = pg_list.begin();
+         i != pg_list.end();
+         ++i) {
+      if (i != pg_list.begin())
+       out << ",";
+      out << i->first.pgid;
+      if (i->second.size())
+       out << "(" << i->second.size() << ")";
+    }
+
+    out << ")";
   }
 
   void encode_payload(uint64_t features) {
     ::encode(epoch, payload);
-    ::encode(pg_info, payload);
+
+    // v1 was vector<pg_info_t>
+    __u32 n = pg_list.size();
+    ::encode(n, payload);
+    for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = pg_list.begin();
+        p != pg_list.end();
+        p++)
+      ::encode(p->first, payload);
+
+    // v2 needs the pg_interval_map_t for each record
+    for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = pg_list.begin();
+        p != pg_list.end();
+        p++)
+      ::encode(p->second, payload);
   }
   void decode_payload() {
     bufferlist::iterator p = payload.begin();
     ::decode(epoch, p);
-    ::decode(pg_info, p);
+
+    // decode pg_info_t portion of the vector
+    __u32 n;
+    ::decode(n, p);
+    pg_list.resize(n);
+    for (unsigned i=0; i<n; i++)
+      ::decode(pg_list[i].first, p);
+
+    if (header.version >= 2) {
+      // get the pg_interval_map_t portion
+      for (unsigned i=0; i<n; i++)
+       ::decode(pg_list[i].second, p);
+    }
   }
 };
 
index bf4297d39d5f7243d96bf7a5e40ce529c139ab78..b296fb3997d393ae154847c028a19129cc5be09c 100644 (file)
@@ -20,7 +20,8 @@
 
 class MOSDPGLog : public Message {
 
-  static const int HEAD_VERSION = 2;
+  static const int HEAD_VERSION = 3;
+  static const int COMPAT_VERSION = 2;
 
   epoch_t epoch;
   /// query_epoch is the epoch of the query being responded to, or
@@ -33,17 +34,18 @@ public:
   pg_info_t info;
   pg_log_t log;
   pg_missing_t missing;
+  pg_interval_map_t past_intervals;
 
   epoch_t get_epoch() { return epoch; }
   pg_t get_pgid() { return info.pgid; }
   epoch_t get_query_epoch() { return query_epoch; }
 
-  MOSDPGLog() : Message(MSG_OSD_PG_LOG, HEAD_VERSION) { }
+  MOSDPGLog() : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION) { }
   MOSDPGLog(version_t mv, pg_info_t& i)
-    : Message(MSG_OSD_PG_LOG, HEAD_VERSION),
+    : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION),
       epoch(mv), query_epoch(mv), info(i)  { }
   MOSDPGLog(version_t mv, pg_info_t& i, epoch_t query_epoch)
-    : Message(MSG_OSD_PG_LOG, HEAD_VERSION),
+    : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION),
       epoch(mv), query_epoch(query_epoch), info(i)  { }
 
 private:
@@ -62,6 +64,7 @@ public:
     ::encode(log, payload);
     ::encode(missing, payload);
     ::encode(query_epoch, payload);
+    ::encode(past_intervals, payload);
   }
   void decode_payload() {
     bufferlist::iterator p = payload.begin();
@@ -72,6 +75,9 @@ public:
     if (header.version >= 2) {
       ::decode(query_epoch, p);
     }
+    if (header.version >= 3) {
+      ::decode(past_intervals, p);
+    }
   }
 };
 
index d6fc3a7ee61315e7caa6592acaf395cd4a16d429..203ac047ddfe7758f75f36c60be08d87ece01f75 100644 (file)
@@ -1113,7 +1113,7 @@ PG *OSD::_open_lock_pg(pg_t pgid, bool no_lockdep_check, bool hold_map_lock)
 
 PG *OSD::_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock,
                         int role, vector<int>& up, vector<int>& acting, pg_history_t history,
-                        pg_interval_map_t *pim,
+                        pg_interval_map_t& pi,
                         ObjectStore::Transaction& t)
 {
   assert(osd_lock.is_locked());
@@ -1133,7 +1133,7 @@ PG *OSD::_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock,
     history.last_epoch_started = history.epoch_created - 1;
   }
 
-  pg->init(role, up, acting, history, pim, &t);
+  pg->init(role, up, acting, history, pi, &t);
 
   dout(7) << "_create_lock_pg " << *pg << dendl;
   return pg;
@@ -1239,8 +1239,8 @@ void OSD::load_pgs()
  * look up a pg.  if we have it, great.  if not, consider creating it IF the pg mapping
  * hasn't changed since the given epoch and we are the primary.
  */
-PG *OSD::get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& created,
-                         bool primary, pg_interval_map_t *ppi,
+PG *OSD::get_or_create_pg(const pg_info_t& info, pg_interval_map_t& pi,
+                         epoch_t epoch, int from, int& created, bool primary,
                          ObjectStore::Transaction **pt,
                          C_Contexts **pfin)
 {
@@ -1289,7 +1289,7 @@ PG *OSD::get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& c
     // ok, create PG locally using provided Info and History
     *pt = new ObjectStore::Transaction;
     *pfin = new C_Contexts(g_ceph_context);
-    pg = _create_lock_pg(info.pgid, create, false, role, up, acting, history, ppi, **pt);
+    pg = _create_lock_pg(info.pgid, create, false, role, up, acting, history, pi, **pt);
       
     created++;
     dout(10) << *pg << " is new" << dendl;
@@ -3896,8 +3896,9 @@ void OSD::do_split(PG *parent, set<pg_t>& childpgids, ObjectStore::Transaction&
     history.epoch_created = history.same_up_since =
       history.same_interval_since = history.same_primary_since =
       osdmap->get_epoch();
+    pg_interval_map_t pi;
     PG *pg = _create_lock_pg(*q, true, true,
-                            parent->get_role(), parent->up, parent->acting, history, NULL, t);
+                            parent->get_role(), parent->up, parent->acting, history, pi, t);
     children[*q] = pg;
     dout(10) << "  child " << *pg << dendl;
   }
@@ -4129,9 +4130,9 @@ void OSD::handle_pg_create(OpRequestRef op)
     if (can_create_pg(pgid)) {
       ObjectStore::Transaction *t = new ObjectStore::Transaction;
       C_Contexts *fin = new C_Contexts(g_ceph_context);
-
+      pg_interval_map_t pi;
       PG *pg = _create_lock_pg(pgid, true, false,
-                              0, creating_pgs[pgid].acting, creating_pgs[pgid].acting, history, NULL,
+                              0, creating_pgs[pgid].acting, creating_pgs[pgid].acting, history, pi,
                               *t);
       creating_pgs.erase(pgid);
 
@@ -4207,8 +4208,8 @@ void OSD::do_infos(map<int,MOSDPGInfo*>& info_map)
   for (map<int,MOSDPGInfo*>::iterator p = info_map.begin();
        p != info_map.end();
        ++p) { 
-    for (vector<pg_info_t>::iterator i = p->second->pg_info.begin();
-        i != p->second->pg_info.end();
+    for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator i = p->second->pg_list.begin();
+        i != p->second->pg_list.end();
         ++i) {
       dout(20) << "Sending info " << *i << " to osd." << p->first << dendl;
     }
@@ -4250,7 +4251,7 @@ void OSD::handle_pg_notify(OpRequestRef op)
 
     ObjectStore::Transaction *t;
     C_Contexts *fin;
-    pg = get_or_create_pg(it->first, m->get_epoch(), from, created, true, &it->second, &t, &fin);
+    pg = get_or_create_pg(it->first, it->second, m->get_epoch(), from, created, true, &t, &fin);
     if (!pg)
       continue;
 
@@ -4291,8 +4292,8 @@ void OSD::handle_pg_log(OpRequestRef op)
   int created = 0;
   ObjectStore::Transaction *t;
   C_Contexts *fin;  
-  PG *pg = get_or_create_pg(m->info, m->get_epoch(), 
-                           from, created, false, NULL, &t, &fin);
+  PG *pg = get_or_create_pg(m->info, m->past_intervals, m->get_epoch(), 
+                           from, created, false, &t, &fin);
   if (!pg) {
     return;
   }
@@ -4339,13 +4340,13 @@ void OSD::handle_pg_info(OpRequestRef op)
 
   int created = 0;
 
-  for (vector<pg_info_t>::iterator p = m->pg_info.begin();
-       p != m->pg_info.end();
+  for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = m->pg_list.begin();
+       p != m->pg_list.end();
        ++p) {
     ObjectStore::Transaction *t = 0;
     C_Contexts *fin = 0;
-    PG *pg = get_or_create_pg(*p, m->get_epoch(), 
-                             from, created, false, NULL, &t, &fin);
+    PG *pg = get_or_create_pg(p->first, p->second, m->get_epoch(), 
+                             from, created, false, &t, &fin);
     if (!pg)
       continue;
 
@@ -4359,7 +4360,7 @@ void OSD::handle_pg_info(OpRequestRef op)
 
     PG::RecoveryCtx rctx(0, &info_map, 0, &fin->contexts, t);
 
-    pg->handle_info(from, *p, &rctx);
+    pg->handle_info(from, p->first, &rctx);
 
     int tr = store->queue_transaction(&pg->osr, t, new ObjectStore::C_DeleteTransaction(t), fin);
     assert(!tr);
index 8843a9ac714d79e8a57b426fddd7c8ad72654414..bfcaadf9e91afb8c683f8221ccf9d09b9a39235d 100644 (file)
@@ -471,13 +471,12 @@ protected:
   PG   *_open_lock_pg(pg_t pg, bool no_lockdep_check=false, bool hold_map_lock=false);
   PG   *_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock,
                        int role, vector<int>& up, vector<int>& acting, pg_history_t history,
-                       pg_interval_map_t *pim,
-                       ObjectStore::Transaction& t);
+                       pg_interval_map_t& pi, ObjectStore::Transaction& t);
 
   PG *lookup_lock_raw_pg(pg_t pgid);
 
-  PG *get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& pcreated, bool primary,
-                      pg_interval_map_t *ppi,
+  PG *get_or_create_pg(const pg_info_t& info, pg_interval_map_t& pi,
+                      epoch_t epoch, int from, int& pcreated, bool primary,
                       ObjectStore::Transaction **pt,
                       C_Contexts **pfin);
   
index 9da82510dc8ef2571b61bc7154228a8e324e366d..7c27ed7f9926eb5bda58fa33f4519c996d57b741 100644 (file)
@@ -1301,7 +1301,7 @@ void PG::activate(ObjectStore::Transaction& t, list<Context*>& tfin,
          dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
          if (activator_map->count(peer) == 0)
            (*activator_map)[peer] = new MOSDPGInfo(get_osdmap()->get_epoch());
-         (*activator_map)[peer]->pg_info.push_back(info);
+         (*activator_map)[peer]->pg_list.push_back(make_pair(info, past_intervals));
        } else {
          dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
          m = new MOSDPGLog(get_osdmap()->get_epoch(), info);
@@ -1334,12 +1334,15 @@ void PG::activate(ObjectStore::Transaction& t, list<Context*>& tfin,
        m->log.copy_after(log, pi.last_update);
       }
 
+      // share past_intervals if we are creating the pg on the replica
+      if (pi.dne())
+       m->past_intervals = past_intervals;
+
       if (pi.last_backfill != hobject_t::get_max())
        state_set(PG_STATE_BACKFILL);
       else
        active++;
 
-
       // update local version of peer's missing list!
       if (m && pi.last_backfill != hobject_t()) {
         for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
@@ -1484,7 +1487,7 @@ void PG::_activate_committed(epoch_t e, entity_inst_t& primary)
     MOSDPGInfo *m = new MOSDPGInfo(e);
     pg_info_t i = info;
     i.history.last_epoch_started = e;
-    m->pg_info.push_back(i);
+    m->pg_list.push_back(make_pair(i, pg_interval_map_t()));
     osd->cluster_messenger->send_message(m, primary);
   }
   unlock();
@@ -1815,20 +1818,24 @@ void PG::clear_stats()
  * @param newup up set
  * @param newacting acting set
  * @param history pg history
+ * @param pi past_intervals
  * @param t transaction to write out our new state in
  */
 void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t& history,
-             pg_interval_map_t *pim,
+             pg_interval_map_t& pi,
              ObjectStore::Transaction *t)
 {
   dout(10) << "init role " << role << " up " << newup << " acting " << newacting
-          << " history " << history << dendl;
+          << " history " << history
+          << " " << pi.size() << " past_intervals"
+          << dendl;
 
   set_role(role);
   acting = newacting;
   up = newup;
 
   info.history = history;
+  past_intervals.swap(pi);
 
   info.stats.up = up;
   info.stats.acting = acting;
@@ -1836,11 +1843,6 @@ void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t
 
   osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
 
-  if (pim) {
-    past_intervals.swap(*pim);
-    dout(10) << "init  got " << past_intervals.size() << " past intervals" << dendl;
-  }
-
   write_info(*t);
   write_log(*t);
 }
@@ -3276,7 +3278,7 @@ void PG::share_pg_info()
   for (unsigned i=1; i<acting.size(); i++) {
     int peer = acting[i];
     MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
-    m->pg_info.push_back(info);
+    m->pg_list.push_back(make_pair(info, pg_interval_map_t()));
     osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(peer));
   }
 }
index 46d988949c983106aae71457f25d2ed3b5456e0b..c9975f04f3723b789bb27c384b42ec970de627c5 100644 (file)
@@ -1291,7 +1291,7 @@ public:
   bool  is_empty() const { return info.last_update == eversion_t(0,0); }
 
   void init(int role, vector<int>& up, vector<int>& acting, pg_history_t& history,
-           pg_interval_map_t *pim, ObjectStore::Transaction *t);
+           pg_interval_map_tpim, ObjectStore::Transaction *t);
 
   // pg on-disk state
   void do_pending_flush();
index 69f0ed2a757259a4e59f6a2eca7c652a66b3de68..f7c073f15e33dca6d5e68fe63c7ddd4a14b17121 100644 (file)
@@ -6480,7 +6480,7 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed)
       // tell replicas
       for (unsigned i=1; i<acting.size(); i++) {
        MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
-       m->pg_info.push_back(info);
+       m->pg_list.push_back(make_pair(info, pg_interval_map_t()));
        osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(acting[i]));
       }
     }