]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: track same_up_since, same_acting_since, same_primary_since
authorSage Weil <sage@newdream.net>
Thu, 6 Aug 2009 23:53:01 +0000 (16:53 -0700)
committerSage Weil <sage@newdream.net>
Mon, 10 Aug 2009 20:14:31 +0000 (13:14 -0700)
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h
src/osd/ReplicatedPG.cc
src/osd/osd_types.h

index c6426626d41a62d6669aa34eb9375551179eb62e..84f9f163887b0e1636e69b9fff9f93ffcb6df5e3 100644 (file)
@@ -682,7 +682,8 @@ PG *OSD::_create_lock_new_pg(pg_t pgid, vector<int>& acting, ObjectStore::Transa
   pg->up = pg->acting;
   pg->info.history.epoch_created = 
     pg->info.history.last_epoch_started =
-    pg->info.history.same_since =
+    pg->info.history.same_up_since =
+    pg->info.history.same_acting_since =
     pg->info.history.same_primary_since = osdmap->get_epoch();
 
   pg->write_info(t);
@@ -840,7 +841,7 @@ void OSD::calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set<int>& ps
  * up until now
  */
 void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from,
-                            vector<int>& last)
+                            vector<int>& lastup, vector<int>& lastacting)
 {
   dout(15) << "project_pg_history " << pgid
            << " from " << from << " to " << osdmap->get_epoch()
@@ -853,26 +854,31 @@ void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from,
     // verify during intermediate epoch (e-1)
     OSDMap *oldmap = get_map(e-1);
 
-    vector<int> acting;
-    oldmap->pg_to_acting_osds(pgid, acting);
+    vector<int> up, acting;
+    oldmap->pg_to_up_acting_osds(pgid, up, acting);
 
     // acting set change?
-    if (acting != last && 
-        e > h.same_since) {
+    if (acting != lastacting && e > h.same_acting_since) {
+      dout(15) << "project_pg_history " << pgid << " changed in " << e 
+                << " from " << acting << " -> " << lastacting << dendl;
+      h.same_acting_since = e;
+    }
+    // up set change?
+    if (up != lastup && e > h.same_up_since) {
       dout(15) << "project_pg_history " << pgid << " changed in " << e 
-                << " from " << acting << " -> " << last << dendl;
-      h.same_since = e;
+                << " from " << up << " -> " << lastup << dendl;
+      h.same_up_since = e;
     }
 
     // primary change?
-    if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) &&
+    if (!(!acting.empty() && !lastacting.empty() && acting[0] == lastacting[0]) &&
         e > h.same_primary_since) {
       dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
       h.same_primary_since = e;
     }
 
-    if (h.same_since >= e &&
-        h.same_primary_since >= e) break;
+    if (h.same_acting_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
+      break;
   }
 
   dout(15) << "project_pg_history end " << h << dendl;
@@ -2154,27 +2160,31 @@ void OSD::advance_map(ObjectStore::Transaction& t)
     pg->up.swap(tup);
     pg->set_role(role);
     
-    // did acting, primary|acker change?
-    if (tacting != pg->acting) {
+    // did acting, up, primary|acker change?
+    if (tacting != pg->acting || tup != pg->up) {
       // remember past interval
-      PG::Interval& i = pg->past_intervals[pg->info.history.same_since];
+      PG::Interval& i = pg->past_intervals[pg->info.history.same_acting_since];
+      i.first = pg->info.history.same_acting_since;
+      i.last = osdmap->get_epoch() - 1;
+
       i.acting = oldacting;
       i.up = oldup;
-      i.first = pg->info.history.same_since;
-      i.last = osdmap->get_epoch() - 1;
+      if (tacting != pg->acting)
+       pg->info.history.same_acting_since = osdmap->get_epoch();
+      if (tup != pg->up)
+       pg->info.history.same_up_since = osdmap->get_epoch();
+
       if (i.acting.size())
        i.maybe_went_rw = 
          lastmap->get_up_thru(i.acting[0]) >= i.first &&
          lastmap->get_up_from(i.acting[0]) <= i.first;
       else
        i.maybe_went_rw = 0;
-      dout(10) << *pg << " noting past " << i << dendl;
 
-      pg->info.history.same_since = osdmap->get_epoch();
-      pg->dirty_info = true;
-    }
-    if (oldprimary != pg->get_primary()) {
-      pg->info.history.same_primary_since = osdmap->get_epoch();
+      if (oldprimary != pg->get_primary())
+       pg->info.history.same_primary_since = osdmap->get_epoch();
+
+      dout(10) << *pg << " noting past " << i << dendl;
       pg->dirty_info = true;
     }
     pg->cancel_recovery();
@@ -2300,7 +2310,7 @@ void OSD::activate_map(ObjectStore::Transaction& t)
             !pg->is_active()) {
       // i am (inactive) primary
       if (!pg->is_peering() || 
-         (pg->need_up_thru && up_thru >= pg->info.history.same_since))
+         (pg->need_up_thru && up_thru >= pg->info.history.same_acting_since))
        pg->peer(t, query_map, &info_map);
     }
     else if (pg->is_stray() &&
@@ -2723,7 +2733,7 @@ void OSD::handle_pg_create(MOSDPGCreate *m)
 
     // figure history
     PG::Info::History history;
-    project_pg_history(pgid, history, created, acting);
+    project_pg_history(pgid, history, created, up, acting);
     
     // register.
     creating_pgs[pgid].created = created;
@@ -2855,7 +2865,7 @@ void OSD::handle_pg_notify(MOSDPGNotify *m)
       int role = osdmap->calc_pg_role(whoami, acting, acting.size());
 
       PG::Info::History history = it->history;
-      project_pg_history(pgid, history, m->get_epoch(), acting);
+      project_pg_history(pgid, history, m->get_epoch(), up, acting);
 
       if (m->get_epoch() < history.same_primary_since) {
         dout(10) << "handle_pg_notify pg " << pgid << " primary changed in "
@@ -2976,8 +2986,8 @@ void OSD::_process_pg_info(epoch_t epoch, int from,
     osdmap->pg_to_up_acting_osds(info.pgid, up, acting);
     int role = osdmap->calc_pg_role(whoami, acting, acting.size());
 
-    project_pg_history(info.pgid, info.history, epoch, acting);
-    if (epoch < info.history.same_since) {
+    project_pg_history(info.pgid, info.history, epoch, up, acting);
+    if (epoch < info.history.same_acting_since) {
       dout(10) << "got old info " << info << " on non-existent pg, ignoring" << dendl;
       return;
     }
@@ -2996,7 +3006,7 @@ void OSD::_process_pg_info(epoch_t epoch, int from,
     created++;
   } else {
     pg = _lookup_lock_pg(info.pgid);
-    if (epoch < pg->info.history.same_since) {
+    if (epoch < pg->info.history.same_primary_since) {
       dout(10) << *pg << " got old info " << info << ", ignoring" << dendl;
       pg->unlock();
       return;
@@ -3107,7 +3117,7 @@ void OSD::handle_pg_trim(MOSDPGTrim *m)
     dout(10) << " don't have pg " << m->pgid << dendl;
   } else {
     PG *pg = _lookup_lock_pg(m->pgid);
-    if (m->epoch < pg->info.history.same_since) {
+    if (m->epoch < pg->info.history.same_primary_since) {
       dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
       pg->unlock();
       goto out;
@@ -3167,9 +3177,9 @@ void OSD::handle_pg_query(MOSDPGQuery *m)
 
       // same primary?
       PG::Info::History history = it->second.history;
-      project_pg_history(pgid, history, m->get_epoch(), acting);
+      project_pg_history(pgid, history, m->get_epoch(), up, acting);
 
-      if (m->get_epoch() < history.same_since) {
+      if (m->get_epoch() < history.same_primary_since) {
         dout(10) << " pg " << pgid << " dne, and pg has changed in "
                  << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl;
         continue;
@@ -3199,9 +3209,9 @@ void OSD::handle_pg_query(MOSDPGQuery *m)
       pg = _lookup_lock_pg(pgid);
       
       // same primary?
-      if (m->get_epoch() < pg->info.history.same_since) {
+      if (m->get_epoch() < pg->info.history.same_primary_since) {
         dout(10) << *pg << " handle_pg_query primary changed in "
-                 << pg->info.history.same_since
+                 << pg->info.history.same_primary_since
                  << " (msg from " << m->get_epoch() << ")" << dendl;
        pg->unlock();
         continue;
@@ -3285,14 +3295,14 @@ void OSD::handle_pg_remove(MOSDPGRemove *m)
     }
 
     pg = _lookup_lock_pg(pgid);
-    if (pg->info.history.same_since <= m->get_epoch()) {
+    if (pg->info.history.same_acting_since <= m->get_epoch()) {
       dout(10) << *pg << " removing." << dendl;
       assert(pg->get_role() == -1);
       assert(pg->get_primary() == m->get_source().num());
       _remove_unlock_pg(pg);
     } else {
       dout(10) << *pg << " ignoring remove request, pg changed in epoch "
-              << pg->info.history.same_since << " > " << m->get_epoch() << dendl;
+              << pg->info.history.same_acting_since << " > " << m->get_epoch() << dendl;
       pg->unlock();
     }
   }
@@ -3784,7 +3794,7 @@ void OSD::handle_sub_op(MOSDSubOp *op)
 
   // same pg?
   //  if pg changes _at all_, we reset and repeer!
-  if (op->map_epoch < pg->info.history.same_since) {
+  if (op->map_epoch < pg->info.history.same_acting_since) {
     dout(10) << "handle_sub_op pg changed " << pg->info.history
             << " after " << op->map_epoch 
             << ", dropping" << dendl;
index b53092cdc4b169f4837f0f52b1d183485038e3f2..65f305217cb03b271721057cffe25790e7c36ed0 100644 (file)
@@ -444,7 +444,7 @@ protected:
   void load_pgs();
   void calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set<int>& pset);
   void project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from,
-                         vector<int>& last);
+                         vector<int>& lastup, vector<int>& lastacting);
 
   void wake_pg_waiters(pg_t pgid) {
     if (waiting_for_pg.count(pgid)) {
index 4f2d23061e7dcf1499d6f5ae8619a7cf59419754..a88d1ad5d2ba6811d2140f331d712e535f3c7e7a 100644 (file)
@@ -708,7 +708,7 @@ void PG::generate_past_intervals()
 {
   epoch_t first_epoch = 0;
   epoch_t stop = MAX(1, info.history.last_epoch_started);
-  epoch_t last_epoch = info.history.same_since - 1;
+  epoch_t last_epoch = info.history.same_acting_since - 1;
 
   dout(10) << "generate_past_intervals over epochs " << stop << "-" << last_epoch << dendl;
 
@@ -871,7 +871,7 @@ void PG::build_prior()
   bool some_down = false;
 
   // generate past intervals, if we don't have them.
-  if (info.history.same_since > info.history.last_epoch_started &&
+  if (info.history.same_acting_since > info.history.last_epoch_started &&
       (past_intervals.empty() ||
        past_intervals.begin()->first > info.history.last_epoch_started))
     generate_past_intervals();
@@ -1204,6 +1204,22 @@ void PG::peer(ObjectStore::Transaction& t,
     return;
   }
 
+  // do i need a backlog for an up peer excluded from acting?
+  bool need_backlog = false;
+  for (unsigned i=0; i<up.size(); i++) {
+    int o = up[i];
+    if (o == osd->whoami || is_acting(o))
+      continue;
+    Info& pi = peer_info[o];
+    if (pi.last_update < log.tail && !log.backlog) {
+      dout(10) << "must generate backlog for !acting peer osd" << o
+              << " whose last_update " << pi.last_update << " < my log.tail " << log.tail << dendl;
+      need_backlog = true;
+    }
+  }
+  if (need_backlog)
+    osd->queue_generate_backlog(this);
+
 
   /** COLLECT MISSING+LOG FROM PEERS **********/
   /*
@@ -1305,16 +1321,16 @@ void PG::peer(ObjectStore::Transaction& t,
 
   // -- do need to notify the monitor?
   if (must_notify_mon) {
-    if (osd->osdmap->get_up_thru(osd->whoami) < info.history.same_since) {
+    if (osd->osdmap->get_up_thru(osd->whoami) < info.history.same_acting_since) {
       dout(10) << "up_thru " << osd->osdmap->get_up_thru(osd->whoami)
-              << " < same_since " << info.history.same_since
+              << " < same_since " << info.history.same_acting_since
               << ", must notify monitor" << dendl;
       need_up_thru = true;
-      osd->queue_want_up_thru(info.history.same_since);
+      osd->queue_want_up_thru(info.history.same_acting_since);
       return;
     } else {
       dout(10) << "up_thru " << osd->osdmap->get_up_thru(osd->whoami)
-              << " >= same_since " << info.history.same_since
+              << " >= same_since " << info.history.same_acting_since
               << ", all is well" << dendl;
     }
   }
@@ -2137,9 +2153,9 @@ void PG::sub_op_scrub_reply(MOSDSubOpReply *op)
 {
   dout(7) << "sub_op_scrub_reply" << dendl;
 
-  if (op->map_epoch < info.history.same_primary_since) {
+  if (op->map_epoch < info.history.same_acting_since) {
     dout(10) << "sub_op_scrub discarding old sub_op from "
-            << op->map_epoch << " < " << info.history.same_primary_since << dendl;
+            << op->map_epoch << " < " << info.history.same_acting_since << dendl;
     delete op;
     return;
   }
@@ -2251,7 +2267,7 @@ void PG::scrub()
   osd->map_lock.get_read();
   lock();
  
-  epoch_t epoch = info.history.same_since;
+  epoch_t epoch = info.history.same_acting_since;
 
   if (!is_primary()) {
     dout(10) << "scrub -- not primary" << dendl;
@@ -2304,7 +2320,7 @@ void PG::scrub()
 
   /*
   lock();
-  if (epoch != info.history.same_since) {
+  if (epoch != info.history.same_acting_since) {
     dout(10) << "scrub  pg changed, aborting" << dendl;
     goto out;
   }
@@ -2315,7 +2331,7 @@ void PG::scrub()
             << " maps, waiting" << dendl;
     wait();
 
-    if (epoch != info.history.same_since ||
+    if (epoch != info.history.same_acting_since ||
        osd->is_stopping()) {
       dout(10) << "scrub  pg changed, aborting" << dendl;
       goto out;
@@ -2447,7 +2463,7 @@ void PG::scrub()
 
   /*
   lock();
-  if (epoch != info.history.same_since) {
+  if (epoch != info.history.same_acting_since) {
     dout(10) << "scrub  pg changed, aborting" << dendl;
     goto out;
   }
@@ -2465,7 +2481,7 @@ void PG::scrub()
 
   /*
   lock();
-  if (epoch != info.history.same_since) {
+  if (epoch != info.history.same_acting_since) {
     dout(10) << "scrub  pg changed, aborting" << dendl;
     goto out;
   }
index f139d2f12a888279e0f4484c6b5f31e746a45ef9..23732dc35eb76f19e6ce7966b16776e815efacda 100644 (file)
@@ -99,12 +99,13 @@ public:
       epoch_t epoch_created;       // epoch in which PG was created
       epoch_t last_epoch_started;  // lower bound on last epoch started (anywhere, not necessarily locally)
 
-      epoch_t same_since;          // same acting set since
+      epoch_t same_up_since;       // same acting set since
+      epoch_t same_acting_since;   // same acting set since
       epoch_t same_primary_since;  // same primary at least back through this epoch.
       History() :            
        epoch_created(0),
        last_epoch_started(0),
-       same_since(0), same_primary_since(0) {}
+       same_up_since(0), same_acting_since(0), same_primary_since(0) {}
 
       void merge(const History &other) {
        if (epoch_created < other.epoch_created)
@@ -116,13 +117,16 @@ public:
       void encode(bufferlist &bl) const {
        ::encode(epoch_created, bl);
        ::encode(last_epoch_started, bl);
-       ::encode(same_since, bl);
+       ::encode(same_up_since, bl);
+       ::encode(same_acting_since, bl);
        ::encode(same_primary_since, bl);
       }
-      void decode(bufferlist::iterator &bl) {
+      void decode(bufferlist::iterator &bl, int v=0) {
        ::decode(epoch_created, bl);
        ::decode(last_epoch_started, bl);
-       ::decode(same_since, bl);
+       if (v && v >= 20)
+         ::decode(same_up_since, bl);
+       ::decode(same_acting_since, bl);
        ::decode(same_primary_since, bl);
       }
     } history;
@@ -135,7 +139,7 @@ public:
     bool dne() const { return history.epoch_created == 0; }
 
     void encode(bufferlist &bl) const {
-      __u8 v = CEPH_OSD_ONDISK_VERSION;
+      __u8 v = 20;
       ::encode(v, bl);
 
       ::encode(pgid, bl);
@@ -148,7 +152,7 @@ public:
       ::encode(snap_trimq, bl);
     }
     void decode(bufferlist::iterator &bl) {
-      __u8 v = CEPH_OSD_ONDISK_VERSION;
+      __u8 v;
       ::decode(v, bl);
 
       ::decode(pgid, bl);
@@ -157,7 +161,7 @@ public:
       ::decode(log_tail, bl);
       ::decode(log_backlog, bl);
       ::decode(stats, bl);
-      history.decode(bl);
+      history.decode(bl, v);
       ::decode(snap_trimq, bl);
     }
   };
@@ -929,7 +933,7 @@ inline ostream& operator<<(ostream& out, const PG::Info::History& h)
 {
   return out << "ec=" << h.epoch_created
             << " les=" << h.last_epoch_started
-            << " " << h.same_since << "/" << h.same_primary_since;
+            << " " << h.same_up_since << "/" << h.same_acting_since;
 }
 
 inline ostream& operator<<(ostream& out, const PG::Info& pgi) 
index 3a5b95bd32802361c2e5e55b7637c1211380b09b..78232bbcff3f9550c211869740862fbcc45e4452 100644 (file)
@@ -2995,7 +2995,7 @@ struct C_OSD_Commit : public Context {
 
 void ReplicatedPG::_committed(epoch_t same_since, eversion_t last_complete)
 {
-  if (same_since == info.history.same_since) {
+  if (same_since == info.history.same_acting_since) {
     dout(10) << "_committed last_complete " << last_complete << " now ondisk" << dendl;
     last_complete_ondisk = last_complete;
 
@@ -3166,7 +3166,7 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op)
 
   // apply to disk!
   write_info(t);
-  unsigned r = osd->store->apply_transaction(t, new C_OSD_Commit(this, info.history.same_since,
+  unsigned r = osd->store->apply_transaction(t, new C_OSD_Commit(this, info.history.same_acting_since,
                                                                 info.last_complete));
   assert(r == 0);
 
index cb06e39a96c86c3782af8c3d64fb387c71310910..bc709b725f69535d1843f88e52460eb775cf4abb 100644 (file)
@@ -24,7 +24,6 @@
 
 
 
-#define CEPH_OSD_ONDISK_VERSION 19
 #define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v020"