]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
OSD,PG: Move pg reset code from OSD::advance_map to PG
authorSamuel Just <samuel.just@dreamhost.com>
Fri, 22 Apr 2011 00:42:51 +0000 (17:42 -0700)
committerSamuel Just <samuel.just@dreamhost.com>
Tue, 3 May 2011 17:19:45 +0000 (10:19 -0700)
OSD::advance_map previously handled resetting the PG for peering.  Now,
PG::acting_up_affected returns true if peering needs to be restarted and
PG::warm_restart takes care of restting the pg.

src/osd/OSD.cc
src/osd/PG.cc
src/osd/PG.h

index 31a8c39ff6b776eebf738c9d8663c2852cdad027..4c23be8897d0d4a99fe71de9e95a1d4aebf3d203 100644 (file)
@@ -3202,167 +3202,19 @@ void OSD::advance_map(ObjectStore::Transaction& t)
     pg_t pgid = it->first;
     PG *pg = it->second;
 
-    // get new acting set
-    vector<int> tup, tacting;
-    osdmap->pg_to_up_acting_osds(pgid, tup, tacting);
-    int role = osdmap->calc_pg_role(whoami, tacting, tacting.size());
-
     pg->lock();
-
-    // adjust removed_snaps?
-    if (pg->is_active() &&
-       !pg->pool->newly_removed_snaps.empty()) {
-      pg->snap_trimq.union_of(pg->pool->newly_removed_snaps);
-      dout(10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
-      pg->dirty_info = true;
-    }
-    
-    // no change?
-    if (tacting == pg->acting && tup == pg->up) {
-      if ((pg->prior_set.get() == NULL) || (!pg->prior_set_affected(osdmap))) {
-       dout(15) << *pg << " unaffected with "
-         << tup << "/" << tacting << " up/acting" << dendl;
-       pg->unlock();
-       continue;
-      }
+    if (pg->handle_advance_map(*osdmap, *lastmap)) {
+      pg->unlock();
+      continue;
     }
-
-    // -- there was a change! --
-    int oldrole = pg->get_role();
-    int oldprimary = pg->get_primary();
-    vector<int> oldacting = pg->acting;
-    vector<int> oldup = pg->up;
-    
+       
     // make sure we clear out any pg_temp change requests
     pg_temp_wanted.erase(pgid);
-    
-    pg->kick();
     pg->prior_set.reset(NULL);
 
-    // update PG
-    pg->acting.swap(tacting);
-    pg->up.swap(tup);
-    pg->set_role(role);
     
-    // did acting, up, primary|acker change?
-    if (tacting != pg->acting || tup != pg->up) {
-      // remember past interval
-      PG::Interval& i = pg->past_intervals[pg->info.history.same_acting_since];
-      i.first = pg->info.history.same_acting_since;
-      i.last = osdmap->get_epoch() - 1;
-
-      i.acting = oldacting;
-      i.up = oldup;
-      if (tacting != pg->acting)
-       pg->info.history.same_acting_since = osdmap->get_epoch();
-      if (tup != pg->up)
-       pg->info.history.same_up_since = osdmap->get_epoch();
-
-      if (i.acting.size())
-       i.maybe_went_rw = 
-         (lastmap->get_up_thru(i.acting[0]) >= i.first &&
-          lastmap->get_up_from(i.acting[0]) <= i.first) ||
-         i.first == pg->info.history.epoch_created;
-      else
-       i.maybe_went_rw = 0;
-
-      if (oldprimary != pg->get_primary())
-       pg->info.history.same_primary_since = osdmap->get_epoch();
-
-      dout(10) << *pg << " noting past " << i << dendl;
-      pg->dirty_info = true;
-    }
     pg->cancel_recovery();
 
-    // deactivate.
-    pg->state_clear(PG_STATE_ACTIVE);
-    pg->state_clear(PG_STATE_DOWN);
-    pg->state_clear(PG_STATE_PEERING);  // we'll need to restart peering
-    pg->state_clear(PG_STATE_DEGRADED);
-    pg->state_clear(PG_STATE_REPLAY);
-
-    if (pg->is_primary()) {
-      if (osdmap->get_pg_size(pg->info.pgid) != pg->acting.size())
-       pg->state_set(PG_STATE_DEGRADED);
-    }
-
-    // reset primary state?
-    if (oldrole == 0 || pg->get_role() == 0)
-      pg->clear_primary_state();
-
-    dout(10) << *pg
-            << " up " << oldup << " -> " << pg->up 
-            << ", acting " << oldacting << " -> " << pg->acting 
-            << ", role " << oldrole << " -> " << role << dendl; 
-    
-    // pg->on_*
-    for (unsigned i=0; i<oldacting.size(); i++)
-      if (osdmap->is_down(oldacting[i]))
-       pg->on_osd_failure(oldacting[i]);
-    pg->on_change();
-
-    if (pg->deleting) {
-      dout(10) << *pg << " canceling deletion!" << dendl;
-      pg->deleting = false;
-      remove_wq.dequeue(pg);
-    }
-    
-    if (role != oldrole) {
-      // old primary?
-      if (oldrole == 0) {
-       pg->state_clear(PG_STATE_CLEAN);
-       pg->clear_stats();
-       
-       // take replay queue waiters
-       list<Message*> ls;
-       for (map<eversion_t,MOSDOp*>::iterator it = pg->replay_queue.begin();
-            it != pg->replay_queue.end();
-            it++)
-         ls.push_back(it->second);
-       pg->replay_queue.clear();
-       take_waiters(ls);
-      }
-
-      pg->on_role_change();
-
-      // interrupt backlog generation
-      cancel_generate_backlog(pg);
-
-      // take active waiters
-      take_waiters(pg->waiting_for_active);
-
-      // new primary?
-      if (role == 0) {
-       // i am new primary
-       pg->state_clear(PG_STATE_STRAY);
-      } else {
-       // i am now replica|stray.  we need to send a notify.
-       pg->state_set(PG_STATE_STRAY);
-       pg->have_master_log = false;
-      }
-      
-    } else {
-      // no role change.
-      // did primary change?
-      if (pg->get_primary() != oldprimary) {    
-       // we need to announce
-       pg->state_set(PG_STATE_STRAY);
-        
-       dout(10) << *pg << " " << oldacting << " -> " << pg->acting 
-                << ", acting primary " 
-                << oldprimary << " -> " << pg->get_primary() 
-                << dendl;
-      } else {
-       // primary is the same.
-       if (role == 0) {
-         // i am (still) primary. but my replica set changed.
-         pg->state_clear(PG_STATE_CLEAN);
-         
-         dout(10) << *pg << " " << oldacting << " -> " << pg->acting
-                  << ", replicas changed" << dendl;
-       }
-      }
-    }
 
     // sanity check pg_temp
     if (pg->acting.empty() && pg->up.size() && pg->up[0] == whoami) {
index d29c56ebdef4e68060e2a99fea06e5d083625a2f..96604c1e4b17b80d2a6f310bcb3e7dc1f16517a1 100644 (file)
@@ -3503,6 +3503,190 @@ void PG::share_pg_log(const eversion_t &oldver)
   }
 }
 
+bool PG::handle_advance_map(OSDMap &osdmap,
+                           const OSDMap &lastmap)
+{
+  if (acting_up_affected(osdmap, lastmap)) {
+    return true;
+  } else {
+    warm_restart();
+    return false;
+  }
+}
+
+bool PG::acting_up_affected(OSDMap &osdmap,
+                const OSDMap &lastmap)
+{
+  // get new acting set
+  vector<int> tup, tacting;
+  osdmap.pg_to_up_acting_osds(info.pgid, tup, tacting);
+
+  if (acting != tacting || up != tup) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+/* Called before initializing peering during advance_map */
+void PG::warm_restart()
+{
+  OSDMap &lastmap = *osd->get_map(osd->osdmap->get_epoch() - 1);
+  OSDMap &osdmap = *osd->osdmap;
+  // -- there was a change! --
+  kick();
+
+  int oldrole = get_role();
+  int oldprimary = get_primary();
+
+  vector<int> oldacting, oldup; //About to get swapped with current (old)
+  osdmap.pg_to_up_acting_osds(info.pgid, oldup, oldacting);
+
+  // update PG
+  acting.swap(oldacting);
+  up.swap(oldup);
+
+  int role = osdmap.calc_pg_role(osd->whoami, acting, acting.size());
+  set_role(role);
+
+  // did acting, up, primary|acker change?
+  if (acting != oldacting || up != oldup) {
+    // remember past interval
+    PG::Interval& i = past_intervals[info.history.same_acting_since];
+    i.first = info.history.same_acting_since;
+    i.last = osdmap.get_epoch() - 1;
+    i.acting = oldacting;
+    i.up = oldup;
+
+    if (oldacting != acting) {
+      info.history.same_acting_since = osdmap.get_epoch();
+    }
+    if (oldup != up) {
+      info.history.same_up_since = osdmap.get_epoch();
+    }
+
+    if (i.acting.size()) {
+      i.maybe_went_rw = 
+       (lastmap.get_up_thru(i.acting[0]) >= i.first &&
+        lastmap.get_up_from(i.acting[0]) <= i.first) ||
+       i.first == info.history.epoch_created;
+    } else {
+      i.maybe_went_rw = 0;
+    }
+
+    if (oldprimary != get_primary()) {
+      info.history.same_primary_since = osdmap.get_epoch();
+    }
+
+    dout(10) << *this << " noting past " << i << dendl;
+    dirty_info = true;
+  }
+
+  dout(10) << " up " << oldup << " -> " << up 
+          << ", acting " << oldacting << " -> " << acting 
+          << ", role " << oldrole << " -> " << role << dendl; 
+
+  // deactivate.
+  state_clear(PG_STATE_ACTIVE);
+  state_clear(PG_STATE_DOWN);
+  state_clear(PG_STATE_PEERING);  // we'll need to restart peering
+  state_clear(PG_STATE_DEGRADED);
+  state_clear(PG_STATE_REPLAY);
+  state_clear(PG_STATE_CRASHED);
+
+  osd->cancel_generate_backlog(this);
+
+  peer_missing.clear();
+
+  if (is_primary()) {
+    if (osdmap.get_pg_size(info.pgid) != acting.size())
+      state_set(PG_STATE_DEGRADED);
+  }
+
+  // reset primary state?
+  if (oldrole == 0 || get_role() == 0)
+    clear_primary_state();
+
+    
+  // pg->on_*
+  /* TODO on_osd_failure does NOTHING! */
+#if 0
+  for (unsigned i=0; i<oldacting.size(); i++)
+    if (osdmap.is_down(oldacting[i]))
+      ->on_osd_failure(oldacting[i]);
+#endif
+
+  on_change();
+
+  if (deleting) {
+    dout(10) << *this << " canceling deletion!" << dendl;
+    deleting = false;
+    osd->remove_wq.dequeue(this);
+  }
+    
+  if (role != oldrole) {
+    // old primary?
+    if (oldrole == 0) {
+      state_clear(PG_STATE_CLEAN);
+      clear_stats();
+       
+      // take replay queue waiters
+      list<Message*> ls;
+      for (map<eversion_t,MOSDOp*>::iterator it = replay_queue.begin();
+          it != replay_queue.end();
+          it++)
+       ls.push_back(it->second);
+      replay_queue.clear();
+      osd->take_waiters(ls);
+    }
+
+    on_role_change();
+
+    // take active waiters
+    osd->take_waiters(waiting_for_active);
+
+    // new primary?
+    if (role == 0) {
+      // i am new primary
+      state_clear(PG_STATE_STRAY);
+    } else {
+      // i am now replica|stray.  we need to send a notify.
+      state_set(PG_STATE_STRAY);
+      have_master_log = false;
+    }
+      
+  } else {
+    // no role change.
+    // did primary change?
+    if (get_primary() != oldprimary) {    
+      // we need to announce
+      state_set(PG_STATE_STRAY);
+        
+      dout(10) << *this << " " << oldacting << " -> " << acting 
+              << ", acting primary " 
+              << oldprimary << " -> " << get_primary() 
+              << dendl;
+    } else {
+      // primary is the same.
+      if (role == 0) {
+       // i am (still) primary. but my replica set changed.
+       state_clear(PG_STATE_CLEAN);
+         
+       dout(10) << oldacting << " -> " << acting
+                << ", replicas changed" << dendl;
+      }
+    }
+  }
+  // make sure we clear out any pg_temp change requests
+  osd->pg_temp_wanted.erase(info.pgid);
+  cancel_recovery();
+
+  if (acting.empty() && up.size() && up[0] == osd->whoami) {
+    dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
+    osd->queue_want_pg_temp(info.pgid, acting);
+  }
+}
+
 unsigned int PG::Missing::num_missing() const
 {
   return missing.size();
index 4e21753f0c50a76f8edd55dc4ac0875c9dde39a3..1a1c159d1edeef92dbd8a1a64887d37e04d1205b 100644 (file)
@@ -1054,7 +1054,13 @@ public:
   void share_pg_info();
   void share_pg_log(const eversion_t &oldver);
 
+  void warm_restart();
+                   
+  bool acting_up_affected(OSDMap &osdmap,
+              const OSDMap &lastmap);
   // abstract bits
+  virtual bool handle_advance_map(OSDMap &osdmap,
+                                 const OSDMap &lastmap);
   virtual void do_op(MOSDOp *op) = 0;
   virtual void do_sub_op(MOSDSubOp *op) = 0;
   virtual void do_sub_op_reply(MOSDSubOpReply *op) = 0;