]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: track both up and acting sets in PG, PG::Interval
authorSage Weil <sage@newdream.net>
Thu, 6 Aug 2009 20:02:53 +0000 (13:02 -0700)
committerSage Weil <sage@newdream.net>
Mon, 10 Aug 2009 20:14:07 +0000 (13:14 -0700)
src/osd/OSD.cc
src/osd/OSDMap.h
src/osd/PG.cc
src/osd/PG.h

index 309cd4764f4e782e1661169caea6c408cc82c4e9..3f9ad3d0dec7902d8e1c14b3bc40501448263ed1 100644 (file)
@@ -678,6 +678,7 @@ PG *OSD::_create_lock_new_pg(pg_t pgid, vector<int>& acting, ObjectStore::Transa
 
   pg->set_role(0);
   pg->acting.swap(acting);
+  pg->up = pg->acting;
   pg->info.history.epoch_created = 
     pg->info.history.last_epoch_started =
     pg->info.history.same_since =
@@ -2094,9 +2095,9 @@ void OSD::advance_map(ObjectStore::Transaction& t)
     PG *pg = it->second;
 
     // get new acting set
-    vector<int> tacting;
-    int nrep = osdmap->pg_to_acting_osds(pgid, tacting);
-    int role = osdmap->calc_pg_role(whoami, tacting, nrep);
+    vector<int> tup, tacting;
+    osdmap->pg_to_up_acting_osds(pgid, tup, tacting);
+    int role = osdmap->calc_pg_role(whoami, tacting, tacting.size());
 
     pg->lock();
 
@@ -2112,8 +2113,9 @@ void OSD::advance_map(ObjectStore::Transaction& t)
     }
     
     // no change?
-    if (tacting == pg->acting && (pg->is_active() || !pg->prior_set_affected(osdmap))) {
-      dout(15) << *pg << " unchanged|active with " << tacting << dendl;
+    if (tacting == pg->acting && tup == pg->up &&
+       (pg->is_active() || !pg->prior_set_affected(osdmap))) {
+      dout(15) << *pg << " unchanged|active with " << tup << "/" << tacting << " up/acting" << dendl;
       pg->unlock();
       continue;
     }
@@ -2122,12 +2124,14 @@ void OSD::advance_map(ObjectStore::Transaction& t)
     int oldrole = pg->get_role();
     int oldprimary = pg->get_primary();
     vector<int> oldacting = pg->acting;
+    vector<int> oldup = pg->up;
     
     pg->kick();
     pg->clear_prior();
 
     // update PG
     pg->acting.swap(tacting);
+    pg->up.swap(tup);
     pg->set_role(role);
     
     // did acting, primary|acker change?
@@ -2135,6 +2139,7 @@ void OSD::advance_map(ObjectStore::Transaction& t)
       // remember past interval
       PG::Interval& i = pg->past_intervals[pg->info.history.same_since];
       i.acting = oldacting;
+      i.up = oldup;
       i.first = pg->info.history.same_since;
       i.last = osdmap->get_epoch() - 1;
       if (i.acting.size())
@@ -2660,14 +2665,21 @@ void OSD::handle_pg_create(MOSDPGCreate *m)
     }
    
     // is it still ours?
-    vector<int> acting;
-    int nrep = osdmap->pg_to_acting_osds(on, acting);
-    int role = osdmap->calc_pg_role(whoami, acting, nrep);
+    vector<int> up, acting;
+    osdmap->pg_to_up_acting_osds(on, up, acting);
+    int role = osdmap->calc_pg_role(whoami, acting, acting.size());
 
     if (role != 0) {
       dout(10) << "mkpg " << pgid << "  not primary (role=" << role << "), skipping" << dendl;
       continue;
     }
+    if (up != acting) {
+      dout(10) << "mkpg " << pgid << "  up " << up << " != acting " << acting << dendl;
+      stringstream ss;
+      ss << "mkpg " << pgid << " up " << up << " != acting " << acting;
+      logclient.log(LOG_ERROR, ss);
+      continue;
+    }
 
     // does it already exist?
     if (_have_pg(pgid)) {
@@ -2810,9 +2822,9 @@ void OSD::handle_pg_notify(MOSDPGNotify *m)
 
     if (!_have_pg(pgid)) {
       // same primary?
-      vector<int> acting;
-      int nrep = osdmap->pg_to_acting_osds(pgid, acting);
-      int role = osdmap->calc_pg_role(whoami, acting, nrep);
+      vector<int> up, acting;
+      osdmap->pg_to_up_acting_osds(pgid, up, acting);
+      int role = osdmap->calc_pg_role(whoami, acting, acting.size());
 
       PG::Info::History history = it->history;
       project_pg_history(pgid, history, m->get_epoch(), acting);
@@ -2846,6 +2858,7 @@ void OSD::handle_pg_notify(MOSDPGNotify *m)
       if (!pg) {
        pg = _create_lock_pg(pgid, t);
        pg->acting.swap(acting);
+       pg->up.swap(up);
        pg->set_role(role);
        pg->info.history = history;
        pg->clear_primary_state();  // yep, notably, set hml=false
@@ -2931,9 +2944,9 @@ void OSD::_process_pg_info(epoch_t epoch, int from,
 
   PG *pg = 0;
   if (!_have_pg(info.pgid)) {
-    vector<int> acting;
-    int nrep = osdmap->pg_to_acting_osds(info.pgid, acting);
-    int role = osdmap->calc_pg_role(whoami, acting, nrep);
+    vector<int> up, acting;
+    osdmap->pg_to_up_acting_osds(info.pgid, up, acting);
+    int role = osdmap->calc_pg_role(whoami, acting, acting.size());
 
     project_pg_history(info.pgid, info.history, epoch, acting);
     if (epoch < info.history.same_since) {
@@ -2946,6 +2959,7 @@ void OSD::_process_pg_info(epoch_t epoch, int from,
     pg = _create_lock_pg(info.pgid, t);
     dout(10) << " got info on new pg, creating" << dendl;
     pg->acting.swap(acting);
+    pg->up.swap(up);
     pg->set_role(role);
     pg->info.history = info.history;
     pg->write_info(t);
@@ -3119,9 +3133,9 @@ void OSD::handle_pg_query(MOSDPGQuery *m)
 
     if (pg_map.count(pgid) == 0) {
       // get active crush mapping
-      vector<int> acting;
-      int nrep = osdmap->pg_to_acting_osds(pgid, acting);
-      int role = osdmap->calc_pg_role(whoami, acting, nrep);
+      vector<int> up, acting;
+      osdmap->pg_to_up_acting_osds(pgid, up, acting);
+      int role = osdmap->calc_pg_role(whoami, acting, acting.size());
 
       // same primary?
       PG::Info::History history = it->second.history;
@@ -3144,6 +3158,7 @@ void OSD::handle_pg_query(MOSDPGQuery *m)
       ObjectStore::Transaction t;
       pg = _create_lock_pg(pgid, t);
       pg->acting.swap( acting );
+      pg->up.swap( up );
       pg->set_role(role);
       pg->info.history = history;
       pg->write_info(t);
index 667265faefdb315fc16b85984f54fb2b8c01d0d7..b4d59501f029acbf5f98b447569fa812a1a10242 100644 (file)
@@ -761,7 +761,7 @@ private:
 
     map<pg_t,vector<int> >::iterator p = pg_temp.find(pg);
     if (p != pg_temp.end())
-      raw = p->second;
+      raw = p->second;      
     else
       pg_to_osds(pg, raw);
     
@@ -773,6 +773,29 @@ private:
     return osds.size();
   }
 
+  void pg_to_up_acting_osds(pg_t pg, vector<int>& up, vector<int>& acting) {
+    // get rush list
+    vector<int> raw;
+    pg_to_osds(pg, raw);
+
+    up.clear();
+    for (unsigned i=0; i<raw.size(); i++) {
+      if (!exists(raw[i]) || is_down(raw[i])) continue;
+      up.push_back(raw[i]);
+    }
+    
+    map<pg_t,vector<int> >::iterator p = pg_temp.find(pg);
+    if (p != pg_temp.end()) {
+      raw = p->second;
+      acting.clear();
+      for (unsigned i=0; i<raw.size(); i++) {
+       if (!exists(raw[i]) || is_down(raw[i])) continue;
+       acting.push_back(raw[i]);
+      }
+    } else
+      acting = up;
+  }
+
 
   int lookup_pg_pool_name(const char *name) {
     if (name_pool.count(name))
index 1aa3e0afd0169e641b08427febbc4c509b8df9d7..04b2bb3aa01b57cbf742efb803e443ff7c397609 100644 (file)
@@ -717,8 +717,8 @@ void PG::generate_past_intervals()
        last_epoch >= stop;
        last_epoch = first_epoch - 1) {
     OSDMap *lastmap = nextmap;
-    vector<int> tacting;
-    lastmap->pg_to_acting_osds(get_pgid(), tacting);
+    vector<int> tup, tacting;
+    lastmap->pg_to_up_acting_osds(get_pgid(), tup, tacting);
     
     // calc first_epoch, first_map
     for (first_epoch = last_epoch; first_epoch > stop; first_epoch--) {
@@ -732,6 +732,7 @@ void PG::generate_past_intervals()
     Interval &i = past_intervals[first_epoch];
     i.first = first_epoch;
     i.last = last_epoch;
+    i.up.swap(tup);
     i.acting.swap(tacting);
     if (i.acting.size()) {
       i.maybe_went_rw = 
index 80c3bf2c399038cb3cb77808b70e4a03f7b73f6d..a8cdc62217f65aabf41f1e73b86c8f86168f7029 100644 (file)
@@ -622,19 +622,21 @@ public:
 
 public:
   struct Interval {
-    vector<int> acting;
+    vector<int> up, acting;
     epoch_t first, last;
     bool maybe_went_rw;
 
     void encode(bufferlist& bl) const {
       ::encode(first, bl);
       ::encode(last, bl);
+      ::encode(up, bl);
       ::encode(acting, bl);
       ::encode(maybe_went_rw, bl);
     }
     void decode(bufferlist::iterator& bl) {
       ::decode(first, bl);
       ::decode(last, bl);
+      ::decode(up, bl);
       ::decode(acting, bl);
       ::decode(maybe_went_rw, bl);
     }
@@ -670,7 +672,7 @@ public:
 
   // primary state
  public:
-  vector<int> acting;
+  vector<int> up, acting;
   map<int,eversion_t> peer_last_complete_ondisk;
   eversion_t  min_last_complete_ondisk;  // min over last_complete_ondisk, peer_last_complete_ondisk
   eversion_t  pg_trim_to;
@@ -992,7 +994,10 @@ inline ostream& operator<<(ostream& out, const PG::Interval& i)
 inline ostream& operator<<(ostream& out, const PG& pg)
 {
   out << "pg[" << pg.info 
-      << " r=" << pg.get_role();
+      << " " << pg.acting;
+  out << " r=" << pg.get_role();
+  if (pg.acting != pg.up)
+    out << " up=" << pg.up;
   
   if (pg.recovery_ops_active)
     out << " rops=" << pg.recovery_ops_active;