git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
PG: make choose_acting a bit smarter
author Samuel Just <samuel.just@dreamhost.com>
Tue, 17 May 2011 22:59:32 +0000 (15:59 -0700)
committer Josh Durgin <josh.durgin@dreamhost.com>
Thu, 19 May 2011 00:04:17 +0000 (17:04 -0700)
This change allows old strays that don't need backlogs
to stay acting until current members of the up set are caught up.
This allows the up set to maintain its full size during peering.

Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
Signed-off-by: Samuel Just <samuel.just@dreamhost.com>
src/osd/OSD.cc
src/osd/PG.cc
src/osd/PG.h

index 1b547f17f9067ca1ed57106c027b00686c9973e3..4e45f2df16b6d0f3dc1ed097bd2c160e653c5dfd 100644 (file)
@@ -3935,8 +3935,14 @@ void OSD::do_infos(map<int,MOSDPGInfo*>& info_map)
 {
   for (map<int,MOSDPGInfo*>::iterator p = info_map.begin();
        p != info_map.end();
-       ++p) 
+       ++p) { 
+    for (vector<PG::Info>::iterator i = p->second->pg_info.begin();
+        i != p->second->pg_info.end();
+        ++i) {
+      dout(20) << "Sending info " << *i << " to osd" << p->first << dendl;
+    }
     cluster_messenger->send_message(p->second, osdmap->get_cluster_inst(p->first));
+  }
   info_map.clear();
 }
 
index 8e5b5f4c8f4d6b9d3ebbe87008e0e9f8b92a59a3..39461941cfa87cd53a2100e0b533410dafd43866 100644 (file)
@@ -1249,37 +1249,59 @@ void PG::clear_primary_state()
   osd->snap_trim_wq.dequeue(this);
 }
 
-bool PG::choose_acting(int newest_update_osd)
+bool PG::choose_acting(int newest_update_osd) const
 {
-  vector<int> want = up;
-  
-  Info& newest = (newest_update_osd == osd->whoami) ? info : peer_info[newest_update_osd];
-  Info& oprimi = (want[0] == osd->whoami) ? info : peer_info[want[0]];
-  if (newest_update_osd != want[0] &&
-      oprimi.last_update < newest.log_tail && !newest.log_backlog) {
-    // up[0] needs a backlog to catch up
-    // make newest_update_osd primary instead?
-    for (unsigned i=1; i<want.size(); i++)
-      if (want[i] == newest_update_osd) {
-       dout(10) << "choose_acting  up[0] osd" << want[0] << " needs backlog to catch up, making "
-                << want[i] << " primary" << dendl;
-       want[0] = want[i];
-       want[i] = up[0];
-       break;
-      }
+  vector<int> want;
+
+  const Info &best_info = newest_update_osd == osd->whoami ? 
+    info : peer_info.find(newest_update_osd)->second;
+  dout(10) << "best_info is " << best_info << dendl;
+  for (vector<int>::const_iterator i = up.begin();
+       i != up.end();
+       ++i) {
+    const Info &pi = *i == osd->whoami ? info : peer_info.find(*i)->second;
+    dout(10) << "Considering osd" << *i << dendl;
+    if (best_info.log_tail <= pi.last_update || log.backlog) {
+      // Can be brought up to date without stopping to generate a backlog
+      want.push_back(*i);
+      dout(10) << "osd" << *i << "Accepted" << dendl;
+    } else {
+      dout(10) << "osd" << *i << "REJECTED" << dendl;
+    }
+  }
+
+  dout(10) << "considering osd" << osd->whoami << " (me) " << dendl;
+  if (want.size() == osd->osdmap->get_pg_size(info.pgid) &&
+      (best_info.log_tail <= info.last_update || log.backlog)) {
+    vector<int>::const_iterator up_it = find(up.begin(), up.end(), osd->whoami);
+    dout(10) << "osd" << osd->whoami << " (me) accepted" << dendl;
+    if (up_it == up.end()) {
+      want.push_back(osd->whoami);
+    }
+  } else {
+    dout(10) << "osd" << osd->whoami << " (me) rejected" << dendl;
   }
-  // exclude peers who need backlogs to catch up?
-  Info& primi = (want[0] == osd->whoami) ? info : peer_info[want[0]];
-  for (vector<int>::iterator p = want.begin() + 1; p != want.end(); ) {
-    Info& pi = (*p == osd->whoami) ? info : peer_info[*p];
-    if (pi.last_update < primi.log_tail && !primi.log_backlog) {
-      dout(10) << "choose_acting  osd" << *p << " needs primary backlog to catch up" << dendl;
-      p = want.erase(p);
+
+  for (map<int, Info>::const_iterator i = peer_info.begin();
+       i != peer_info.end();
+       ++i) {
+    if (want.size() == osd->osdmap->get_pg_size(info.pgid)) {
+      break;
+    }
+    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first);
+    if (up_it != up.end()) {
+      continue;
+    }
+    dout(10) << "Considering osd" << *i << dendl;
+    if (best_info.log_tail <= i->second.last_update || log.backlog) {
+      // Can be brought up to date without stopping to generate a backlog
+      want.push_back(i->first);
+      dout(10) << "osd" << *i << "Accepted" << dendl;
     } else {
-      dout(10) << "choose_acting  osd" << *p << " can catch up with osd" << want[0] << " log" << dendl;
-      p++;
+      dout(10) << "osd" << *i << "REJECTED" << dendl;
     }
   }
+
   if (want != acting) {
     dout(10) << "choose_acting  want " << want << " != acting " << acting
             << ", requesting pg_temp change" << dendl;
@@ -1294,31 +1316,36 @@ bool PG::choose_acting(int newest_update_osd)
   return true;
 }
 
-void PG::choose_log_location(const PgPriorSet &prior_set,
+bool PG::choose_log_location(const PgPriorSet &prior_set,
                             bool &need_backlog,
                             bool &wait_on_backlog,
                             int &pull_from,
                             eversion_t &newest_update,
                             eversion_t &oldest_update) const
 {
-  // Find the osd with the most recent update
   pull_from = -1;
+  const Info *best_info = &info;
   need_backlog = false;
   wait_on_backlog = false;
-  const Info *best_info = &info;
-  for (map<int, Info>::const_iterator it = peer_info.begin();
-       it != peer_info.end();
-       ++it) {
-    // Only consider osds in the prior set
-    if (prior_set.cur.find(it->first) == prior_set.cur.end()) {
+  oldest_update = info.last_update;
+  newest_update = info.last_update;
+
+  for (map<int, Info>::const_iterator i = peer_info.begin();
+       i != peer_info.end();
+       ++i) {
+    if (prior_set.cur.find(i->first) == prior_set.cur.end()) {
       continue;
     }
-    if (best_info->last_update < it->second.last_update) {
-      best_info = &(it->second);
-      pull_from = it->first;
+    if (i->second.last_update > best_info->last_update) {
+      best_info = &(i->second);
+      pull_from = i->first;
+      newest_update = i->second.last_update;
+    }
+    if (oldest_update > i->second.last_update) {
+      oldest_update = i->second.last_update;
     }
   }
-  newest_update = best_info->last_update;
+
   if (pull_from >= 0)
     dout(10) << "choose_log_location newest_update " << newest_update
             << " on osd" << pull_from << dendl;
@@ -1326,6 +1353,10 @@ void PG::choose_log_location(const PgPriorSet &prior_set,
     dout(10) << "choose_log_location newest_update " << newest_update
             << " (local)" << dendl;
 
+  if (!choose_acting(pull_from == -1 ? osd->whoami : pull_from)) {
+    return false;
+  }
+
   for (vector<int>::const_iterator it = ++acting.begin();
        it != acting.end();
        ++it) {
@@ -1336,15 +1367,11 @@ void PG::choose_log_location(const PgPriorSet &prior_set,
     }
   }
 
-  oldest_update = info.last_update;
   for (vector<int>::const_iterator it = up.begin();
        it != up.end();
        ++it) {
     if (*it == osd->whoami) continue;
     const Info &pi = peer_info.find(*it)->second;
-    if (oldest_update > pi.last_update) {
-      oldest_update = pi.last_update;
-    }
 
     vector<int>::const_iterator acting_it = find(acting.begin(), acting.end(), *it);
     if (acting_it != acting.end())
@@ -1357,6 +1384,7 @@ void PG::choose_log_location(const PgPriorSet &prior_set,
       need_backlog = true;
     }
   }
+
   // check our own info -- we aren't in peer_info
   if (best_info->log_tail > info.last_update) {
     wait_on_backlog = true;
@@ -1368,6 +1396,7 @@ void PG::choose_log_location(const PgPriorSet &prior_set,
     dout(10) << "must generate backlog because my last_complete " << info.last_complete
             << " < log.tail " << info.log_tail << " and no backlog" << dendl;
     need_backlog = true;
+    wait_on_backlog = true;
   }
   for (vector<int>::const_iterator it = ++acting.begin();
        it != acting.end();
@@ -1388,6 +1417,7 @@ void PG::choose_log_location(const PgPriorSet &prior_set,
           << (need_backlog ? " need_backlog" : "")
           << (wait_on_backlog ? " wait_on_backlog" : "")
           << dendl;
+  return true;
 }
 
 /* Build the might_have_unfound set.
@@ -4348,14 +4378,12 @@ PG::RecoveryState::GetLog::GetLog(my_context ctx) :
 
   eversion_t newest_update;
   eversion_t oldest_update;
-  pg->choose_log_location(*context< Peering >().prior_set.get(),
-                         need_backlog,
-                         wait_on_backlog,
-                         newest_update_osd,
-                         newest_update,
-                         oldest_update);
-
-  if (!pg->choose_acting(newest_update_osd == -1 ? pg->osd->whoami : newest_update_osd)) {
+  if (!pg->choose_log_location(*context< Peering >().prior_set.get(),
+                              need_backlog,
+                              wait_on_backlog,
+                              newest_update_osd,
+                              newest_update,
+                              oldest_update)) {
     post_event(NeedNewMap());
   } else {
     if (need_backlog && !pg->log.backlog) {
index 9455a857008247a355f966fc50a2e015ab3d74dd..ccd79952ceca5b3d4cecd15cc7899ce26fba9683 100644 (file)
@@ -1371,14 +1371,14 @@ public:
   
   void trim_write_ahead();
 
-  bool choose_acting(int newest_update_osd);
+  bool choose_acting(int newest_update_osd) const;
   bool recover_master_log(map< int, map<pg_t,Query> >& query_map,
                          eversion_t &oldest_update);
   eversion_t calc_oldest_known_update() const;
   void do_peer(ObjectStore::Transaction& t, list<Context*>& tfin,
               map< int, map<pg_t,Query> >& query_map,
               map<int, MOSDPGInfo*> *activator_map=0);
-  void choose_log_location(const PgPriorSet &prior_set,
+  bool choose_log_location(const PgPriorSet &prior_set,
                           bool &need_backlog,
                           bool &wait_on_backlog,
                           int &pull_from,