mon: cache osd epochs

author Sage Weil <sage@inktank.com>

Fri, 26 Apr 2013 19:22:28 +0000 (12:22 -0700)

committer Sage Weil <sage@inktank.com>

Fri, 26 Apr 2013 23:04:16 +0000 (16:04 -0700)
author Sage Weil <sage@inktank.com>
Fri, 26 Apr 2013 19:22:28 +0000 (12:22 -0700)
committer Sage Weil <sage@inktank.com>
Fri, 26 Apr 2013 23:04:16 +0000 (16:04 -0700)
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc

index 60e0f2c1b39b08babcc171041594a3d6b176aba0..e8a277a7b01dba338c97eb8b5ed5b0d5ac324759 100644 (file)
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -156,13 +156,24 @@ void OSDMonitor::update_from_paxos()
    if (!t.empty())
      mon->store->apply_transaction(t);
  
-  // populate down -> out map
-  for (int o = 0; o < osdmap.get_max_osd(); o++)
-    if (osdmap.is_down(o) && osdmap.is_in(o) &&
-       down_pending_out.count(o) == 0) {
-      dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
-      down_pending_out[o] = ceph_clock_now(g_ceph_context);
+  for (int o = 0; o < osdmap.get_max_osd(); o++) {
+    if (osdmap.is_down(o)) {
+      // invalidate osd_epoch cache
+      osd_epoch.erase(o);
+
+      // populate down -> out map
+      if (osdmap.is_in(o) &&
+         down_pending_out.count(o) == 0) {
+       dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
+       down_pending_out[o] = ceph_clock_now(g_ceph_context);
+      }
      }
+  }
+  // blow away any osd_epoch items with id > max_osd (upper_bound is exclusive, so id == max_osd is kept)
+  map<int,epoch_t>::iterator p = osd_epoch.upper_bound(osdmap.get_max_osd());
+  while (p != osd_epoch.end()) {
+    osd_epoch.erase(p++);
+  }
  
    if (mon->is_leader()) {
      // kick pgmon, make sure it's seen the latest map
@@ -1495,7 +1506,21 @@ void OSDMonitor::send_full(PaxosServiceMessage *m)
  void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first)
  {
    dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
-         << " to " << req->get_orig_source_inst() << dendl;
+         << " to " << req->get_orig_source_inst()
+         << dendl;
+
+  int osd = -1;
+  if (req->get_source().is_osd()) {
+    osd = req->get_source().num();
+    map<int,epoch_t>::iterator p = osd_epoch.find(osd);
+    if (p != osd_epoch.end()) {
+      dout(10) << " osd." << osd << " should have epoch " << p->second << dendl;
+      first = p->second + 1;
+      if (first > osdmap.get_epoch())
+       return;
+    }
+  }
+
    if (first < get_first_committed()) {
      first = get_first_committed();
      bufferlist bl;
@@ -1511,6 +1536,9 @@ void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first)
      m->newest_map = osdmap.get_epoch();
      m->maps[first] = bl;
      mon->send_reply(req, m);
+
+    if (osd >= 0)
+      osd_epoch[osd] = osdmap.get_epoch();
      return;
    }
  
@@ -1521,6 +1549,9 @@ void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first)
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();
    mon->send_reply(req, m);
+
+  if (osd >= 0)
+    osd_epoch[osd] = last;
  }
  
  void OSDMonitor::send_incremental(epoch_t first, entity_inst_t& dest, bool onetime)
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h

index 036aed5ffd3729c9fe3cd5a9363418704346b781..0034bb0baca4bbb4ea8e5585f7e54d7b8b4a1501 100644 (file)
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -123,6 +123,12 @@ private:
  
    map<int,double> osd_weight;
  
+  /*
+   * cache what epochs we think osds have.  this is purely an
+   * optimization to try to avoid sending the same inc maps twice.
+   */
+  map<int,epoch_t> osd_epoch;
+
    void check_failures(utime_t now);
    bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
author	Sage Weil <sage@inktank.com>
	Fri, 26 Apr 2013 19:22:28 +0000 (12:22 -0700)
committer	Sage Weil <sage@inktank.com>
	Fri, 26 Apr 2013 23:04:16 +0000 (16:04 -0700)
src/mon/OSDMonitor.cc		patch \| blob \| history
src/mon/OSDMonitor.h		patch \| blob \| history