mon: handle monitor lag when killing mgrs

author John Spray <john.spray@redhat.com>

Thu, 12 Oct 2017 10:57:50 +0000 (11:57 +0100)

committer John Spray <john.spray@redhat.com>

Thu, 12 Oct 2017 10:59:54 +0000 (11:59 +0100)
author John Spray <john.spray@redhat.com>
Thu, 12 Oct 2017 10:57:50 +0000 (11:57 +0100)
committer John Spray <john.spray@redhat.com>
Thu, 12 Oct 2017 10:59:54 +0000 (11:59 +0100)
diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc

index befab26d9dcb6b5b732cd9e282abae0ab78b6704..9a340d950eff710f8046636b88061413107b62c3 100644 (file)
--- a/src/mon/MgrMonitor.cc
+++ b/src/mon/MgrMonitor.cc
@@ -490,7 +490,28 @@ void MgrMonitor::tick()
      return;
  
    const auto now = ceph::coarse_mono_clock::now();
-  const auto cutoff = now - std::chrono::seconds(g_conf->get_val<int64_t>("mon_mgr_beacon_grace"));
+
+  const auto mgr_beacon_grace = std::chrono::seconds(
+      g_conf->get_val<int64_t>("mon_mgr_beacon_grace"));
+
+  // Note that this is the mgr daemon's tick period, not ours (the
+  // beacon is sent with this period).
+  const auto mgr_tick_period = std::chrono::seconds(
+      g_conf->get_val<int64_t>("mgr_tick_period"));
+
+  if (last_tick != ceph::coarse_mono_clock::time_point::min()
+      && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) {
+    // This case handles either local slowness (calls being delayed
+    // for whatever reason) or cluster election slowness (a long gap
+    // between calls while an election was in progress).
+    dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
+            "(slow election?) of " << now - last_tick << " seconds" << dendl;
+    for (auto &i : last_beacon) {
+      i.second = now;
+    }
+  }
+
+  last_tick = now;
  
    // Populate any missing beacons (i.e. no beacon since MgrMonitor
    // instantiation) with the current time, so that they will
@@ -508,6 +529,7 @@ void MgrMonitor::tick()
    // Cull standbys first so that any remaining standbys
    // will be eligible to take over from the active if we cull him.
    std::list<uint64_t> dead_standbys;
+  const auto cutoff = now - mgr_beacon_grace;
    for (const auto &i : pending_map.standbys) {
      auto last_beacon_time = last_beacon.at(i.first);
      if (last_beacon_time < cutoff) {
@@ -566,6 +588,7 @@ void MgrMonitor::on_restart()
  {
    // Clear out the leader-specific state.
    last_beacon.clear();
+  last_tick = ceph::coarse_mono_clock::now();
  }
  
  
diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h

index 2f24c4ad1ce299b6a4923f358105a1b138959c16..5507615af43526008b48aeaf963a4c40eadfe0a1 100644 (file)
--- a/src/mon/MgrMonitor.h
+++ b/src/mon/MgrMonitor.h
@@ -115,6 +115,11 @@ public:
    void count_metadata(const string& field, std::map<string,int> *out);
  
    friend class C_Updated;
+
+  // When did the mon last call into our tick() method?  Used to detect
+  // periods when the mon was not updating us (e.g. during a slow
+  // election), so that the last_beacon timeouts can be reset.
+  ceph::coarse_mono_clock::time_point last_tick;
  };
  
  #endif
author	John Spray <john.spray@redhat.com>
	Thu, 12 Oct 2017 10:57:50 +0000 (11:57 +0100)
committer	John Spray <john.spray@redhat.com>
	Thu, 12 Oct 2017 10:59:54 +0000 (11:59 +0100)
src/mon/MgrMonitor.cc		patch \| blob \| history
src/mon/MgrMonitor.h		patch \| blob \| history