mds: simplify recall warnings

author Patrick Donnelly <pdonnell@redhat.com>

Mon, 28 Jan 2019 23:48:38 +0000 (15:48 -0800)

committer Patrick Donnelly <pdonnell@redhat.com>

Thu, 31 Jan 2019 20:07:54 +0000 (12:07 -0800)
author Patrick Donnelly <pdonnell@redhat.com>
Mon, 28 Jan 2019 23:48:38 +0000 (15:48 -0800)
committer Patrick Donnelly <pdonnell@redhat.com>
Thu, 31 Jan 2019 20:07:54 +0000 (12:07 -0800)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index e04ce58793165c979a9621147335cbd430f168b2..50bc4831ba8254e3c006ddaab076a6cbe5f4d160 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -179,6 +179,12 @@
    MDS when all of these caps need to be processed during certain session
    events. It is recommended to not unnecessarily increase this value.
  
+* The MDS config mds_recall_state_timeout has been removed. Late client recall
+  warnings are now generated based on the number of caps the MDS has recalled
+  which have not been released. The new configs mds_recall_warning_threshold
+  (default: 32K) and mds_recall_warning_decay_rate (default: 60s) sets the
+  threshold for this warning.
+
  >=13.1.0
  --------
  
diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py

index 833a37172569fe11e964f5b7fa57eb4cf3cbcae4..bc029cd8a5ab8300a99c599bfda7eda7b96aeb45 100644 (file)
--- a/qa/tasks/cephfs/test_client_limits.py
+++ b/qa/tasks/cephfs/test_client_limits.py
@@ -42,10 +42,13 @@ class TestClientLimits(CephFSTestCase):
          cache_size = open_files/2
  
          self.set_conf('mds', 'mds cache size', cache_size)
+        self.set_conf('mds', 'mds_recall_max_caps', open_files/2)
+        self.set_conf('mds', 'mds_recall_warning_threshold', open_files)
          self.fs.mds_fail_restart()
          self.fs.wait_for_daemons()
  
          mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
+        mds_recall_warning_decay_rate = self.fs.get_config("mds_recall_warning_decay_rate")
          self.assertTrue(open_files >= mds_min_caps_per_client)
  
          mount_a_client_id = self.mount_a.get_global_id()
@@ -63,13 +66,11 @@ class TestClientLimits(CephFSTestCase):
  
          # MDS should not be happy about that, as the client is failing to comply
          # with the SESSION_RECALL messages it is being sent
-        mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout"))
-        self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10)
+        self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2)
  
          # We can also test that the MDS health warning for oversized
          # cache is functioning as intended.
-        self.wait_for_health("MDS_CACHE_OVERSIZED",
-                mds_recall_state_timeout + 10)
+        self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2)
  
          # When the client closes the files, it should retain only as many caps as allowed
          # under the SESSION_RECALL policy
diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h

index 21d7204980473e25b36148d30e06210c748eb358..a71d2dac51a3651e9eddfc99ddb30487e723c514 100644 (file)
--- a/src/common/legacy_config_opts.h
+++ b/src/common/legacy_config_opts.h
@@ -410,7 +410,6 @@ OPTION(mds_session_blacklist_on_timeout, OPT_BOOL)    // whether to blacklist cl
  OPTION(mds_session_blacklist_on_evict, OPT_BOOL)  // whether to blacklist clients whose sessions are dropped via admin commands
  
  OPTION(mds_sessionmap_keys_per_op, OPT_U32)    // how many sessions should I try to load/store in a single OMAP operation?
-OPTION(mds_recall_state_timeout, OPT_FLOAT)    // detect clients which aren't trimming caps
  OPTION(mds_freeze_tree_timeout, OPT_FLOAT)    // detecting freeze tree deadlock
  OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
  OPTION(mds_reconnect_timeout, OPT_FLOAT)  // seconds to wait for clients during mds restart
diff --git a/src/common/options.cc b/src/common/options.cc

index 547b8b8757a333443f29004c50a78c46d1852bd6..245cac9af8f122ac2a434375466cc8b597b9bcb1 100644 (file)
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -7247,9 +7247,13 @@ std::vector<Option> get_mds_options() {
      .set_default(64_K)
      .set_description("decay threshold for throttle on recalled caps globally"),
  
-    Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
-    .set_default(60)
-    .set_description("timeout for clients late on cap recall to create health warnings"),
+    Option("mds_recall_warning_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(32_K)
+    .set_description("decay threshold for warning on slow session cap recall"),
+
+    Option("mds_recall_warning_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(60.0)
+    .set_description("decay rate for warning on slow session cap recall"),
  
      Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
      .set_default(30)
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc

index d2bb5cac6f214668d50a8fc788b447214fc032db..07330520c50a537a4bc1b52c3d36827f8ff1daf4 100644 (file)
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -373,45 +373,28 @@ void Beacon::notify_health(MDSRank const *mds)
    {
      set<Session*> sessions;
      mds->sessionmap.get_client_session_set(sessions);
-    auto now = clock::now();
-
-    const auto mds_recall_state_timeout = g_conf()->mds_recall_state_timeout;
-    const auto last_recall = mds->server->last_recalled();
-    const auto last_recall_span = std::chrono::duration<double>(now-last_recall).count();
-    const bool recall_state_timedout = last_recall_span > mds_recall_state_timeout;
  
+    const auto recall_warning_threshold = g_conf().get_val<Option::size_t>("mds_recall_warning_threshold");
+    const auto max_completed_requests = g_conf()->mds_max_completed_requests;
+    const auto max_completed_flushes = g_conf()->mds_max_completed_flushes;
      std::list<MDSHealthMetric> late_recall_metrics;
      std::list<MDSHealthMetric> large_completed_requests_metrics;
      for (auto& session : sessions) {
-      const auto& recall_release_count = session->recall_release_count;
-      const auto& recall_count = session->recall_count;
-      if (recall_release_count < recall_count) {
-        const auto& recalled_at = session->recalled_at;
-        const auto& released_at = session->released_at;
-        const auto recalled_at_span = std::chrono::duration<double>(now-recalled_at).count();
-        const auto released_at_span = std::chrono::duration<double>(now-released_at).count();
-
-        dout(20) << "Session " << session->info.inst
-          << " last released " << recall_release_count << "/" << recall_count << " caps "
-          << released_at_span << "s ago " << dendl;
-       if (recall_state_timedout) {
-         dout(20) << "  no longer recall" << dendl;
-          session->clear_recalled();
-       } else if (released_at_span > mds_recall_state_timeout && recalled_at_span > mds_recall_state_timeout) {
-          dout(20) << "  exceeded timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
-          std::ostringstream oss;
-         oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
-          MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
-          m.metadata["client_id"] = stringify(session->get_client());
-          late_recall_metrics.push_back(m);
-        } else {
-          dout(20) << "  within timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
-        }
+      const uint64_t recall_caps = session->get_recall_caps();
+      if (recall_caps > recall_warning_threshold) {
+        dout(2) << "Session " << *session <<
+             " is not releasing caps fast enough. Recalled caps at " << recall_caps
+          << " > " << recall_warning_threshold << " (mds_recall_warning_threshold)." << dendl;
+        std::ostringstream oss;
+        oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
+        MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
+        m.metadata["client_id"] = stringify(session->get_client());
+        late_recall_metrics.push_back(m);
        }
        if ((session->get_num_trim_requests_warnings() > 0 &&
-          session->get_num_completed_requests() >= g_conf()->mds_max_completed_requests) ||
+          session->get_num_completed_requests() >= max_completed_requests) ||
           (session->get_num_trim_flushes_warnings() > 0 &&
-          session->get_num_completed_flushes() >= g_conf()->mds_max_completed_flushes)) {
+          session->get_num_completed_flushes() >= max_completed_flushes)) {
         std::ostringstream oss;
         oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid";
         MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
diff --git a/src/mds/Server.cc b/src/mds/Server.cc

index ead97a5f4bd790cb78ea4cbcb2e4e60fb9c8b5ee..455a6d0ef4872ca44b222e0eaab346e5ec4452ed 100644 (file)
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -191,7 +191,7 @@ Server::Server(MDSRank *m) :
    failed_reconnects(0),
    reconnect_evicting(false),
    terminating_sessions(false),
-  recall_counter(g_conf().get_val<double>("mds_recall_max_decay_rate"))
+  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate"))
  {
    supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
  }
@@ -1076,7 +1076,7 @@ void Server::handle_conf_change(const ConfigProxy& conf,
              << cap_revoke_eviction_timeout << dendl;
    }
    if (changed.count("mds_recall_max_decay_rate")) {
-    recall_counter = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
+    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
    }
  }
  
@@ -1533,7 +1533,7 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
    const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
    const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
  
-  dout(10) << __func__ << ":"
+  dout(7) << __func__ << ":"
             << " min=" << min_caps_per_client
             << " max=" << max_caps_per_client
             << " total=" << Capability::count()
@@ -1559,7 +1559,8 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
         !session->info.inst.name.is_client())
        continue;
  
-    dout(10) << " session " << session->info.inst
+    dout(10) << __func__ << ":"
+             << " session " << session->info.inst
              << " caps " << num_caps
              << ", leases " << session->leases.size()
              << dendl;
@@ -1574,36 +1575,49 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
        /* now limit the number of caps we recall at a time to prevent overloading ourselves */
        uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
        newlim = num_caps-recall;
-      const uint64_t session_recall_counter = session->cap_recalled_counter();
-      const uint64_t global_recall_counter = recall_counter.get();
-      if (session_recall_counter+recall > recall_max_decay_threshold) {
-        dout(15) << "  session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_counter << "; skipping!" << dendl;
+      const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
+      const uint64_t global_recall_throttle = recall_throttle.get();
+      if (session_recall_throttle+recall > recall_max_decay_threshold) {
+        dout(15) << "  session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
          throttled = true;
          continue;
-      } else if (global_recall_counter+recall > recall_global_max_decay_threshold) {
-        dout(15) << "  global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_counter << "; skipping!" << dendl;
+      } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
+        dout(15) << "  global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
          throttled = true;
          break;
        }
  
        // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
-      const auto& recalled_at = session->recalled_at;
-      auto last_recalled = std::chrono::duration<double>(now-recalled_at).count();
-      const auto& released_at = session->released_at;
-      const auto& last_recall_count = session->recall_count;
-      const auto& last_recall_release_count = session->recall_release_count;
-      const auto& last_recall_limit = session->recall_limit;
-      bool limit_similarity = (abs((double)num_caps-last_recall_limit+recall_max_caps)/(num_caps+recall_max_caps)) < 0.05;
-      if (last_recalled < 3600.0 && released_at < recalled_at && last_recall_count > 2*last_recall_release_count && limit_similarity && steady) {
-        /* The session has recently (1hr) been asked to release caps and we
-         * were unable to get at least half of the recalled caps.
-         */
-        dout(15) << "  last recalled " << last_recall_count << "/" << (last_recall_count+last_recall_limit)
-                 << " caps " << last_recalled << "s ago; released "
-                 << last_recall_release_count << " caps. Skipping because we are unlikely to get more released." << dendl;
-        continue;
+      if (steady) {
+        const auto session_recall = session->get_recall_caps();
+        const auto session_release = session->get_release_caps();
+        if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
+          /* The session has been unable to keep up with the number of caps
+           * recalled (by half); additionally, to prevent marking sessions
+           * we've just begun to recall from, the session_recall counter
+           * (decayed count of caps recently recalled) is **greater** than the
+           * session threshold for the session's cap recall throttle.
+           */
+          dout(15) << "  2*session_release < session_recall"
+                      " (2*" << session_release << " < " << session_recall << ");"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        } else if (recall < recall_max_caps && 2*recall < session_recall) {
+          /* The number of caps recalled is less than the number we *could*
+           * recall (so there isn't much left to recall?) and the number of
+           * caps is less than the current recall_caps counter (decayed count
+           * of caps recently recalled).
+           */
+          dout(15) << "  2*recall < session_recall "
+                      " (2*" << recall << " < " << session_recall << ") &&"
+                      " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        }
        }
  
+      dout(7) << "  recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
+
        auto m = MClientSession::create(CEPH_SESSION_RECALL_STATE);
        m->head.max_caps = newlim;
        mds->send_message_client(m, session);
@@ -1611,7 +1625,7 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
          flush_session(session, gather);
        }
        caps_recalled += session->notify_recall_sent(newlim);
-      recall_counter.hit(recall);
+      recall_throttle.hit(recall);
      }
    }
  
diff --git a/src/mds/Server.h b/src/mds/Server.h

index fc504e3249566adfba649bc0625516fe34cf7589..6bb84b061513968b33ec6dd66cd6e62e21f23a0f 100644 (file)
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -350,7 +350,7 @@ private:
    void reply_client_request(MDRequestRef& mdr, const MClientReply::ref &reply);
    void flush_session(Session *session, MDSGatherBuilder *gather);
  
-  DecayCounter recall_counter;
+  DecayCounter recall_throttle;
    time last_recall_state;
  };
  
diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc

index bf30a0fed91829e2c99451b4399dc81f0176c67d..340ad0fa541437ee524c3a974928ed32711c9147 100644 (file)
--- a/src/mds/SessionMap.cc
+++ b/src/mds/SessionMap.cc
@@ -855,13 +855,8 @@ size_t Session::get_request_count()
   */
  void Session::notify_cap_release(size_t n_caps)
  {
-  recall_release_count += n_caps;
-  if (n_caps > 0) {
-    released_at = clock::now();
-    if (recall_count <= recall_release_count) {
-      clear_recalled();
-    }
-  }
+  recall_caps.hit(-(double)n_caps);
+  release_caps.hit(n_caps);
  }
  
  /**
@@ -870,25 +865,13 @@ void Session::notify_cap_release(size_t n_caps)
   * in order to generate health metrics if the session doesn't see
   * a commensurate number of calls to ::notify_cap_release
   */
-uint64_t Session::notify_recall_sent(const size_t new_limit)
+uint64_t Session::notify_recall_sent(size_t new_limit)
  {
    const auto num_caps = caps.size();
-  ceph_assert(new_limit < num_caps);
+  ceph_assert(new_limit < num_caps);  // Behaviour of Server::recall_client_state
    const auto count = num_caps-new_limit;
-
-  /* Entering recall phase, set up counters so we can later judge whether the
-   * client has respected the recall request. Update only if client has not
-   * released caps from a previous recall.
-   */
-
    uint64_t new_change;
    if (recall_limit != new_limit) {
-    const auto now = clock::now();
-    recalled_at = now;
-    assert (new_limit < num_caps);  // Behaviour of Server::recall_client_state
-    recall_count = count;
-    recall_release_count = 0;
-    recall_limit = new_limit;
      new_change = count;
    } else {
      new_change = 0; /* no change! */
@@ -899,17 +882,11 @@ uint64_t Session::notify_recall_sent(const size_t new_limit)
     * session that is not releasing caps (i.e. allow the session counter to
     * throttle future RECALL messages).
     */
-  cap_recalled.hit(count);
+  recall_caps_throttle.hit(count);
+  recall_caps.hit(count);
    return new_change;
  }
  
-void Session::clear_recalled()
-{
-  recall_count = 0;
-  recall_release_count = 0;
-  recall_limit = 0;
-}
-
  /**
   * Use client metadata to generate a somewhat-friendlier
   * name for the client than its session ID.
@@ -1031,7 +1008,15 @@ void SessionMap::handle_conf_change(const ConfigProxy &conf,
    if (changed.count("mds_recall_max_decay_rate")) {
      auto d = g_conf().get_val<double>("mds_recall_max_decay_rate");
      auto mut = [d](auto s) {
-      s->cap_recalled = DecayCounter(d);
+      s->recall_caps_throttle = DecayCounter(d);
+    };
+    apply_to_open_sessions(mut);
+  }
+  if (changed.count("mds_recall_warning_decay_rate")) {
+    auto d = g_conf().get_val<double>("mds_recall_warning_decay_rate");
+    auto mut = [d](auto s) {
+      s->recall_caps = DecayCounter(d);
+      s->release_caps = DecayCounter(d);
      };
      apply_to_open_sessions(mut);
    }
diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h

index e3530f9566118de8cedb611f759a3a434512f0cf..b7f747684136c9678c97574160dd7b57f6b02ce8 100644 (file)
--- a/src/mds/SessionMap.h
+++ b/src/mds/SessionMap.h
@@ -113,8 +113,15 @@ private:
    // request load average for this session
    DecayCounter load_avg;
  
-  // caps being recalled recently by this session
-  DecayCounter cap_recalled;
+  // Ephemeral state for tracking progress of capability recalls
+  // caps being recalled recently by this session; used for Beacon warnings
+  DecayCounter recall_caps;
+  // caps that have been released
+  DecayCounter release_caps;
+  // throttle on caps recalled
+  DecayCounter recall_caps_throttle;
+  // New limit in SESSION_RECALL
+  uint32_t recall_limit = 0;
  
    // session start time -- used to track average session time
    // note that this is initialized in the constructor rather
@@ -156,13 +163,6 @@ public:
  
    const std::string& get_human_name() const {return human_name;}
  
-  // Ephemeral state for tracking progress of capability recalls
-  time recalled_at = clock::zero();  // When was I first asked to SESSION_RECALL?
-  time released_at = clock::zero();  // When did the session last release caps?
-  uint32_t recall_count = 0;  // How many caps was I asked to SESSION_RECALL?
-  uint32_t recall_release_count = 0;  // How many caps have I actually revoked?
-  uint32_t recall_limit = 0;  // New limit in SESSION_RECALL
-
    session_info_t info;                         ///< durable bits
  
    MDSAuthCaps auth_caps;
@@ -181,11 +181,16 @@ public:
    interval_set<inodeno_t> pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos
  
    void notify_cap_release(size_t n_caps);
-  uint64_t notify_recall_sent(const size_t new_limit);
-  auto cap_recalled_counter() const {
-    return cap_recalled.get();
+  uint64_t notify_recall_sent(size_t new_limit);
+  auto get_recall_caps_throttle() const {
+    return recall_caps_throttle.get();
+  }
+  auto get_recall_caps() const {
+    return recall_caps.get();
+  }
+  auto get_release_caps() const {
+    return release_caps.get();
    }
-  void clear_recalled();
  
    inodeno_t next_ino() const {
      if (info.prealloc_inos.empty())
@@ -384,7 +389,9 @@ public:
  
    Session() = delete;
    Session(ConnectionRef con) :
-    cap_recalled(g_conf().get_val<double>("mds_recall_max_decay_rate")),
+    recall_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")),
+    release_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")),
+    recall_caps_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
      birth_time(clock::now()),
      auth_caps(g_ceph_context),
      item_session_list(this),
author	Patrick Donnelly <pdonnell@redhat.com>
	Mon, 28 Jan 2019 23:48:38 +0000 (15:48 -0800)
committer	Patrick Donnelly <pdonnell@redhat.com>
	Thu, 31 Jan 2019 20:07:54 +0000 (12:07 -0800)
PendingReleaseNotes		patch \| blob \| history
qa/tasks/cephfs/test_client_limits.py		patch \| blob \| history
src/common/legacy_config_opts.h		patch \| blob \| history
src/common/options.cc		patch \| blob \| history
src/mds/Beacon.cc		patch \| blob \| history
src/mds/Server.cc		patch \| blob \| history
src/mds/Server.h		patch \| blob \| history
src/mds/SessionMap.cc		patch \| blob \| history
src/mds/SessionMap.h		patch \| blob \| history