git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: recall caps incrementally
author: Patrick Donnelly <pdonnell@redhat.com>
Wed, 23 Jan 2019 14:41:55 +0000 (06:41 -0800)
committer: Patrick Donnelly <pdonnell@redhat.com>
Mon, 4 Mar 2019 17:19:18 +0000 (09:19 -0800)
As with trimming, use DecayCounters to throttle the number of caps we recall,
both globally and per-session.

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
(cherry picked from commit ef46216d8d0b659549925481b4eff6bd7d2c43c9)

Conflicts:
PendingReleaseNotes
src/common/options.cc
src/mds/Beacon.cc
src/mds/Server.cc
src/mds/Server.h
src/mds/SessionMap.cc
src/mds/SessionMap.h

13 files changed:
PendingReleaseNotes
qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml
qa/tasks/cephfs/test_client_limits.py
src/common/options.cc
src/mds/Beacon.cc
src/mds/MDCache.cc
src/mds/MDCache.h
src/mds/MDSDaemon.cc
src/mds/MDSRank.cc
src/mds/Server.cc
src/mds/Server.h
src/mds/SessionMap.cc
src/mds/SessionMap.h

index 88add63e24449280802c664960cb68f91fc4df3a..1218bef68e7fe37fa3fa47fb4ebedfc71db40c98 100644 (file)
   via the `ceph tell mds.<foo> cache drop` command or large reductions in the
   cache size will no longer cause service unavailability.
 
+* The CephFS MDS behavior with recalling caps has been significantly improved
+  to not attempt recalling too many caps at once, leading to instability.
+  MDS with a large cache (64GB+) should be more stable.
+
 >= 12.1.2
 ---------
 * When running 'df' on a CephFS filesystem comprising exactly one data pool,
index 410606225f031dcac580e43320b12afdc33471c6..f0ed3366c7575c7d72a9affbfcd9739f3a834408 100644 (file)
@@ -10,7 +10,6 @@ overrides:
 tasks:
 - exec:
     mon.a:
-    - "ceph tell mds.* config set mds_max_ratio_caps_per_client 1"
     - "ceph tell mds.* config set mds_min_caps_per_client 1"
 - background_exec:
     mon.a:
index 1f1d5467079ccf60027e05d18e08a0fcabf6cf58..322bd8c895ec2a40ae7d4ceb489d8f27d5f19b71 100644 (file)
@@ -47,7 +47,6 @@ class TestClientLimits(CephFSTestCase):
 
         mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
         self.assertTrue(open_files >= mds_min_caps_per_client)
-        mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client"))
 
         mount_a_client_id = self.mount_a.get_global_id()
         path = "subdir/mount_a" if use_subdir else "mount_a"
@@ -84,14 +83,13 @@ class TestClientLimits(CephFSTestCase):
 
         # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
         # which depend on the caps outstanding, cache size and overall ratio
-        recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2))
         def expected_caps():
             num_caps = self.get_session(mount_a_client_id)['num_caps']
             if num_caps < mds_min_caps_per_client:
                 raise RuntimeError("client caps fell below min!")
             elif num_caps == mds_min_caps_per_client:
                 return True
-            elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05:
+            elif num_caps < cache_size:
                 return True
             else:
                 return False
index 57468a332088f73ae4847d9a2a1deca52c3f4f91..3497b06a1fa7f0d7db6badecc85c25bd86533031 100644 (file)
@@ -6150,6 +6150,22 @@ std::vector<Option> get_mds_options() {
     .set_default(1024)
     .set_description(""),
 
+    Option("mds_recall_max_caps", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5000)
+    .set_description("maximum number of caps to recall from client session in single recall"),
+
+    Option("mds_recall_max_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2.5)
+    .set_description("decay rate for throttle on recalled caps on a session"),
+
+    Option("mds_recall_max_decay_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(16_K)
+    .set_description("decay threshold for throttle on recalled caps on a session"),
+
+    Option("mds_recall_global_max_decay_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(64_K)
+    .set_description("decay threshold for throttle on recalled caps globally"),
+
     Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(60)
     .set_description(""),
@@ -6526,9 +6542,6 @@ std::vector<Option> get_mds_options() {
     .set_default(100)
     .set_description("minimum number of capabilities a client may hold"),
 
-    Option("mds_max_ratio_caps_per_client", Option::TYPE_FLOAT, Option::LEVEL_DEV)
-    .set_default(.8)
-    .set_description("maximum ratio of current caps that may be recalled during MDS cache pressure"),
     Option("mds_hack_allow_loading_invalid_metadata", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
      .set_default(0)
      .set_description("INTENTIONALLY CAUSE DATA LOSS by bypasing checks for invalid metadata on disk. Allows testing repair tools."),
index 891487f2abb8601cbc1978e9d2e8da08f695aab1..60dbdda20d5c7c6dcbe5763aafa9de0b2cd19e2b 100644 (file)
@@ -387,35 +387,39 @@ void Beacon::notify_health(MDSRank const *mds)
   {
     set<Session*> sessions;
     mds->sessionmap.get_client_session_set(sessions);
+    auto now = clock::now();
 
-    auto mds_recall_state_timeout = g_conf->mds_recall_state_timeout;
-    auto last_recall = mds->mdcache->last_recall_state;
-    auto last_recall_span = std::chrono::duration<double>(clock::now()-last_recall).count();
-    bool recall_state_timedout = last_recall_span > mds_recall_state_timeout;
+    const auto mds_recall_state_timeout = g_conf->mds_recall_state_timeout;
+    const auto last_recall = mds->server->last_recalled();
+    const auto last_recall_span = std::chrono::duration<double>(now-last_recall).count();
+    const bool recall_state_timedout = last_recall_span > mds_recall_state_timeout;
 
     std::list<MDSHealthMetric> late_recall_metrics;
     std::list<MDSHealthMetric> large_completed_requests_metrics;
     for (auto& session : sessions) {
-      if (session->recalled_at != Session::time::min()) {
-        auto last_recall_sent = session->last_recall_sent;
-        auto recalled_at = session->recalled_at;
-        auto recalled_at_span = std::chrono::duration<double>(clock::now()-recalled_at).count();
-
-        dout(20) << "Session servicing RECALL " << session->info.inst
-          << ": " << recalled_at_span << "s ago " << session->recall_release_count
-          << "/" << session->recall_count << dendl;
-       if (recall_state_timedout || last_recall_sent < last_recall) {
+      const auto& recall_release_count = session->recall_release_count;
+      const auto& recall_count = session->recall_count;
+      if (recall_release_count < recall_count) {
+        const auto& recalled_at = session->recalled_at;
+        const auto& released_at = session->released_at;
+        const auto recalled_at_span = std::chrono::duration<double>(now-recalled_at).count();
+        const auto released_at_span = std::chrono::duration<double>(now-released_at).count();
+
+        dout(20) << "Session " << session->info.inst
+          << " last released " << recall_release_count << "/" << recall_count << " caps "
+          << released_at_span << "s ago " << dendl;
+       if (recall_state_timedout) {
          dout(20) << "  no longer recall" << dendl;
-         session->clear_recalled_at();
-       } else if (recalled_at_span > mds_recall_state_timeout) {
-          dout(20) << "  exceeded timeout " << recalled_at_span << " vs. " << mds_recall_state_timeout << dendl;
+          session->clear_recalled();
+       } else if (released_at_span > mds_recall_state_timeout && recalled_at_span > mds_recall_state_timeout) {
+          dout(20) << "  exceeded timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
           std::ostringstream oss;
          oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
           MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
           m.metadata["client_id"] = stringify(session->info.inst.name.num());
           late_recall_metrics.push_back(m);
         } else {
-          dout(20) << "  within timeout " << recalled_at_span << " vs. " << mds_recall_state_timeout << dendl;
+          dout(20) << "  within timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
         }
       }
       if ((session->get_num_trim_requests_warnings() > 0 &&
index f36ba40002d3f730262c7ff188b576ba3077ec02..6a44598f1769a11d863c9962e27eb9f714b8217d 100644 (file)
@@ -7545,8 +7545,7 @@ void MDCache::check_memory_usage()
   mds->mlogger->set(l_mdm_heap, last.get_heap());
 
   if (cache_toofull()) {
-    last_recall_state = clock::now();
-    mds->server->recall_client_state(-1.0, false, nullptr);
+    mds->server->recall_client_state(nullptr);
   }
 
   // If the cache size had exceeded its limit, but we're back in bounds
index b791363bf4f5bf722a25424c6c09f2af72a27a80..2d1e8463aa963b004565910503de5e7e7a739d8e 100644 (file)
@@ -760,8 +760,6 @@ public:
   void trim_client_leases();
   void check_memory_usage();
 
-  time last_recall_state;
-
   // shutdown
 private:
   set<inodeno_t> shutdown_exporting_strays;
index 8fbaa2f2ac14966f532eacd337077d22d1a37f59..e312bf7cc227f4bacc71db4cc47259cd080b95a8 100644 (file)
@@ -387,8 +387,10 @@ const char** MDSDaemon::get_tracked_conf_keys() const
     "mds_inject_migrator_message_loss",
     "host",
     "fsid",
-    "mds_request_load_average_decay_rate",
     "mds_cap_revoke_eviction_timeout",
+    // SessionMap
+    "mds_request_load_average_decay_rate",
+    "mds_recall_max_decay_rate",
     NULL
   };
   return KEYS;
index d27251d6cf7adaf0449c0d33e9cc23ff5fb1996d..bf1ce74b3240bb1aac61ae44b8df26c017dce1e2 100644 (file)
@@ -248,7 +248,8 @@ public:
                Formatter *f, Context *on_finish)
     : MDSInternalContext(mds),
       server(server), mdcache(mdcache), mdlog(mdlog),
-      recall_timeout(recall_timeout), f(f), on_finish(on_finish),
+      recall_timeout(recall_timeout), recall_start(mono_clock::now()),
+      f(f), on_finish(on_finish),
       whoami(mds->whoami), incarnation(mds->incarnation) {
   }
 
@@ -320,11 +321,16 @@ private:
     f->open_object_section("result");
 
     MDSGatherBuilder *gather = new MDSGatherBuilder(g_ceph_context);
-    server->recall_client_state(1.0, true, gather);
+    auto result = server->recall_client_state(gather, Server::RecallFlags::STEADY);
+    auto& throttled = result.first;
+    auto& count = result.second;
+    dout(10) << __func__
+             << (throttled ? " (throttled)" : "")
+             << " recalled " << count << " caps" << dendl;
+
     if (!gather->has_subs()) {
-      handle_recall_client_state(0);
       delete gather;
-      return;
+      return handle_recall_client_state(0);
     }
 
     C_ContextTimeout *ctx = new C_ContextTimeout(
@@ -418,6 +424,7 @@ private:
   MDCache *mdcache;
   MDLog *mdlog;
   uint64_t recall_timeout;
+  mono_time recall_start;
   Formatter *f;
   Context *on_finish;
 
index e0a1cc478127c38bfd1b29f0e038a189084cfc86..315aa4db97c4ca420e65f0a867a535c456e03230 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <boost/config/warning_disable.hpp>
 #include <boost/fusion/include/std_pair.hpp>
+#include <boost/range/adaptor/reversed.hpp>
 
 #include "MDSRank.h"
 #include "Server.h"
@@ -58,6 +59,7 @@
 #include "osd/OSDMap.h"
 
 #include <errno.h>
+#include <math.h>
 
 #include <list>
 #include <iostream>
@@ -199,7 +201,8 @@ Server::Server(MDSRank *m) :
   reconnect_done(NULL),
   failed_reconnects(0),
   reconnect_evicting(false),
-  terminating_sessions(false)
+  terminating_sessions(false),
+  recall_counter(ceph_clock_now(), g_conf->get_val<double>("mds_recall_max_decay_rate"))
 {
 }
 
@@ -883,6 +886,9 @@ void Server::handle_conf_change(const struct md_config_t *conf,
     dout(20) << __func__ << " cap revoke eviction timeout changed to "
             << cap_revoke_eviction_timeout << dendl;
   }
+  if (changed.count("mds_recall_max_decay_rate")) {
+    recall_counter = DecayCounter(ceph_clock_now(), g_conf->get_val<double>("mds_recall_max_decay_rate"));
+  }
 }
 
 /*
@@ -1216,62 +1222,102 @@ void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
   }
 }
 
-
 /**
  * Call this when the MDCache is oversized, to send requests to the clients
  * to trim some caps, and consequently unpin some inodes in the MDCache so
  * that it can trim too.
  */
-void Server::recall_client_state(double ratio, bool flush_client_session,
-                                 MDSGatherBuilder *gather) {
-  if (flush_client_session) {
-    assert(gather != nullptr);
-  }
-
-  /* try to recall at least 80% of all caps */
-  uint64_t max_caps_per_client = Capability::count() * g_conf->get_val<double>("mds_max_ratio_caps_per_client");
-  uint64_t min_caps_per_client = g_conf->get_val<uint64_t>("mds_min_caps_per_client");
-  if (max_caps_per_client < min_caps_per_client) {
-    dout(0) << "max_caps_per_client " << max_caps_per_client
-            << " < min_caps_per_client " << min_caps_per_client << dendl;
-    max_caps_per_client = min_caps_per_client + 1;
-  }
-
-  /* unless this ratio is smaller: */
-  /* ratio: determine the amount of caps to recall from each client. Use
-   * percentage full over the cache reservation. Cap the ratio at 80% of client
-   * caps. */
-  if (ratio < 0.0)
-    ratio = 1.0 - fmin(0.80, mdcache->cache_toofull_ratio());
-
-  dout(10) << __func__ << ": ratio=" << ratio << ", caps per client "
-           << min_caps_per_client << "-" << max_caps_per_client << dendl;
-
-  set<Session*> sessions;
-  mds->sessionmap.get_client_session_set(sessions);
-
-  for (auto &session : sessions) {
+std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
+{
+  const auto now = clock::now();
+  const bool steady = flags&RecallFlags::STEADY;
+
+  const auto min_caps_per_client = g_conf->get_val<uint64_t>("mds_min_caps_per_client");
+  const auto recall_global_max_decay_threshold = g_conf->get_val<uint64_t>("mds_recall_global_max_decay_threshold");
+  const auto recall_max_caps = g_conf->get_val<uint64_t>("mds_recall_max_caps");
+  const auto recall_max_decay_threshold = g_conf->get_val<uint64_t>("mds_recall_max_decay_threshold");
+
+  dout(10) << __func__ << ": caps per client " << min_caps_per_client << "/" << Capability::count() << dendl;
+
+  /* trim caps of sessions with the most caps first */
+  std::multimap<uint64_t, Session*> caps_session;
+  auto f = [&caps_session](auto& s) {
+    caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(s->caps.size()), std::forward_as_tuple(s));
+  };
+  mds->sessionmap.get_client_sessions(std::move(f));
+
+  std::pair<bool, uint64_t> result = {false, 0};
+  auto& throttled = result.first;
+  auto& caps_recalled = result.second;
+  last_recall_state = now;
+  for (const auto p : boost::adaptors::reverse(caps_session)) {
+    auto& num_caps = p.first;
+    auto& session = p.second;
     if (!session->is_open() ||
         !session->connection.get() ||
        !session->info.inst.name.is_client())
       continue;
 
     dout(10) << " session " << session->info.inst
-            << " caps " << session->caps.size()
+            << " caps " << num_caps
             << ", leases " << session->leases.size()
             << dendl;
 
-    uint64_t newlim = MAX(MIN((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client);
-    if (session->caps.size() > newlim) {
-      MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
+    uint64_t newlim;
+    if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
+      newlim = min_caps_per_client;
+    } else {
+      newlim = num_caps-recall_max_caps;
+    }
+    if (num_caps > newlim) {
+      /* now limit the number of caps we recall at a time to prevent overloading ourselves */
+      uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
+      newlim = num_caps-recall;
+      const uint64_t session_recall_counter = session->cap_recalled_counter();
+      const uint64_t global_recall_counter = recall_counter.get(ceph_clock_now());
+      if (session_recall_counter+recall > recall_max_decay_threshold) {
+        dout(15) << "  session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_counter << "; skipping!" << dendl;
+        throttled = true;
+        continue;
+      } else if (global_recall_counter+recall > recall_global_max_decay_threshold) {
+        dout(15) << "  global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_counter << "; skipping!" << dendl;
+        throttled = true;
+        break;
+      }
+
+      // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
+      const auto& recalled_at = session->recalled_at;
+      auto last_recalled = std::chrono::duration<double>(now-recalled_at).count();
+      const auto& released_at = session->released_at;
+      const auto& last_recall_count = session->recall_count;
+      const auto& last_recall_release_count = session->recall_release_count;
+      const auto& last_recall_limit = session->recall_limit;
+      bool limit_similarity = (abs((double)num_caps-last_recall_limit+recall_max_caps)/(num_caps+recall_max_caps)) < 0.05;
+      if (last_recalled < 3600.0 && released_at < recalled_at && last_recall_count > 2*last_recall_release_count && limit_similarity && steady) {
+        /* The session has recently (1hr) been asked to release caps and we
+         * were unable to get at least half of the recalled caps.
+         */
+        dout(15) << "  last recalled " << last_recall_count << "/" << (last_recall_count+last_recall_limit)
+                 << " caps " << last_recalled << "s ago; released "
+                 << last_recall_release_count << " caps. Skipping because we are unlikely to get more released." << dendl;
+        continue;
+      }
+
+      auto m = new MClientSession(CEPH_SESSION_RECALL_STATE);
       m->head.max_caps = newlim;
       mds->send_message_client(m, session);
-      if (flush_client_session) {
+      if (gather) {
         flush_session(session, gather);
       }
       session->notify_recall_sent(newlim);
+      recall_counter.hit(ceph_clock_now(), recall);
+      caps_recalled += recall;
     }
   }
+
+  dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
+
+  return result;
 }
 
 void Server::force_clients_readonly()
index 4fff7ae6fe17c807e86f6831e0a46f70f61c7fe4..ab16a55bca24c73bdd7ebe5cd5de62c422d2a2b4 100644 (file)
@@ -17,6 +17,8 @@
 
 #include <boost/utility/string_view.hpp>
 
+#include <common/DecayCounter.h>
+
 #include "MDSRank.h"
 #include "Mutation.h"
 
@@ -121,6 +123,9 @@ public:
   void dump_reconnect_status(Formatter *f) const;
 
   void handle_client_session(class MClientSession *m);
+  time last_recalled() const {
+    return last_recall_state;
+  }
   void _session_logged(Session *session, uint64_t state_seq, 
                       bool open, version_t pv, interval_set<inodeno_t>& inos,version_t piv);
   version_t prepare_force_open_sessions(map<client_t,entity_inst_t> &cm,
@@ -141,8 +146,11 @@ public:
   void reconnect_tick();
   void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
 
-  void recall_client_state(double ratio, bool flush_client_session,
-                           MDSGatherBuilder *gather);
+  enum RecallFlags {
+    NONE = 0,
+    STEADY = (1<<0),
+  };
+  std::pair<bool, uint64_t> recall_client_state(MDSGatherBuilder* gather, enum RecallFlags=RecallFlags::NONE);
   void force_clients_readonly();
 
   // -- requests --
@@ -323,6 +331,9 @@ public:
 private:
   void reply_client_request(MDRequestRef& mdr, MClientReply *reply);
   void flush_session(Session *session, MDSGatherBuilder *gather);
+
+  DecayCounter recall_counter;
+  time last_recall_state;
 };
 
 #endif
index a3ddd8e3aeabfc7de0bdf65894dcca4c976a8099..8b58295e45ea097e906d98bcb9e1efbeab376440 100644 (file)
@@ -859,10 +859,12 @@ size_t Session::get_request_count()
  */
 void Session::notify_cap_release(size_t n_caps)
 {
-  if (recalled_at != time::min()) {
-    recall_release_count += n_caps;
-    if (recall_release_count >= recall_count)
-      clear_recalled_at();
+  recall_release_count += n_caps;
+  if (n_caps > 0) {
+    released_at = clock::now();
+    if (recall_count <= recall_release_count) {
+      clear_recalled();
+    }
   }
 }
 
@@ -874,23 +876,36 @@ void Session::notify_cap_release(size_t n_caps)
  */
 void Session::notify_recall_sent(const size_t new_limit)
 {
-  if (recalled_at == time::min()) {
-    // Entering recall phase, set up counters so we can later
-    // judge whether the client has respected the recall request
-    recalled_at = last_recall_sent = clock::now();
-    assert (new_limit < caps.size());  // Behaviour of Server::recall_client_state
-    recall_count = caps.size() - new_limit;
+  const auto num_caps = caps.size();
+  const auto count = num_caps-new_limit;
+
+  /* Entering recall phase, set up counters so we can later judge whether the
+   * client has respected the recall request. Update only if client has not
+   * released caps from a previous recall.
+   */
+
+  if (recall_limit != new_limit) {
+    const auto now = clock::now();
+    recalled_at = now;
+    assert (new_limit < num_caps);  // Behaviour of Server::recall_client_state
+    recall_count = count;
     recall_release_count = 0;
-  } else {
-    last_recall_sent = clock::now();
+    recall_limit = new_limit;
   }
+
+  /* Always hit the session counter as a RECALL message is still sent to the
+   * client and we do not want the MDS to burn its global counter tokens on a
+   * session that is not releasing caps (i.e. allow the session counter to
+   * throttle future RECALL messages).
+   */
+  cap_recalled.hit(count);
 }
 
-void Session::clear_recalled_at()
+void Session::clear_recalled()
 {
-  recalled_at = last_recall_sent = time::min();
   recall_count = 0;
   recall_release_count = 0;
+  recall_limit = 0;
 }
 
 /**
@@ -983,23 +998,41 @@ void SessionMap::hit_session(Session *session) {
 }
 
 void SessionMap::handle_conf_change(const struct md_config_t *conf,
-                                    const std::set <std::string> &changed) {
+                                    const std::set <std::string> &changed)
+{
+
   if (changed.count("mds_request_load_average_decay_rate")) {
-    decay_rate = conf->get_val<double>("mds_request_load_average_decay_rate");
-    dout(20) << __func__ << " decay rate changed to " << decay_rate << dendl;
+    auto d = g_conf->get_val<double>("mds_request_load_average_decay_rate");
+    dout(20) << __func__ << " decay rate changed to " << d << dendl;
 
-    total_load_avg_rate = DecayRate(decay_rate);
+    decay_rate = d;
+    total_load_avg = DecayCounter(ceph_clock_now(), d);
 
-    auto p = by_state.find(Session::STATE_OPEN);
-    if (p != by_state.end()) {
-      for (const auto &session : *(p->second)) {
-        session->set_load_avg_decay_rate(decay_rate);
+    auto it = by_state.find(Session::STATE_OPEN);
+    if (it != by_state.end()) {
+      for (const auto &session : *(it->second)) {
+        session->set_load_avg_decay_rate(d);
+      }
+    }
+    it = by_state.find(Session::STATE_STALE);
+    if (it != by_state.end()) {
+      for (const auto &session : *(it->second)) {
+        session->set_load_avg_decay_rate(d);
+      }
+    }
+  }
+  if (changed.count("mds_recall_max_decay_rate")) {
+    auto d = g_conf->get_val<double>("mds_recall_max_decay_rate");
+    auto it = by_state.find(Session::STATE_OPEN);
+    if (it != by_state.end()) {
+      for (const auto &session : *(it->second)) {
+        session->cap_recalled = DecayCounter(ceph_clock_now(), d);
       }
     }
-    p = by_state.find(Session::STATE_STALE);
-    if (p != by_state.end()) {
-      for (const auto &session : *(p->second)) {
-        session->set_load_avg_decay_rate(decay_rate);
+    it = by_state.find(Session::STATE_STALE);
+    if (it != by_state.end()) {
+      for (const auto &session : *(it->second)) {
+        session->cap_recalled = DecayCounter(ceph_clock_now(), d);
       }
     }
   }
index 44b3d11a0a4e2f7767e7fd5ef5938787e49a32a9..24cf2113f9276f43378e5dd9e2628de6b735f716 100644 (file)
@@ -27,6 +27,7 @@ using std::set;
 #include "mdstypes.h"
 #include "mds/MDSAuthCaps.h"
 #include "common/perf_counters.h"
+#include "common/DecayCounter.h"
 
 class CInode;
 struct MDRequestImpl;
@@ -112,6 +113,9 @@ private:
   mutable DecayCounter load_avg;
   DecayRate    load_avg_rate;
 
+  // caps being recalled recently by this session
+  DecayCounter cap_recalled;
+
   // session start time -- used to track average session time
   // note that this is initialized in the constructor rather
   // than at the time of adding a session to the sessionmap
@@ -151,10 +155,11 @@ public:
   std::string get_human_name() const {return human_name;}
 
   // Ephemeral state for tracking progress of capability recalls
-  time recalled_at = time::min();  // When was I asked to SESSION_RECALL?
-  time last_recall_sent = time::min();
+  time recalled_at = time::min();  // When was I first asked to SESSION_RECALL?
+  time released_at = time::min();  // When did the session last release caps?
   uint32_t recall_count = 0;  // How many caps was I asked to SESSION_RECALL?
   uint32_t recall_release_count = 0;  // How many caps have I actually revoked?
+  uint32_t recall_limit = 0;  // New limit in SESSION_RECALL
 
   session_info_t info;                         ///< durable bits
 
@@ -172,7 +177,10 @@ public:
 
   void notify_cap_release(size_t n_caps);
   void notify_recall_sent(const size_t new_limit);
-  void clear_recalled_at();
+  auto cap_recalled_counter() const {
+    return cap_recalled.get(ceph_clock_now());
+  }
+  void clear_recalled();
 
   inodeno_t next_ino() const {
     if (info.prealloc_inos.empty())
@@ -369,6 +377,7 @@ public:
 
   Session() = delete;
   Session(ConnectionRef con) :
+    cap_recalled(g_conf->get_val<double>("mds_recall_max_decay_rate")),
     birth_time(clock::now()),
     auth_caps(g_ceph_context),
     item_session_list(this),
@@ -511,10 +520,7 @@ public:
   void update_average_session_age();
 
   SessionMap() = delete;
-  explicit SessionMap(MDSRank *m)
-  :
-    mds(m)
-  {}
+  explicit SessionMap(MDSRank *m) : mds(m) {}
 
   ~SessionMap() override
   {
@@ -608,12 +614,20 @@ public:
 
   void dump();
 
-  void get_client_session_set(set<Session*>& s) const {
-    for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
-        p != session_map.end();
-        ++p)
-      if (p->second->info.inst.name.is_client())
-       s.insert(p->second);
+  template<typename F>
+  void get_client_sessions(F&& f) const {
+    for (const auto& p : session_map) {
+      auto& session = p.second;
+      if (session->info.inst.name.is_client())
+       f(session);
+    }
+  }
+  template<typename C>
+  void get_client_session_set(C& c) const {
+    auto f = [&c](Session* s) {
+      c.insert(s);
+    };
+    get_client_sessions(f);
   }
 
   void replay_open_sessions(map<client_t,entity_inst_t>& client_map) {