via the `ceph tell mds.<foo> cache drop` command or large reductions in the
cache size will no longer cause service unavailability.
+* The CephFS MDS behavior with recalling caps has been significantly improved
+  so that it no longer attempts to recall too many caps at once, which
+  previously led to instability. MDS daemons with a large cache (64GB+) should
+  be more stable.
+
>=13.1.0
--------
tasks:
- exec:
mon.a:
- - "ceph tell mds.* config set mds_max_ratio_caps_per_client 1"
- "ceph tell mds.* config set mds_min_caps_per_client 1"
- background_exec:
mon.a:
mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
self.assertTrue(open_files >= mds_min_caps_per_client)
- mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client"))
mount_a_client_id = self.mount_a.get_global_id()
path = "subdir/mount_a" if use_subdir else "mount_a"
# The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
# which depend on the caps outstanding, cache size and overall ratio
- recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2))
def expected_caps():
num_caps = self.get_session(mount_a_client_id)['num_caps']
if num_caps < mds_min_caps_per_client:
raise RuntimeError("client caps fell below min!")
elif num_caps == mds_min_caps_per_client:
return True
- elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05:
+ elif num_caps < cache_size:
return True
else:
return False
.set_default(1024)
.set_description("number of omap keys to read from the SessionMap in one operation"),
+ Option("mds_recall_max_caps", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+ .set_default(5000)
+ .set_description("maximum number of caps to recall from client session in single recall"),
+
+ Option("mds_recall_max_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(2.5)
+ .set_description("decay rate for throttle on recalled caps on a session"),
+
+ Option("mds_recall_max_decay_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+ .set_default(16_K)
+ .set_description("decay threshold for throttle on recalled caps on a session"),
+
+ Option("mds_recall_global_max_decay_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+ .set_default(64_K)
+ .set_description("decay threshold for throttle on recalled caps globally"),
+
Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(60)
.set_description("timeout for clients late on cap recall to create health warnings"),
.set_default(100)
.set_description("minimum number of capabilities a client may hold"),
- Option("mds_max_ratio_caps_per_client", Option::TYPE_FLOAT, Option::LEVEL_DEV)
- .set_default(.8)
- .set_description("maximum ratio of current caps that may be recalled during MDS cache pressure"),
-
Option("mds_hack_allow_loading_invalid_metadata", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description("INTENTIONALLY CAUSE DATA LOSS by bypasing checks for invalid metadata on disk. Allows testing repair tools."),
{
set<Session*> sessions;
mds->sessionmap.get_client_session_set(sessions);
+ auto now = clock::now();
- auto mds_recall_state_timeout = g_conf()->mds_recall_state_timeout;
- auto last_recall = mds->mdcache->last_recall_state;
- auto last_recall_span = std::chrono::duration<double>(clock::now()-last_recall).count();
- bool recall_state_timedout = last_recall_span > mds_recall_state_timeout;
+ const auto mds_recall_state_timeout = g_conf()->mds_recall_state_timeout;
+ const auto last_recall = mds->server->last_recalled();
+ const auto last_recall_span = std::chrono::duration<double>(now-last_recall).count();
+ const bool recall_state_timedout = last_recall_span > mds_recall_state_timeout;
std::list<MDSHealthMetric> late_recall_metrics;
std::list<MDSHealthMetric> large_completed_requests_metrics;
for (auto& session : sessions) {
- if (session->recalled_at != Session::clock::zero()) {
- auto last_recall_sent = session->last_recall_sent;
- auto recalled_at = session->recalled_at;
- auto recalled_at_span = std::chrono::duration<double>(clock::now()-recalled_at).count();
-
- dout(20) << "Session servicing RECALL " << session->info.inst
- << ": " << recalled_at_span << "s ago " << session->recall_release_count
- << "/" << session->recall_count << dendl;
- if (recall_state_timedout || last_recall_sent < last_recall) {
+ const auto& recall_release_count = session->recall_release_count;
+ const auto& recall_count = session->recall_count;
+ if (recall_release_count < recall_count) {
+ const auto& recalled_at = session->recalled_at;
+ const auto& released_at = session->released_at;
+ const auto recalled_at_span = std::chrono::duration<double>(now-recalled_at).count();
+ const auto released_at_span = std::chrono::duration<double>(now-released_at).count();
+
+ dout(20) << "Session " << session->info.inst
+ << " last released " << recall_release_count << "/" << recall_count << " caps "
+ << released_at_span << "s ago " << dendl;
+ if (recall_state_timedout) {
dout(20) << " no longer recall" << dendl;
- session->clear_recalled_at();
- } else if (recalled_at_span > mds_recall_state_timeout) {
- dout(20) << " exceeded timeout " << recalled_at_span << " vs. " << mds_recall_state_timeout << dendl;
+ session->clear_recalled();
+ } else if (released_at_span > mds_recall_state_timeout && recalled_at_span > mds_recall_state_timeout) {
+ dout(20) << " exceeded timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
std::ostringstream oss;
oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
m.metadata["client_id"] = stringify(session->get_client());
late_recall_metrics.push_back(m);
} else {
- dout(20) << " within timeout " << recalled_at_span << " vs. " << mds_recall_state_timeout << dendl;
+ dout(20) << " within timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
}
}
if ((session->get_num_trim_requests_warnings() > 0 &&
mds->mlogger->set(l_mdm_heap, last.get_heap());
if (cache_toofull()) {
- last_recall_state = clock::now();
- mds->server->recall_client_state(-1.0, false, nullptr);
+ mds->server->recall_client_state(nullptr);
}
// If the cache size had exceeded its limit, but we're back in bounds
void trim_client_leases();
void check_memory_usage();
- time last_recall_state;
-
// shutdown
private:
set<inodeno_t> shutdown_exporting_strays;
"mds_inject_migrator_session_race",
"host",
"fsid",
- "mds_request_load_average_decay_rate",
"mds_cap_revoke_eviction_timeout",
+ // SessionMap
+ "mds_request_load_average_decay_rate",
+ "mds_recall_max_decay_rate",
NULL
};
return KEYS;
Formatter *f, Context *on_finish)
: MDSInternalContext(mds),
server(server), mdcache(mdcache), mdlog(mdlog),
- recall_timeout(recall_timeout), f(f), on_finish(on_finish),
+ recall_timeout(recall_timeout), recall_start(mono_clock::now()),
+ f(f), on_finish(on_finish),
whoami(mds->whoami), incarnation(mds->incarnation) {
}
f->open_object_section("result");
MDSGatherBuilder *gather = new MDSGatherBuilder(g_ceph_context);
- server->recall_client_state(1.0, true, gather);
+ auto [throttled, count] = server->recall_client_state(gather, Server::RecallFlags::STEADY);
+ dout(10) << __func__
+ << (throttled ? " (throttled)" : "")
+ << " recalled " << count << " caps" << dendl;
+
if (!gather->has_subs()) {
- handle_recall_client_state(0);
delete gather;
- return;
+ return handle_recall_client_state(0);
}
C_ContextTimeout *ctx = new C_ContextTimeout(
MDCache *mdcache;
MDLog *mdlog;
uint64_t recall_timeout;
+ mono_time recall_start;
Formatter *f;
Context *on_finish;
#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
+#include <boost/range/adaptor/reversed.hpp>
#include "MDSRank.h"
#include "Server.h"
#include "osd/OSDMap.h"
#include <errno.h>
+#include <math.h>
#include <list>
#include <iostream>
reconnect_done(NULL),
failed_reconnects(0),
reconnect_evicting(false),
- terminating_sessions(false)
+ terminating_sessions(false),
+ recall_counter(g_conf().get_val<double>("mds_recall_max_decay_rate"))
{
supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
}
dout(20) << __func__ << " cap revoke eviction timeout changed to "
<< cap_revoke_eviction_timeout << dendl;
}
+ if (changed.count("mds_recall_max_decay_rate")) {
+ recall_counter = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
+ }
}
/*
}
}
-
/**
* Call this when the MDCache is oversized, to send requests to the clients
* to trim some caps, and consequently unpin some inodes in the MDCache so
* that it can trim too.
*/
-void Server::recall_client_state(double ratio, bool flush_client_session,
- MDSGatherBuilder *gather) {
- if (flush_client_session) {
- assert(gather != nullptr);
- }
-
- /* try to recall at least 80% of all caps */
- uint64_t max_caps_per_client = Capability::count() * g_conf().get_val<double>("mds_max_ratio_caps_per_client");
- uint64_t min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
- if (max_caps_per_client < min_caps_per_client) {
- dout(0) << "max_caps_per_client " << max_caps_per_client
- << " < min_caps_per_client " << min_caps_per_client << dendl;
- max_caps_per_client = min_caps_per_client + 1;
- }
-
- /* unless this ratio is smaller: */
- /* ratio: determine the amount of caps to recall from each client. Use
- * percentage full over the cache reservation. Cap the ratio at 80% of client
- * caps. */
- if (ratio < 0.0)
- ratio = 1.0 - fmin(0.80, mdcache->cache_toofull_ratio());
-
- dout(10) << __func__ << ": ratio=" << ratio << ", caps per client "
- << min_caps_per_client << "-" << max_caps_per_client << dendl;
-
- set<Session*> sessions;
- mds->sessionmap.get_client_session_set(sessions);
-
- for (auto &session : sessions) {
+std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
+{
+ const auto now = clock::now();
+ const bool steady = flags&RecallFlags::STEADY;
+
+ const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
+ const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
+ const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
+ const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
+
+ dout(10) << __func__ << ": caps per client " << min_caps_per_client << "/" << Capability::count() << dendl;
+
+ /* trim caps of sessions with the most caps first */
+ std::multimap<uint64_t, Session*> caps_session;
+ auto f = [&caps_session](auto& s) {
+ caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(s->caps.size()), std::forward_as_tuple(s));
+ };
+ mds->sessionmap.get_client_sessions(std::move(f));
+
+ std::pair<bool, uint64_t> result = {false, 0};
+ auto& [throttled, caps_recalled] = result;
+ last_recall_state = now;
+ for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
if (!session->is_open() ||
!session->get_connection() ||
!session->info.inst.name.is_client())
continue;
dout(10) << " session " << session->info.inst
- << " caps " << session->caps.size()
+ << " caps " << num_caps
<< ", leases " << session->leases.size()
<< dendl;
- uint64_t newlim = std::max(std::min<uint64_t>((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client);
- if (session->caps.size() > newlim) {
+ uint64_t newlim;
+ if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
+ newlim = min_caps_per_client;
+ } else {
+ newlim = num_caps-recall_max_caps;
+ }
+ if (num_caps > newlim) {
+ /* now limit the number of caps we recall at a time to prevent overloading ourselves */
+ uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
+ newlim = num_caps-recall;
+ const uint64_t session_recall_counter = session->cap_recalled_counter();
+ const uint64_t global_recall_counter = recall_counter.get();
+ if (session_recall_counter+recall > recall_max_decay_threshold) {
+ dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_counter << "; skipping!" << dendl;
+ throttled = true;
+ continue;
+ } else if (global_recall_counter+recall > recall_global_max_decay_threshold) {
+ dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_counter << "; skipping!" << dendl;
+ throttled = true;
+ break;
+ }
+
+ // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
+ const auto& recalled_at = session->recalled_at;
+ auto last_recalled = std::chrono::duration<double>(now-recalled_at).count();
+ const auto& released_at = session->released_at;
+ const auto& last_recall_count = session->recall_count;
+ const auto& last_recall_release_count = session->recall_release_count;
+ const auto& last_recall_limit = session->recall_limit;
+ bool limit_similarity = (abs((double)num_caps-last_recall_limit+recall_max_caps)/(num_caps+recall_max_caps)) < 0.05;
+ if (last_recalled < 3600.0 && released_at < recalled_at && last_recall_count > 2*last_recall_release_count && limit_similarity && steady) {
+ /* The session has recently (1hr) been asked to release caps and we
+ * were unable to get at least half of the recalled caps.
+ */
+ dout(15) << " last recalled " << last_recall_count << "/" << (last_recall_count+last_recall_limit)
+ << " caps " << last_recalled << "s ago; released "
+ << last_recall_release_count << " caps. Skipping because we are unlikely to get more released." << dendl;
+ continue;
+ }
+
auto m = MClientSession::create(CEPH_SESSION_RECALL_STATE);
m->head.max_caps = newlim;
mds->send_message_client(m, session);
- if (flush_client_session) {
+ if (gather) {
flush_session(session, gather);
}
session->notify_recall_sent(newlim);
+ recall_counter.hit(recall);
+ caps_recalled += recall;
}
}
+
+ dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
+
+ return result;
}
void Server::force_clients_readonly()
#include <string_view>
+#include <common/DecayCounter.h>
+
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
bool waiting_for_reconnect(client_t c) const;
void dump_reconnect_status(Formatter *f) const;
+ time last_recalled() const {
+ return last_recall_state;
+ }
+
void handle_client_session(const MClientSession::const_ref &m);
void _session_logged(Session *session, uint64_t state_seq,
bool open, version_t pv, interval_set<inodeno_t>& inos,version_t piv);
void reconnect_tick();
void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
- void recall_client_state(double ratio, bool flush_client_session,
- MDSGatherBuilder *gather);
+ enum RecallFlags {
+ NONE = 0,
+ STEADY = (1<<0),
+ };
+ std::pair<bool, uint64_t> recall_client_state(MDSGatherBuilder* gather, enum RecallFlags=RecallFlags::NONE);
void force_clients_readonly();
// -- requests --
private:
void reply_client_request(MDRequestRef& mdr, const MClientReply::ref &reply);
void flush_session(Session *session, MDSGatherBuilder *gather);
+
+ DecayCounter recall_counter;
+ time last_recall_state;
};
#endif
*/
void Session::notify_cap_release(size_t n_caps)
{
- if (recalled_at != clock::zero()) {
- recall_release_count += n_caps;
- if (recall_release_count >= recall_count)
- clear_recalled_at();
+ recall_release_count += n_caps;
+ if (n_caps > 0) {
+ released_at = clock::now();
+ if (recall_count <= recall_release_count) {
+ clear_recalled();
+ }
}
}
*/
void Session::notify_recall_sent(const size_t new_limit)
{
- if (recalled_at == clock::zero()) {
- // Entering recall phase, set up counters so we can later
- // judge whether the client has respected the recall request
- recalled_at = last_recall_sent = clock::now();
- assert (new_limit < caps.size()); // Behaviour of Server::recall_client_state
- recall_count = caps.size() - new_limit;
+ const auto num_caps = caps.size();
+ const auto count = num_caps-new_limit;
+
+ /* Entering recall phase, set up counters so we can later judge whether the
+ * client has respected the recall request. Update only if client has not
+ * released caps from a previous recall.
+ */
+
+ if (recall_limit != new_limit) {
+ const auto now = clock::now();
+ recalled_at = now;
+ assert (new_limit < num_caps); // Behaviour of Server::recall_client_state
+ recall_count = count;
recall_release_count = 0;
- } else {
- last_recall_sent = clock::now();
+ recall_limit = new_limit;
}
+
+ /* Always hit the session counter as a RECALL message is still sent to the
+ * client and we do not want the MDS to burn its global counter tokens on a
+ * session that is not releasing caps (i.e. allow the session counter to
+ * throttle future RECALL messages).
+ */
+ cap_recalled.hit(count);
}
-void Session::clear_recalled_at()
+void Session::clear_recalled()
{
- recalled_at = last_recall_sent = clock::zero();
recall_count = 0;
recall_release_count = 0;
+ recall_limit = 0;
}
/**
}
void SessionMap::handle_conf_change(const ConfigProxy &conf,
- const std::set <std::string> &changed) {
- if (changed.count("mds_request_load_average_decay_rate")) {
- decay_rate = g_conf().get_val<double>("mds_request_load_average_decay_rate");
- dout(20) << __func__ << " decay rate changed to " << decay_rate << dendl;
-
- total_load_avg = DecayCounter(decay_rate);
-
- auto p = by_state.find(Session::STATE_OPEN);
- if (p != by_state.end()) {
- for (const auto &session : *(p->second)) {
- session->set_load_avg_decay_rate(decay_rate);
+ const std::set <std::string> &changed)
+{
+ auto apply_to_open_sessions = [this](auto f) {
+ if (auto it = by_state.find(Session::STATE_OPEN); it != by_state.end()) {
+ for (const auto &session : *(it->second)) {
+ f(session);
}
}
- p = by_state.find(Session::STATE_STALE);
- if (p != by_state.end()) {
- for (const auto &session : *(p->second)) {
- session->set_load_avg_decay_rate(decay_rate);
+ if (auto it = by_state.find(Session::STATE_STALE); it != by_state.end()) {
+ for (const auto &session : *(it->second)) {
+ f(session);
}
}
+ };
+
+ if (changed.count("mds_request_load_average_decay_rate")) {
+ auto d = g_conf().get_val<double>("mds_request_load_average_decay_rate");
+ dout(20) << __func__ << " decay rate changed to " << d << dendl;
+
+ decay_rate = d;
+ total_load_avg = DecayCounter(d);
+
+ auto mut = [d](auto s) {
+ s->set_load_avg_decay_rate(d);
+ };
+ apply_to_open_sessions(mut);
+ }
+ if (changed.count("mds_recall_max_decay_rate")) {
+ auto d = g_conf().get_val<double>("mds_recall_max_decay_rate");
+ auto mut = [d](auto s) {
+ s->cap_recalled = DecayCounter(d);
+ };
+ apply_to_open_sessions(mut);
}
}
#include "mdstypes.h"
#include "mds/MDSAuthCaps.h"
#include "common/perf_counters.h"
+#include "common/DecayCounter.h"
class CInode;
struct MDRequestImpl;
// request load average for this session
DecayCounter load_avg;
+ // caps being recalled recently by this session
+ DecayCounter cap_recalled;
+
// session start time -- used to track average session time
// note that this is initialized in the constructor rather
// than at the time of adding a session to the sessionmap
const std::string& get_human_name() const {return human_name;}
// Ephemeral state for tracking progress of capability recalls
- time recalled_at = clock::zero(); // When was I asked to SESSION_RECALL?
- time last_recall_sent = clock::zero();
+ time recalled_at = clock::zero(); // When was I first asked to SESSION_RECALL?
+ time released_at = clock::zero(); // When did the session last release caps?
uint32_t recall_count = 0; // How many caps was I asked to SESSION_RECALL?
uint32_t recall_release_count = 0; // How many caps have I actually revoked?
+ uint32_t recall_limit = 0; // New limit in SESSION_RECALL
session_info_t info; ///< durable bits
void notify_cap_release(size_t n_caps);
void notify_recall_sent(const size_t new_limit);
- void clear_recalled_at();
+ auto cap_recalled_counter() const {
+ return cap_recalled.get();
+ }
+ void clear_recalled();
inodeno_t next_ino() const {
if (info.prealloc_inos.empty())
Session() = delete;
Session(ConnectionRef con) :
+ cap_recalled(g_conf().get_val<double>("mds_recall_max_decay_rate")),
birth_time(clock::now()),
auth_caps(g_ceph_context),
item_session_list(this),
void update_average_session_age();
SessionMap() = delete;
- explicit SessionMap(MDSRank *m)
- :
- mds(m)
- {}
+ explicit SessionMap(MDSRank *m) : mds(m) {}
~SessionMap() override
{
void dump();
- void get_client_session_set(set<Session*>& s) const {
- for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
- p != session_map.end();
- ++p)
- if (p->second->info.inst.name.is_client())
- s.insert(p->second);
+ template<typename F>
+ void get_client_sessions(F&& f) const {
+ for (const auto& p : session_map) {
+ auto& session = p.second;
+ if (session->info.inst.name.is_client())
+ f(session);
+ }
+ }
+ template<typename C>
+ void get_client_session_set(C& c) const {
+ auto f = [&c](auto& s) {
+ c.insert(s);
+ };
+ get_client_sessions(f);
}
void replay_open_sessions(map<client_t,entity_inst_t>& client_map,