]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: throttle cap acquisition via readdir
authorKotresh HR <khiremat@redhat.com>
Wed, 7 Oct 2020 11:33:48 +0000 (17:03 +0530)
committerKotresh HR <khiremat@redhat.com>
Thu, 7 Jan 2021 10:07:44 +0000 (15:37 +0530)
A trivial "find" command on a large directory hierarchy will cause the
client to receive caps significantly faster than it will release. The
MDS will try to have the client reduce its caps below the
mds_max_caps_per_client limit but the recall throttles prevent it from
catching up to the pace of acquisition. The solution is to throttle
readdir from client. This patch does the same.

The readdir is throttled on the condition that the number of caps
acquired is greater than certain percentage of mds_max_caps_per_client
(default is 10%) and cap acquisition via readdir is certain percentage
of mds_max_caps_per_client (the default is 50%). When the above
condition is met, the readdir request is retried after
'mds_cap_acquisition_throttle_retry_request_timeout' (default is 0.5)
seconds.

Fixes: https://tracker.ceph.com/issues/47307
Signed-off-by: Kotresh HR <khiremat@redhat.com>
(cherry picked from commit c0de657d3f99f8a3a0d89576dff2f8e98f5f8974)

Conflicts:
  src/mds/MDSRank.cc: mds_heartbeat_grace config is not tracked by MDSRankDispatcher in nautilus
  src/mds/Server.h: Per session client metrics is not available in nautilus
  src/mds/Server.cc: Per session client metrics and inode delegation during replayed requests
                     fix is not present in nautilus
  src/mds/SessionMap.h: Session class is not restructured in nautilus
  qa/tasks/cephfs/cephfs_test_case.py: Using 'rank=None' in perf_dump throws error. Used 'rank=0'

qa/tasks/cephfs/cephfs_test_case.py
qa/tasks/cephfs/test_client_limits.py
src/common/options.cc
src/mds/MDSRank.cc
src/mds/Server.cc
src/mds/Server.h
src/mds/SessionMap.cc
src/mds/SessionMap.h

index 5ca8f0d54837769e76f8c407910cf390b50c9be1..f901f44ba494473bfad6f8e357ff9d47c9f2d574 100644 (file)
@@ -229,6 +229,9 @@ class CephFSTestCase(CephTestCase):
     def _session_by_id(self, session_ls):
         return dict([(s['id'], s) for s in session_ls])
 
+    def perf_dump(self, rank=0, status=None):
+        return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status)
+
     def wait_until_evicted(self, client_id, timeout=30):
         def is_client_evicted():
             ls = self._session_list()
index e32259795cb6624a70b7add85460a4c14e473dde..613a405a6a3191d68236651c79ae955e85123e08 100644 (file)
@@ -150,6 +150,36 @@ class TestClientLimits(CephFSTestCase):
         else:
             raise RuntimeError("expected no client recall warning")
 
+    def test_cap_acquisition_throttle_readdir(self):
+        """
+        Mostly readdir acquires caps faster than the mds recalls, so the cap
+        acquisition via readdir is throttled by retrying the readdir after
+        a fraction of second (0.5) by default when throttling condition is met.
+        """
+
+        max_caps_per_client = 500
+        cap_acquisition_throttle = 250
+
+        self.config_set('mds', 'mds_max_caps_per_client', max_caps_per_client)
+        self.config_set('mds', 'mds_session_cap_acquisition_throttle', cap_acquisition_throttle)
+
+        # Create 1500 files split across 6 directories, 250 each.
+        for i in range(1, 7):
+            self.mount_a.create_n_files("dir{0}/file".format(i), cap_acquisition_throttle, sync=True)
+
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # recursive readdir
+        self.mount_a.run_shell_payload("find | wc")
+
+        # validate cap_acquisition decay counter after readdir to exceed throttle count i.e 250
+        cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
+        self.assertGreaterEqual(cap_acquisition_value, cap_acquisition_throttle)
+
+        # validate the throttle condition to be hit atleast once
+        cap_acquisition_throttle_hit_count = self.perf_dump()['mds_server']['cap_acquisition_throttle']
+        self.assertGreaterEqual(cap_acquisition_throttle_hit_count, 1)
+
     def test_client_release_bug(self):
         """
         When a client has a bug (which we will simulate) preventing it from releasing caps,
index a4bd01a0f18ac8473958c3f503b8de9221772680..c891c71e92db32f9d88b1a741aac53e57b9f6d53 100644 (file)
@@ -7846,6 +7846,24 @@ std::vector<Option> get_mds_options() {
     .set_description("decay magnitude for preemptively recalling caps on quiet client")
     .set_long_description("This is the order of magnitude difference (in base 2) of the internal liveness decay counter and the number of capabilities the session holds. When this difference occurs, the MDS treats the session as quiescent and begins recalling capabilities."),
 
+    Option("mds_session_cap_acquisition_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("decay rate for session readdir caps leading to readdir throttle")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_long_description("The half-life for the session cap acquisition counter of caps acquired by readdir. This is used for throttling readdir requests from clients slow to release caps."),
+
+    Option("mds_session_cap_acquisition_throttle", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(500000)
+    .set_description("throttle point for cap acquisition decay counter"),
+
+    Option("mds_session_max_caps_throttle_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.1)
+    .set_description("ratio of mds_max_maps_per_client that client must exceed before readdir may be throttled by cap acquisition throttle"),
+
+    Option("mds_cap_acquisition_throttle_retry_request_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.5)
+    .set_description("timeout in seconds after which a client request is retried due to cap acquisition throttling"),
+
     Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
     .set_default(30)
     .set_description(""),
index 728dd636b23713f71a371f29c0b4d2db027218d7..1633caec402a90cca40e6a45547b25f504bb1f79 100644 (file)
@@ -3744,6 +3744,11 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
     "mds_request_load_average_decay_rate",
     "mds_session_cache_liveness_decay_rate",
     "mds_replay_unsafe_with_closed_session",
+    "mds_session_cap_acquisition_decay_rate",
+    "mds_max_caps_per_client",
+    "mds_session_cap_acquisition_throttle",
+    "mds_session_max_caps_throttle_ratio",
+    "mds_cap_acquisition_throttle_retry_request_time",
     NULL
   };
   return KEYS;
index 853679530d8c10eec01d2ff45048ae1d87a78656..5d0be194dedc08bd7e122e374fb6be31419543a9 100644 (file)
@@ -112,6 +112,9 @@ void Server::create_logger()
                       PerfCountersBuilder::PRIO_INTERESTING);
   plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
                       "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
+  plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
+                      "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
+                      PerfCountersBuilder::PRIO_INTERESTING);
 
   // fop latencies are useful
   plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
@@ -196,6 +199,10 @@ Server::Server(MDSRank *m) :
   max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
   replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
   cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
+  max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
+  cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
+  max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
+  caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
   supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
 }
 
@@ -1117,6 +1124,18 @@ void Server::handle_conf_change(const std::set<std::string>& changed) {
     dout(20) << __func__ << " max snapshots per directory changed to "
             << max_snaps_per_dir << dendl;
   }
+  if (changed.count("mds_max_caps_per_client")) {
+    max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
+  }
+  if (changed.count("mds_session_cap_acquisition_throttle")) {
+    cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
+  }
+  if (changed.count("mds_session_max_caps_throttle_ratio")) {
+    max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
+  }
+  if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
+    caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
+  }
 }
 
 /*
@@ -4260,6 +4279,7 @@ void Server::handle_client_openc(MDRequestRef& mdr)
 void Server::handle_client_readdir(MDRequestRef& mdr)
 {
   const MClientRequest::const_ref &req = mdr->client_request;
+  Session *session = mds->get_session(req);
   client_t client = req->get_source().num();
   MutationImpl::LockOpVec lov;
   CInode *diri = rdlock_path_pin_ref(mdr, 0, lov, false, true);
@@ -4273,6 +4293,19 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
     return;
   }
 
+  auto num_caps = session->get_num_caps();
+  auto session_cap_acquisition = session->get_cap_acquisition();
+
+  if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
+      dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
+              << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
+      if (logger)
+          logger->inc(l_mdss_cap_acquisition_throttle);
+
+      mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+  }
+
   lov.add_rdlock(&diri->filelock);
   lov.add_rdlock(&diri->dirfragtreelock);
 
@@ -4470,6 +4503,8 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
     mdcache->lru.lru_touch(dn);
   }
   
+  session->touch_readdir_cap(numfiles);
+
   __u16 flags = 0;
   if (end) {
     flags = CEPH_READDIR_FRAG_END;
index 2c5177cdb0257edbf6c98fb5593cfc7603fee3e1..715e8496cdb8dcb80ed1eda2b9ab3d9f9e282250 100644 (file)
@@ -76,6 +76,7 @@ enum {
   l_mdss_req_symlink_latency,
   l_mdss_req_unlink_latency,
   l_mdss_cap_revoke_eviction,
+  l_mdss_cap_acquisition_throttle,
   l_mdss_last,
 };
 
@@ -355,6 +356,12 @@ private:
 
   DecayCounter recall_throttle;
   time last_recall_state;
+
+  // Cache cap acquisition throttle configs
+  uint64_t max_caps_per_client;
+  uint64_t cap_acquisition_throttle;
+  double max_caps_throttle_ratio;
+  double caps_throttle_retry_request_timeout;
 };
 
 static inline constexpr auto operator|(Server::RecallFlags a, Server::RecallFlags b) {
index ae80cef5d4bb1b559c2969d0f685a0826d36c47d..34e6132f788d554e6e39c1df123aa7eb9189109e 100644 (file)
@@ -589,6 +589,7 @@ void Session::dump(Formatter *f) const
   f->dump_object("recall_caps_throttle", recall_caps_throttle);
   f->dump_object("recall_caps_throttle2o", recall_caps_throttle2o);
   f->dump_object("session_cache_liveness", session_cache_liveness);
+  f->dump_object("cap_acquisition", cap_acquisition);
   info.dump(f);
 }
 
@@ -1076,6 +1077,13 @@ void SessionMap::handle_conf_change(const std::set<std::string>& changed)
     };
     apply_to_open_sessions(mut);
   }
+  if (changed.count("mds_session_cap_acquisition_decay_rate")) {
+    auto d = g_conf().get_val<double>("mds_session_cap_acquisition_decay_rate");
+    auto mut = [d](auto s) {
+      s->cap_acquisition = DecayCounter(d);
+    };
+    apply_to_open_sessions(mut);
+  }
 }
 
 void SessionMap::update_average_session_age() {
index f14dfb4dc36bbf8e9150b07e128062aba43cdbda..dd7721ccf4a076f510e473e6184c6370509c2d24 100644 (file)
@@ -131,6 +131,9 @@ private:
   // session caps liveness
   DecayCounter session_cache_liveness;
 
+  // cap acquisition via readdir
+  DecayCounter cap_acquisition;
+
   // session start time -- used to track average session time
   // note that this is initialized in the constructor rather
   // than at the time of adding a session to the sessionmap
@@ -210,6 +213,9 @@ public:
   auto get_session_cache_liveness() const {
     return session_cache_liveness.get();
   }
+  auto get_cap_acquisition() const {
+    return cap_acquisition.get();
+  }
 
   inodeno_t next_ino() const {
     if (info.prealloc_inos.empty())
@@ -311,6 +317,10 @@ public:
     }
   }
 
+  void touch_readdir_cap(uint32_t count) {
+    cap_acquisition.hit(count);
+  }
+
   void touch_cap(Capability *cap) {
     session_cache_liveness.hit(1.0);
     caps.push_front(&cap->item_session_caps);
@@ -426,6 +436,7 @@ public:
     recall_caps_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
     recall_caps_throttle2o(0.5),
     session_cache_liveness(g_conf().get_val<double>("mds_session_cache_liveness_decay_rate")),
+    cap_acquisition(g_conf().get_val<double>("mds_session_cap_acquisition_decay_rate")),
     birth_time(clock::now()),
     auth_caps(g_ceph_context),
     item_session_list(this),