]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: throttle cap acquisition via readdir 37618/head
authorKotresh HR <khiremat@redhat.com>
Wed, 7 Oct 2020 11:33:48 +0000 (17:03 +0530)
committerKotresh HR <khiremat@redhat.com>
Thu, 22 Oct 2020 13:26:43 +0000 (18:56 +0530)
A trivial "find" command on a large directory hierarchy will cause the
client to receive caps significantly faster than it will release. The
MDS will try to have the client reduce its caps below the
mds_max_caps_per_client limit but the recall throttles prevent it from
catching up to the pace of acquisition. The solution is to throttle
readdir from client. This patch does the same.

The readdir is throttled on the condition that the number of caps
acquired is greater than certain percentage of mds_max_caps_per_client
(default is 10%) and cap acquisition via readdir is certain percentage
of mds_max_caps_per_client (the default is 50%). When the above
condition is met, the readdir request is retried after
'mds_cap_acquisition_throttle_retry_request_timeout' (default is 0.5)
seconds.

Fixes: https://tracker.ceph.com/issues/47307
Signed-off-by: Kotresh HR <khiremat@redhat.com>
qa/tasks/cephfs/cephfs_test_case.py
qa/tasks/cephfs/test_client_limits.py
src/common/options.cc
src/mds/MDSRank.cc
src/mds/Server.cc
src/mds/Server.h
src/mds/SessionMap.cc
src/mds/SessionMap.h

index 036986b54b080ef205888d6218b8e20c202501f5..db4fa8e982ac420bb2709e57e1f28adb413c1fd5 100644 (file)
@@ -260,6 +260,9 @@ class CephFSTestCase(CephTestCase):
     def _session_by_id(self, session_ls):
         return dict([(s['id'], s) for s in session_ls])
 
+    def perf_dump(self, rank=None, status=None):
+        return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status)
+
     def wait_until_evicted(self, client_id, timeout=30):
         def is_client_evicted():
             ls = self._session_list()
index 51c3048bd63a8ca64668a0fb47263312b78f3ed3..9cfda425488cd7f34720ed700347dde8bd1e7d32 100644 (file)
@@ -156,6 +156,36 @@ class TestClientLimits(CephFSTestCase):
         else:
             raise RuntimeError("expected no client recall warning")
 
+    def test_cap_acquisition_throttle_readdir(self):
+        """
+        Mostly readdir acquires caps faster than the mds recalls, so the cap
+        acquisition via readdir is throttled by retrying the readdir after
+        a fraction of second (0.5) by default when throttling condition is met.
+        """
+
+        max_caps_per_client = 500
+        cap_acquisition_throttle = 250
+
+        self.config_set('mds', 'mds_max_caps_per_client', max_caps_per_client)
+        self.config_set('mds', 'mds_session_cap_acquisition_throttle', cap_acquisition_throttle)
+
+        # Create 1500 files split across 6 directories, 250 each.
+        for i in range(1, 7):
+            self.mount_a.create_n_files("dir{0}/file".format(i), cap_acquisition_throttle, sync=True)
+
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # recursive readdir
+        self.mount_a.run_shell_payload("find | wc")
+
+        # validate cap_acquisition decay counter after readdir to exceed throttle count i.e 250
+        cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
+        self.assertGreaterEqual(cap_acquisition_value, cap_acquisition_throttle)
+
+        # validate the throttle condition to be hit atleast once
+        cap_acquisition_throttle_hit_count = self.perf_dump()['mds_server']['cap_acquisition_throttle']
+        self.assertGreaterEqual(cap_acquisition_throttle_hit_count, 1)
+
     def test_client_release_bug(self):
         """
         When a client has a bug (which we will simulate) preventing it from releasing caps,
index d5a5a10a0f2f6a7c0fa85c2f0c8b0d03bd70a554..24f282bbe9379c610c0e97c06c4a00a282b78978 100644 (file)
@@ -8010,6 +8010,24 @@ std::vector<Option> get_mds_options() {
     .set_flag(Option::FLAG_RUNTIME)
     .set_long_description("This is the order of magnitude difference (in base 2) of the internal liveness decay counter and the number of capabilities the session holds. When this difference occurs, the MDS treats the session as quiescent and begins recalling capabilities."),
 
+    Option("mds_session_cap_acquisition_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("decay rate for session readdir caps leading to readdir throttle")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_long_description("The half-life for the session cap acquisition counter of caps acquired by readdir. This is used for throttling readdir requests from clients slow to release caps."),
+
+    Option("mds_session_cap_acquisition_throttle", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(500000)
+    .set_description("throttle point for cap acquisition decay counter"),
+
+    Option("mds_session_max_caps_throttle_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.1)
+    .set_description("ratio of mds_max_maps_per_client that client must exceed before readdir may be throttled by cap acquisition throttle"),
+
+    Option("mds_cap_acquisition_throttle_retry_request_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.5)
+    .set_description("timeout in seconds after which a client request is retried due to cap acquisition throttling"),
+
     Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
     .set_default(30)
     .set_description(""),
index 6735a05cffd9766d23ec4c9ea647d28e0fbee2f7..80046d723a65035ec0ce8d8762ef40f94115da53 100644 (file)
@@ -3637,6 +3637,11 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
     "mds_request_load_average_decay_rate",
     "mds_session_cache_liveness_decay_rate",
     "mds_heartbeat_grace",
+    "mds_session_cap_acquisition_decay_rate",
+    "mds_max_caps_per_client",
+    "mds_session_cap_acquisition_throttle",
+    "mds_session_max_caps_throttle_ratio",
+    "mds_cap_acquisition_throttle_retry_request_time",
     NULL
   };
   return KEYS;
index 3d5fb415d23c7f7e6bf41e4d1ffb44689bf001d7..96e358813af6b3b830c4d73a55d435f23c156694 100644 (file)
@@ -174,6 +174,9 @@ void Server::create_logger()
                       PerfCountersBuilder::PRIO_INTERESTING);
   plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
                       "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
+  plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
+                      "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
+                      PerfCountersBuilder::PRIO_INTERESTING);
 
   // fop latencies are useful
   plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
@@ -255,6 +258,10 @@ Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
   cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
   max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
   delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
+  max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
+  cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
+  max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
+  caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
   supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
 }
 
@@ -1222,6 +1229,18 @@ void Server::handle_conf_change(const std::set<std::string>& changed) {
   if (changed.count("mds_client_delegate_inos_pct")) {
     delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
   }
+  if (changed.count("mds_max_caps_per_client")) {
+    max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
+  }
+  if (changed.count("mds_session_cap_acquisition_throttle")) {
+    cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
+  }
+  if (changed.count("mds_session_max_caps_throttle_ratio")) {
+    max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
+  }
+  if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
+    caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
+  }
 }
 
 /*
@@ -4468,6 +4487,7 @@ void Server::handle_client_openc(MDRequestRef& mdr)
 void Server::handle_client_readdir(MDRequestRef& mdr)
 {
   const cref_t<MClientRequest> &req = mdr->client_request;
+  Session *session = mds->get_session(req);
   client_t client = req->get_source().num();
   MutationImpl::LockOpVec lov;
   CInode *diri = rdlock_path_pin_ref(mdr, false, true);
@@ -4481,6 +4501,19 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
     return;
   }
 
+  auto num_caps = session->get_num_caps();
+  auto session_cap_acquisition = session->get_cap_acquisition();
+
+  if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
+      dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
+              << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
+      if (logger)
+          logger->inc(l_mdss_cap_acquisition_throttle);
+
+      mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+  }
+
   lov.add_rdlock(&diri->filelock);
   lov.add_rdlock(&diri->dirfragtreelock);
 
@@ -4679,6 +4712,8 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
     mdcache->lru.lru_touch(dn);
   }
   
+  session->touch_readdir_cap(numfiles);
+
   __u16 flags = 0;
   if (end) {
     flags = CEPH_READDIR_FRAG_END;
index df0b9cb16c420ce9d7d28213cbd1d1e991b5cf41..d03d22834ff8360b03d7fe12f9217d8b5c8759d5 100644 (file)
@@ -79,6 +79,7 @@ enum {
   l_mdss_req_symlink_latency,
   l_mdss_req_unlink_latency,
   l_mdss_cap_revoke_eviction,
+  l_mdss_cap_acquisition_throttle,
   l_mdss_last,
 };
 
@@ -460,6 +461,12 @@ private:
   time last_recall_state;
 
   MetricsHandler *metrics_handler;
+
+  // Cache cap acquisition throttle configs
+  uint64_t max_caps_per_client;
+  uint64_t cap_acquisition_throttle;
+  double max_caps_throttle_ratio;
+  double caps_throttle_retry_request_timeout;
 };
 
 static inline constexpr auto operator|(Server::RecallFlags a, Server::RecallFlags b) {
index fe85968abb9685a1550ae639be8d5ec3550463da..93e957829e178f5c9449f9f09df3b1dbfb824544 100644 (file)
@@ -597,6 +597,7 @@ void Session::dump(Formatter *f, bool cap_dump) const
   f->dump_object("recall_caps_throttle", recall_caps_throttle);
   f->dump_object("recall_caps_throttle2o", recall_caps_throttle2o);
   f->dump_object("session_cache_liveness", session_cache_liveness);
+  f->dump_object("cap_acquisition", cap_acquisition);
   info.dump(f);
 }
 
@@ -1087,6 +1088,13 @@ void SessionMap::handle_conf_change(const std::set<std::string>& changed)
     };
     apply_to_open_sessions(mut);
   }
+  if (changed.count("mds_session_cap_acquisition_decay_rate")) {
+    auto d = g_conf().get_val<double>("mds_session_cap_acquisition_decay_rate");
+    auto mut = [d](auto s) {
+      s->cap_acquisition = DecayCounter(d);
+    };
+    apply_to_open_sessions(mut);
+  }
 }
 
 void SessionMap::update_average_session_age() {
index 02be9ba594680e0506638d217f2aeca9af42fdde..1d97f5d0917bdaef920511becb8146791a0fabb5 100644 (file)
@@ -92,6 +92,7 @@ public:
     recall_caps_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
     recall_caps_throttle2o(0.5),
     session_cache_liveness(g_conf().get_val<double>("mds_session_cache_liveness_decay_rate")),
+    cap_acquisition(g_conf().get_val<double>("mds_session_cap_acquisition_decay_rate")),
     birth_time(clock::now())
   {
     set_connection(std::move(con));
@@ -167,6 +168,9 @@ public:
   auto get_session_cache_liveness() const {
     return session_cache_liveness.get();
   }
+  auto get_cap_acquisition() const {
+    return cap_acquisition.get();
+  }
 
   inodeno_t take_ino(inodeno_t ino = 0) {
     if (ino) {
@@ -289,6 +293,10 @@ public:
     }
   }
 
+  void touch_readdir_cap(uint32_t count) {
+    cap_acquisition.hit(count);
+  }
+
   void touch_cap(Capability *cap) {
     session_cache_liveness.hit(1.0);
     caps.push_front(&cap->item_session_caps);
@@ -472,6 +480,9 @@ private:
   // session caps liveness
   DecayCounter session_cache_liveness;
 
+  // cap acquisition via readdir
+  DecayCounter cap_acquisition;
+
   // session start time -- used to track average session time
   // note that this is initialized in the constructor rather
   // than at the time of adding a session to the sessionmap