]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mds: blocklist clients with "bloated" session metadata
authorVenky Shankar <vshankar@redhat.com>
Fri, 11 Aug 2023 08:36:52 +0000 (04:36 -0400)
committerVenky Shankar <vshankar@redhat.com>
Tue, 22 Aug 2023 04:47:26 +0000 (00:47 -0400)
Buggy clients (or maybe a MDS bug) causes a huge buildup of
`completed_requests` metadata in its session information.
This could cause the MDS to go read-only when its flushing
session metadata to the journal since the bloated metadata
causes the ODSOp payload to exceed the maximum write size.

Blocklist such clients so as to allow the MDS to continue
servicing requests.

Fixes: http://tracker.ceph.com/issues/61947
Signed-off-by: Venky Shankar <vshankar@redhat.com>
src/common/options/mds.yaml.in
src/mds/MDSRank.cc
src/mds/SessionMap.cc
src/mds/SessionMap.h

index d320ce139db4663148a4b79e33f55c205b101f97..5a06326f9fc8b01711a42a9821c7c8b00c425c0a 100644 (file)
@@ -1577,3 +1577,13 @@ options:
   - mds
   flags:
   - runtime
+- name: mds_session_metadata_threshold
+  type: size
+  level: advanced
+  desc: Evict non-advancing client-tid sessions exceeding the config size.
+  long_desc: Evict clients which are not advancing their request tids which causes a large buildup of session metadata (`completed_requests`) in the MDS causing the MDS to go read-only since the RADOS operation exceeds the size threashold. This config is the maximum size (in bytes) that a session metadata (encoded) can grow.
+  default: 16_M
+  services:
+  - mds
+  flags:
+  - runtime
index 31ea7f6710f92af726141b995fab189120c2211f..cc810d7e39a142f3cf9713554db234a06a4845be 100644 (file)
@@ -3867,6 +3867,7 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
     "mds_session_cap_acquisition_throttle",
     "mds_session_max_caps_throttle_ratio",
     "mds_symlink_recovery",
+    "mds_session_metadata_threshold",
     NULL
   };
   return KEYS;
index c21ac469356e0f4f9ceef415c191339d6093bdb0..873dedbe68dcf975a574b977bd05e660d5800af2 100644 (file)
@@ -45,6 +45,11 @@ class SessionMapIOContext : public MDSIOContextBase
 };
 };
 
+SessionMap::SessionMap(MDSRank *m)
+  : mds(m),
+    mds_session_metadata_threshold(g_conf().get_val<Option::size_t>("mds_session_metadata_threshold")) {
+}
+
 void SessionMap::register_perfcounters()
 {
   PerfCountersBuilder plb(g_ceph_context, "mds_sessions",
@@ -375,6 +380,11 @@ public:
 };
 }
 
+bool SessionMap::validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl) {
+  session->info.encode(bl, mds->mdsmap->get_up_features());
+  return bl.length() < mds_session_metadata_threshold;
+}
+
 void SessionMap::save(MDSContext *onsave, version_t needv)
 {
   dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl;
@@ -410,6 +420,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
 
   dout(20) << " updating keys:" << dendl;
   map<string, bufferlist> to_set;
+  std::set<entity_name_t> to_blocklist;
   for(std::set<entity_name_t>::iterator i = dirty_sessions.begin();
       i != dirty_sessions.end(); ++i) {
     const entity_name_t name = *i;
@@ -420,13 +431,19 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
        session->is_stale() ||
        session->is_killing()) {
       dout(20) << "  " << name << dendl;
-      // Serialize K
-      CachedStackStringStream css;
-      *css << name;
 
       // Serialize V
       bufferlist bl;
-      session->info.encode(bl, mds->mdsmap->get_up_features());
+      if (!validate_and_encode_session(mds, session, bl)) {
+       derr << __func__ << ": session (" << name << ") exceeds"
+            << " sesion metadata threshold - blocklisting" << dendl;
+       to_blocklist.emplace(name);
+       continue;
+      }
+
+      // Serialize K
+      CachedStackStringStream css;
+      *css << name;
 
       // Add to RADOS op
       to_set[std::string(css->strv())] = bl;
@@ -461,6 +478,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
                        0,
                        new C_OnFinisher(new C_IO_SM_Save(this, version),
                                         mds->finisher));
+  apply_blocklist(to_blocklist);
 }
 
 void SessionMap::_save_finish(version_t v)
@@ -826,7 +844,8 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
 {
   ceph_assert(gather_bld != NULL);
 
-  std::vector<entity_name_t> write_sessions;
+  std::set<entity_name_t> to_blocklist;
+  std::map<entity_name_t, bufferlist> write_sessions;
 
   // Decide which sessions require a write
   for (std::set<entity_name_t>::iterator i = tgt_sessions.begin();
@@ -851,13 +870,24 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
       // need to pre-empt that.
       continue;
     }
+
+    // Serialize V
+    bufferlist bl;
+    if (!validate_and_encode_session(mds, session, bl)) {
+      derr << __func__ << ": session (" << session_id << ") exceeds"
+          << " sesion metadata threshold - blocklisting" << dendl;
+      to_blocklist.emplace(session_id);
+      continue;
+    }
+
     // Okay, passed all our checks, now we write
     // this session out.  The version we write
     // into the OMAP may now be higher-versioned
     // than the version in the header, but that's
     // okay because it's never a problem to have
     // an overly-fresh copy of a session.
-    write_sessions.push_back(*i);
+    write_sessions.emplace(session_id, std::move(bl));
+    session->clear_dirty_completed_requests();
   }
 
   dout(4) << __func__ << ": writing " << write_sessions.size() << dendl;
@@ -865,21 +895,15 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
   // Batch writes into mds_sessionmap_keys_per_op
   const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op;
   map<string, bufferlist> to_set;
-  for (uint32_t i = 0; i < write_sessions.size(); ++i) {
-    const entity_name_t &session_id = write_sessions[i];
-    Session *session = session_map[session_id];
-    session->clear_dirty_completed_requests();
 
+  uint32_t i = 0;
+  for (auto &[session_id, bl] : write_sessions) {
     // Serialize K
     CachedStackStringStream css;
     *css << session_id;
 
-    // Serialize V
-    bufferlist bl;
-    session->info.encode(bl, mds->mdsmap->get_up_features());
-
     // Add to RADOS op
-    to_set[css->str()] = bl;
+    to_set[css->str()] = std::move(bl);
 
     // Complete this write transaction?
     if (i == write_sessions.size() - 1
@@ -898,7 +922,10 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
                              new C_IO_SM_Save_One(this, on_safe),
                              mds->finisher));
     }
+    ++i;
   }
+
+  apply_blocklist(to_blocklist);
 }
 
 // =================
@@ -1112,6 +1139,10 @@ void SessionMap::handle_conf_change(const std::set<std::string>& changed)
     };
     apply_to_open_sessions(mut);
   }
+
+  if (changed.count("mds_session_metadata_threshold")) {
+    mds_session_metadata_threshold = g_conf().get_val<Option::size_t>("mds_session_metadata_threshold");
+  }
 }
 
 void SessionMap::update_average_session_age() {
@@ -1123,6 +1154,20 @@ void SessionMap::update_average_session_age() {
   logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime);
 }
 
+void SessionMap::apply_blocklist(const std::set<entity_name_t>& victims) {
+  if (victims.empty()) {
+    return;
+  }
+
+  C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop);
+  for (auto &victim : victims) {
+    CachedStackStringStream css;
+    mds->evict_client(victim.num(), false, g_conf()->mds_session_blocklist_on_evict, *css,
+                     gather.new_sub());
+  }
+  gather.activate();
+}
+
 int SessionFilter::parse(
     const std::vector<std::string> &args,
     std::ostream *ss)
index 067e1474cc3345c78accba5b08329c8ac2ac33ab..c1c28a8455badc9886daa3f136a488a73cb911f0 100644 (file)
@@ -594,7 +594,7 @@ protected:
 class SessionMap : public SessionMapStore {
 public:
   SessionMap() = delete;
-  explicit SessionMap(MDSRank *m) : mds(m) {}
+  explicit SessionMap(MDSRank *m);
 
   ~SessionMap() override
   {
@@ -843,6 +843,11 @@ private:
   }
 
   time avg_birth_time = clock::zero();
+
+  size_t mds_session_metadata_threshold;
+
+  bool validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl);
+  void apply_blocklist(const std::set<entity_name_t>& victims);
 };
 
 std::ostream& operator<<(std::ostream &out, const Session &s);