]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: blocklist clients with "bloated" session metadata
authorVenky Shankar <vshankar@redhat.com>
Fri, 11 Aug 2023 08:36:52 +0000 (04:36 -0400)
committerVenky Shankar <vshankar@redhat.com>
Mon, 9 Oct 2023 13:14:57 +0000 (18:44 +0530)
Buggy clients (or maybe a MDS bug) causes a huge buildup of
`completed_requests` metadata in its session information.
This could cause the MDS to go read-only when its flushing
session metadata to the journal since the bloated metadata
causes the ODSOp payload to exceed the maximum write size.

Blocklist such clients so as to allow the MDS to continue
servicing requests.

Fixes: http://tracker.ceph.com/issues/61947
Signed-off-by: Venky Shankar <vshankar@redhat.com>
(cherry picked from commit bc6814d72a9fbec9c41ed75aee2314666cfca34b)

 Conflicts:
src/common/options/mds.yaml.in
src/mds/MDSRank.cc

Straight-forward conflict is mds.yaml.in due to missing configs in
reef and likewise in MDSRank.cc

src/common/options/mds.yaml.in
src/mds/MDSRank.cc
src/mds/SessionMap.cc
src/mds/SessionMap.h

index 44a07941f617115132950039de914d636887abc9..6eb0702fcdda183ec516ee65d6a2d7c0c9632831 100644 (file)
@@ -1524,3 +1524,13 @@ options:
   - mds
   flags:
   - runtime
+- name: mds_session_metadata_threshold
+  type: size
+  level: advanced
+  desc: Evict non-advancing client-tid sessions exceeding the config size.
+  long_desc: Evict clients which are not advancing their request tids which causes a large buildup of session metadata (`completed_requests`) in the MDS causing the MDS to go read-only since the RADOS operation exceeds the size threashold. This config is the maximum size (in bytes) that a session metadata (encoded) can grow.
+  default: 16_M
+  services:
+  - mds
+  flags:
+  - runtime
index 5141a8515ca971abd41f4c069d9da74a22254b1c..dc6d0b8988bc24daaec9ec6687f7de69b15a1812 100644 (file)
@@ -3795,6 +3795,7 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
     "mds_extraordinary_events_dump_interval",
     "mds_inject_rename_corrupt_dentry_first",
     "mds_inject_journal_corrupt_dentry_first",
+    "mds_session_metadata_threshold",
     NULL
   };
   return KEYS;
index 2364c973e679370d0d8824a7ec6d5008b40d1a4b..41b35cdb0732ff9311e2267decc54f231624d662 100644 (file)
@@ -45,6 +45,11 @@ class SessionMapIOContext : public MDSIOContextBase
 };
 };
 
+SessionMap::SessionMap(MDSRank *m)
+  : mds(m),
+    mds_session_metadata_threshold(g_conf().get_val<Option::size_t>("mds_session_metadata_threshold")) {
+}
+
 void SessionMap::register_perfcounters()
 {
   PerfCountersBuilder plb(g_ceph_context, "mds_sessions",
@@ -375,6 +380,11 @@ public:
 };
 }
 
+bool SessionMap::validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl) {
+  session->info.encode(bl, mds->mdsmap->get_up_features());
+  return bl.length() < mds_session_metadata_threshold;
+}
+
 void SessionMap::save(MDSContext *onsave, version_t needv)
 {
   dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl;
@@ -410,6 +420,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
 
   dout(20) << " updating keys:" << dendl;
   map<string, bufferlist> to_set;
+  std::set<entity_name_t> to_blocklist;
   for(std::set<entity_name_t>::iterator i = dirty_sessions.begin();
       i != dirty_sessions.end(); ++i) {
     const entity_name_t name = *i;
@@ -420,13 +431,19 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
        session->is_stale() ||
        session->is_killing()) {
       dout(20) << "  " << name << dendl;
-      // Serialize K
-      CachedStackStringStream css;
-      *css << name;
 
       // Serialize V
       bufferlist bl;
-      session->info.encode(bl, mds->mdsmap->get_up_features());
+      if (!validate_and_encode_session(mds, session, bl)) {
+       derr << __func__ << ": session (" << name << ") exceeds"
+            << " sesion metadata threshold - blocklisting" << dendl;
+       to_blocklist.emplace(name);
+       continue;
+      }
+
+      // Serialize K
+      CachedStackStringStream css;
+      *css << name;
 
       // Add to RADOS op
       to_set[std::string(css->strv())] = bl;
@@ -461,6 +478,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
                        0,
                        new C_OnFinisher(new C_IO_SM_Save(this, version),
                                         mds->finisher));
+  apply_blocklist(to_blocklist);
 }
 
 void SessionMap::_save_finish(version_t v)
@@ -823,7 +841,8 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
 {
   ceph_assert(gather_bld != NULL);
 
-  std::vector<entity_name_t> write_sessions;
+  std::set<entity_name_t> to_blocklist;
+  std::map<entity_name_t, bufferlist> write_sessions;
 
   // Decide which sessions require a write
   for (std::set<entity_name_t>::iterator i = tgt_sessions.begin();
@@ -848,13 +867,24 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
       // need to pre-empt that.
       continue;
     }
+
+    // Serialize V
+    bufferlist bl;
+    if (!validate_and_encode_session(mds, session, bl)) {
+      derr << __func__ << ": session (" << session_id << ") exceeds"
+          << " sesion metadata threshold - blocklisting" << dendl;
+      to_blocklist.emplace(session_id);
+      continue;
+    }
+
     // Okay, passed all our checks, now we write
     // this session out.  The version we write
     // into the OMAP may now be higher-versioned
     // than the version in the header, but that's
     // okay because it's never a problem to have
     // an overly-fresh copy of a session.
-    write_sessions.push_back(*i);
+    write_sessions.emplace(session_id, std::move(bl));
+    session->clear_dirty_completed_requests();
   }
 
   dout(4) << __func__ << ": writing " << write_sessions.size() << dendl;
@@ -862,21 +892,15 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
   // Batch writes into mds_sessionmap_keys_per_op
   const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op;
   map<string, bufferlist> to_set;
-  for (uint32_t i = 0; i < write_sessions.size(); ++i) {
-    const entity_name_t &session_id = write_sessions[i];
-    Session *session = session_map[session_id];
-    session->clear_dirty_completed_requests();
 
+  uint32_t i = 0;
+  for (auto &[session_id, bl] : write_sessions) {
     // Serialize K
     CachedStackStringStream css;
     *css << session_id;
 
-    // Serialize V
-    bufferlist bl;
-    session->info.encode(bl, mds->mdsmap->get_up_features());
-
     // Add to RADOS op
-    to_set[css->str()] = bl;
+    to_set[css->str()] = std::move(bl);
 
     // Complete this write transaction?
     if (i == write_sessions.size() - 1
@@ -895,7 +919,10 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
                              new C_IO_SM_Save_One(this, on_safe),
                              mds->finisher));
     }
+    ++i;
   }
+
+  apply_blocklist(to_blocklist);
 }
 
 // =================
@@ -1109,6 +1136,10 @@ void SessionMap::handle_conf_change(const std::set<std::string>& changed)
     };
     apply_to_open_sessions(mut);
   }
+
+  if (changed.count("mds_session_metadata_threshold")) {
+    mds_session_metadata_threshold = g_conf().get_val<Option::size_t>("mds_session_metadata_threshold");
+  }
 }
 
 void SessionMap::update_average_session_age() {
@@ -1120,6 +1151,20 @@ void SessionMap::update_average_session_age() {
   logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime);
 }
 
+void SessionMap::apply_blocklist(const std::set<entity_name_t>& victims) {
+  if (victims.empty()) {
+    return;
+  }
+
+  C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop);
+  for (auto &victim : victims) {
+    CachedStackStringStream css;
+    mds->evict_client(victim.num(), false, g_conf()->mds_session_blocklist_on_evict, *css,
+                     gather.new_sub());
+  }
+  gather.activate();
+}
+
 int SessionFilter::parse(
     const std::vector<std::string> &args,
     std::ostream *ss)
index e59f7f26484501083d98d76ea31e5415c75feb25..426a9ac9354fea84ad1a85d1b995b68dff0977f2 100644 (file)
@@ -589,7 +589,7 @@ protected:
 class SessionMap : public SessionMapStore {
 public:
   SessionMap() = delete;
-  explicit SessionMap(MDSRank *m) : mds(m) {}
+  explicit SessionMap(MDSRank *m);
 
   ~SessionMap() override
   {
@@ -838,6 +838,11 @@ private:
   }
 
   time avg_birth_time = clock::zero();
+
+  size_t mds_session_metadata_threshold;
+
+  bool validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl);
+  void apply_blocklist(const std::set<entity_name_t>& victims);
 };
 
 std::ostream& operator<<(std::ostream &out, const Session &s);