git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: blocklist clients with "bloated" session metadata
author Venky Shankar <vshankar@redhat.com>
Fri, 11 Aug 2023 08:36:52 +0000 (04:36 -0400)
committer Venky Shankar <vshankar@redhat.com>
Fri, 10 Nov 2023 09:55:28 +0000 (15:25 +0530)
Buggy clients (or maybe an MDS bug) cause a huge buildup of
`completed_requests` metadata in their session information.
This could cause the MDS to go read-only when it's flushing
session metadata to the journal since the bloated metadata
causes the OSDOp payload to exceed the maximum write size.

Blocklist such clients so as to allow the MDS to continue
servicing requests.

Fixes: http://tracker.ceph.com/issues/61947
Signed-off-by: Venky Shankar <vshankar@redhat.com>
(cherry picked from commit bc6814d72a9fbec9c41ed75aee2314666cfca34b)

 Conflicts:
src/common/options/mds.yaml.in
src/mds/MDSRank.cc

Pacific uses old-style config file (src/common/options.cc), so
adjust to that.

src/common/options.cc
src/mds/MDSRank.cc
src/mds/SessionMap.cc
src/mds/SessionMap.h

index 3dd1a7f73e88a8c5de9a042f659626d2dcd1d689..6f1cec24cc84317bb9b2f1792ee1a8f3529d85d6 100644 (file)
@@ -9030,8 +9030,13 @@ std::vector<Option> get_mds_options() {
      .set_default(true)
      .set_flag(Option::FLAG_RUNTIME)
      .set_description("Do not evict client if any osd is laggy")
-     .set_long_description("Laggy OSD(s) can make clients laggy or unresponsive, this can lead to their eviction, this option once enabled can help defer client eviction.")
+     .set_long_description("Laggy OSD(s) can make clients laggy or unresponsive, this can lead to their eviction, this option once enabled can help defer client eviction."),
 
+    Option("mds_session_metadata_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+     .set_default(16_M)
+     .set_flag(Option::FLAG_RUNTIME)
+     .set_description("Evict non-advancing client-tid sessions exceeding the config size.")
+     .set_long_description("Evict clients which are not advancing their request tids which causes a large buildup of session metadata (`completed_requests`) in the MDS causing the MDS to go read-only since the RADOS operation exceeds the size threashold. This config is the maximum size (in bytes) that a session metadata (encoded) can grow.")
   });
 }
 
index 4a5f933589a1a2d90012f76b8017fe24f6b6a18f..bf85e848ee252a55572eff5d683a6be0638c13c9 100644 (file)
@@ -3785,6 +3785,7 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
     "mds_dir_max_entries",
     "mds_inject_rename_corrupt_dentry_first",
     "mds_inject_journal_corrupt_dentry_first",
+    "mds_session_metadata_threshold",
     NULL
   };
   return KEYS;
index eecf54e952440ad0f6d4c7117704908af035b98f..7d994a653a684ff5e300fbb1d4070f9e7b7cebb8 100644 (file)
@@ -43,6 +43,11 @@ class SessionMapIOContext : public MDSIOContextBase
 };
 };
 
+SessionMap::SessionMap(MDSRank *m)
+  : mds(m),
+    mds_session_metadata_threshold(g_conf().get_val<Option::size_t>("mds_session_metadata_threshold")) {
+}
+
 void SessionMap::register_perfcounters()
 {
   PerfCountersBuilder plb(g_ceph_context, "mds_sessions",
@@ -373,6 +378,11 @@ public:
 };
 }
 
+bool SessionMap::validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl) {
+  session->info.encode(bl, mds->mdsmap->get_up_features());
+  return bl.length() < mds_session_metadata_threshold;
+}
+
 void SessionMap::save(MDSContext *onsave, version_t needv)
 {
   dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl;
@@ -408,6 +418,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
 
   dout(20) << " updating keys:" << dendl;
   map<string, bufferlist> to_set;
+  std::set<entity_name_t> to_blocklist;
   for(std::set<entity_name_t>::iterator i = dirty_sessions.begin();
       i != dirty_sessions.end(); ++i) {
     const entity_name_t name = *i;
@@ -418,13 +429,19 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
        session->is_stale() ||
        session->is_killing()) {
       dout(20) << "  " << name << dendl;
-      // Serialize K
-      CachedStackStringStream css;
-      *css << name;
 
       // Serialize V
       bufferlist bl;
-      session->info.encode(bl, mds->mdsmap->get_up_features());
+      if (!validate_and_encode_session(mds, session, bl)) {
+       derr << __func__ << ": session (" << name << ") exceeds"
+            << " sesion metadata threshold - blocklisting" << dendl;
+       to_blocklist.emplace(name);
+       continue;
+      }
+
+      // Serialize K
+      CachedStackStringStream css;
+      *css << name;
 
       // Add to RADOS op
       to_set[std::string(css->strv())] = bl;
@@ -459,6 +476,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
                        0,
                        new C_OnFinisher(new C_IO_SM_Save(this, version),
                                         mds->finisher));
+  apply_blocklist(to_blocklist);
 }
 
 void SessionMap::_save_finish(version_t v)
@@ -824,7 +842,8 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
 {
   ceph_assert(gather_bld != NULL);
 
-  std::vector<entity_name_t> write_sessions;
+  std::set<entity_name_t> to_blocklist;
+  std::map<entity_name_t, bufferlist> write_sessions;
 
   // Decide which sessions require a write
   for (std::set<entity_name_t>::iterator i = tgt_sessions.begin();
@@ -849,13 +868,24 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
       // need to pre-empt that.
       continue;
     }
+
+    // Serialize V
+    bufferlist bl;
+    if (!validate_and_encode_session(mds, session, bl)) {
+      derr << __func__ << ": session (" << session_id << ") exceeds"
+          << " sesion metadata threshold - blocklisting" << dendl;
+      to_blocklist.emplace(session_id);
+      continue;
+    }
+
     // Okay, passed all our checks, now we write
     // this session out.  The version we write
     // into the OMAP may now be higher-versioned
     // than the version in the header, but that's
     // okay because it's never a problem to have
     // an overly-fresh copy of a session.
-    write_sessions.push_back(*i);
+    write_sessions.emplace(session_id, std::move(bl));
+    session->clear_dirty_completed_requests();
   }
 
   dout(4) << __func__ << ": writing " << write_sessions.size() << dendl;
@@ -863,21 +893,15 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
   // Batch writes into mds_sessionmap_keys_per_op
   const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op;
   map<string, bufferlist> to_set;
-  for (uint32_t i = 0; i < write_sessions.size(); ++i) {
-    const entity_name_t &session_id = write_sessions[i];
-    Session *session = session_map[session_id];
-    session->clear_dirty_completed_requests();
 
+  uint32_t i = 0;
+  for (auto &[session_id, bl] : write_sessions) {
     // Serialize K
     CachedStackStringStream css;
     *css << session_id;
 
-    // Serialize V
-    bufferlist bl;
-    session->info.encode(bl, mds->mdsmap->get_up_features());
-
     // Add to RADOS op
-    to_set[css->str()] = bl;
+    to_set[css->str()] = std::move(bl);
 
     // Complete this write transaction?
     if (i == write_sessions.size() - 1
@@ -896,7 +920,10 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
                              new C_IO_SM_Save_One(this, on_safe),
                              mds->finisher));
     }
+    ++i;
   }
+
+  apply_blocklist(to_blocklist);
 }
 
 // =================
@@ -1110,6 +1137,10 @@ void SessionMap::handle_conf_change(const std::set<std::string>& changed)
     };
     apply_to_open_sessions(mut);
   }
+
+  if (changed.count("mds_session_metadata_threshold")) {
+    mds_session_metadata_threshold = g_conf().get_val<Option::size_t>("mds_session_metadata_threshold");
+  }
 }
 
 void SessionMap::update_average_session_age() {
@@ -1121,6 +1152,20 @@ void SessionMap::update_average_session_age() {
   logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime);
 }
 
+void SessionMap::apply_blocklist(const std::set<entity_name_t>& victims) {
+  if (victims.empty()) {
+    return;
+  }
+
+  C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop);
+  for (auto &victim : victims) {
+    CachedStackStringStream css;
+    mds->evict_client(victim.num(), false, g_conf()->mds_session_blocklist_on_evict, *css,
+                     gather.new_sub());
+  }
+  gather.activate();
+}
+
 int SessionFilter::parse(
     const std::vector<std::string> &args,
     std::ostream *ss)
index 067e1474cc3345c78accba5b08329c8ac2ac33ab..c1c28a8455badc9886daa3f136a488a73cb911f0 100644 (file)
@@ -594,7 +594,7 @@ protected:
 class SessionMap : public SessionMapStore {
 public:
   SessionMap() = delete;
-  explicit SessionMap(MDSRank *m) : mds(m) {}
+  explicit SessionMap(MDSRank *m);
 
   ~SessionMap() override
   {
@@ -843,6 +843,11 @@ private:
   }
 
   time avg_birth_time = clock::zero();
+
+  size_t mds_session_metadata_threshold;
+
+  bool validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl);
+  void apply_blocklist(const std::set<entity_name_t>& victims);
 };
 
 std::ostream& operator<<(std::ostream &out, const Session &s);