]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: blocklist clients with "bloated" session metadata
authorVenky Shankar <vshankar@redhat.com>
Fri, 11 Aug 2023 08:36:52 +0000 (04:36 -0400)
committerVenky Shankar <vshankar@redhat.com>
Fri, 8 Sep 2023 05:50:12 +0000 (11:20 +0530)
Buggy clients (or maybe a MDS bug) causes a huge buildup of
`completed_requests` metadata in its session information.
This could cause the MDS to go read-only when its flushing
session metadata to the journal since the bloated metadata
causes the ODSOp payload to exceed the maximum write size.

Blocklist such clients so as to allow the MDS to continue
servicing requests.

Fixes: http://tracker.ceph.com/issues/61947
Signed-off-by: Venky Shankar <vshankar@redhat.com>
(cherry picked from commit bc6814d72a9fbec9c41ed75aee2314666cfca34b)

 Conflicts:
src/common/options/mds.yaml.in
src/mds/MDSRank.cc

Straight-forward conflicts due to missing configs.

src/common/options/mds.yaml.in
src/mds/MDSRank.cc
src/mds/SessionMap.cc
src/mds/SessionMap.h

index 90c5df4a101057066ecaa2ae96717983aab443c2..96bb62eada2a01e3bbae15176070f8c98efcd15b 100644 (file)
@@ -1491,3 +1491,13 @@ options:
   - mds
   flags:
   - runtime
+- name: mds_session_metadata_threshold
+  type: size
+  level: advanced
+  desc: Evict non-advancing client-tid sessions exceeding the config size.
+  long_desc: Evict clients which are not advancing their request tids which causes a large buildup of session metadata (`completed_requests`) in the MDS causing the MDS to go read-only since the RADOS operation exceeds the size threashold. This config is the maximum size (in bytes) that a session metadata (encoded) can grow.
+  default: 16_M
+  services:
+  - mds
+  flags:
+  - runtime
index 03a78cc32211038090f1b4d102f628b6ea53dad1..f2edc92199822d97448ad8004e7b0466aaf0bdf8 100644 (file)
@@ -3774,6 +3774,7 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
     "mds_symlink_recovery",
     "mds_inject_rename_corrupt_dentry_first",
     "mds_inject_journal_corrupt_dentry_first",
+    "mds_session_metadata_threshold",
     NULL
   };
   return KEYS;
index c21ac469356e0f4f9ceef415c191339d6093bdb0..873dedbe68dcf975a574b977bd05e660d5800af2 100644 (file)
@@ -45,6 +45,11 @@ class SessionMapIOContext : public MDSIOContextBase
 };
 };
 
+SessionMap::SessionMap(MDSRank *m)
+  : mds(m),
+    mds_session_metadata_threshold(g_conf().get_val<Option::size_t>("mds_session_metadata_threshold")) {
+}
+
 void SessionMap::register_perfcounters()
 {
   PerfCountersBuilder plb(g_ceph_context, "mds_sessions",
@@ -375,6 +380,11 @@ public:
 };
 }
 
+bool SessionMap::validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl) {
+  session->info.encode(bl, mds->mdsmap->get_up_features());
+  return bl.length() < mds_session_metadata_threshold;
+}
+
 void SessionMap::save(MDSContext *onsave, version_t needv)
 {
   dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl;
@@ -410,6 +420,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
 
   dout(20) << " updating keys:" << dendl;
   map<string, bufferlist> to_set;
+  std::set<entity_name_t> to_blocklist;
   for(std::set<entity_name_t>::iterator i = dirty_sessions.begin();
       i != dirty_sessions.end(); ++i) {
     const entity_name_t name = *i;
@@ -420,13 +431,19 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
        session->is_stale() ||
        session->is_killing()) {
       dout(20) << "  " << name << dendl;
-      // Serialize K
-      CachedStackStringStream css;
-      *css << name;
 
       // Serialize V
       bufferlist bl;
-      session->info.encode(bl, mds->mdsmap->get_up_features());
+      if (!validate_and_encode_session(mds, session, bl)) {
+       derr << __func__ << ": session (" << name << ") exceeds"
+            << " sesion metadata threshold - blocklisting" << dendl;
+       to_blocklist.emplace(name);
+       continue;
+      }
+
+      // Serialize K
+      CachedStackStringStream css;
+      *css << name;
 
       // Add to RADOS op
       to_set[std::string(css->strv())] = bl;
@@ -461,6 +478,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
                        0,
                        new C_OnFinisher(new C_IO_SM_Save(this, version),
                                         mds->finisher));
+  apply_blocklist(to_blocklist);
 }
 
 void SessionMap::_save_finish(version_t v)
@@ -826,7 +844,8 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
 {
   ceph_assert(gather_bld != NULL);
 
-  std::vector<entity_name_t> write_sessions;
+  std::set<entity_name_t> to_blocklist;
+  std::map<entity_name_t, bufferlist> write_sessions;
 
   // Decide which sessions require a write
   for (std::set<entity_name_t>::iterator i = tgt_sessions.begin();
@@ -851,13 +870,24 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
       // need to pre-empt that.
       continue;
     }
+
+    // Serialize V
+    bufferlist bl;
+    if (!validate_and_encode_session(mds, session, bl)) {
+      derr << __func__ << ": session (" << session_id << ") exceeds"
+          << " sesion metadata threshold - blocklisting" << dendl;
+      to_blocklist.emplace(session_id);
+      continue;
+    }
+
     // Okay, passed all our checks, now we write
     // this session out.  The version we write
     // into the OMAP may now be higher-versioned
     // than the version in the header, but that's
     // okay because it's never a problem to have
     // an overly-fresh copy of a session.
-    write_sessions.push_back(*i);
+    write_sessions.emplace(session_id, std::move(bl));
+    session->clear_dirty_completed_requests();
   }
 
   dout(4) << __func__ << ": writing " << write_sessions.size() << dendl;
@@ -865,21 +895,15 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
   // Batch writes into mds_sessionmap_keys_per_op
   const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op;
   map<string, bufferlist> to_set;
-  for (uint32_t i = 0; i < write_sessions.size(); ++i) {
-    const entity_name_t &session_id = write_sessions[i];
-    Session *session = session_map[session_id];
-    session->clear_dirty_completed_requests();
 
+  uint32_t i = 0;
+  for (auto &[session_id, bl] : write_sessions) {
     // Serialize K
     CachedStackStringStream css;
     *css << session_id;
 
-    // Serialize V
-    bufferlist bl;
-    session->info.encode(bl, mds->mdsmap->get_up_features());
-
     // Add to RADOS op
-    to_set[css->str()] = bl;
+    to_set[css->str()] = std::move(bl);
 
     // Complete this write transaction?
     if (i == write_sessions.size() - 1
@@ -898,7 +922,10 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
                              new C_IO_SM_Save_One(this, on_safe),
                              mds->finisher));
     }
+    ++i;
   }
+
+  apply_blocklist(to_blocklist);
 }
 
 // =================
@@ -1112,6 +1139,10 @@ void SessionMap::handle_conf_change(const std::set<std::string>& changed)
     };
     apply_to_open_sessions(mut);
   }
+
+  if (changed.count("mds_session_metadata_threshold")) {
+    mds_session_metadata_threshold = g_conf().get_val<Option::size_t>("mds_session_metadata_threshold");
+  }
 }
 
 void SessionMap::update_average_session_age() {
@@ -1123,6 +1154,20 @@ void SessionMap::update_average_session_age() {
   logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime);
 }
 
+void SessionMap::apply_blocklist(const std::set<entity_name_t>& victims) {
+  if (victims.empty()) {
+    return;
+  }
+
+  C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop);
+  for (auto &victim : victims) {
+    CachedStackStringStream css;
+    mds->evict_client(victim.num(), false, g_conf()->mds_session_blocklist_on_evict, *css,
+                     gather.new_sub());
+  }
+  gather.activate();
+}
+
 int SessionFilter::parse(
     const std::vector<std::string> &args,
     std::ostream *ss)
index 067e1474cc3345c78accba5b08329c8ac2ac33ab..c1c28a8455badc9886daa3f136a488a73cb911f0 100644 (file)
@@ -594,7 +594,7 @@ protected:
 class SessionMap : public SessionMapStore {
 public:
   SessionMap() = delete;
-  explicit SessionMap(MDSRank *m) : mds(m) {}
+  explicit SessionMap(MDSRank *m);
 
   ~SessionMap() override
   {
@@ -843,6 +843,11 @@ private:
   }
 
   time avg_birth_time = clock::zero();
+
+  size_t mds_session_metadata_threshold;
+
+  bool validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl);
+  void apply_blocklist(const std::set<entity_name_t>& victims);
 };
 
 std::ostream& operator<<(std::ostream &out, const Session &s);