From bc6814d72a9fbec9c41ed75aee2314666cfca34b Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Fri, 11 Aug 2023 04:36:52 -0400 Subject: [PATCH] mds: blocklist clients with "bloated" session metadata Buggy clients (or maybe a MDS bug) causes a huge buildup of `completed_requests` metadata in its session information. This could cause the MDS to go read-only when its flushing session metadata to the journal since the bloated metadata causes the ODSOp payload to exceed the maximum write size. Blocklist such clients so as to allow the MDS to continue servicing requests. Fixes: http://tracker.ceph.com/issues/61947 Signed-off-by: Venky Shankar --- src/common/options/mds.yaml.in | 10 +++++ src/mds/MDSRank.cc | 1 + src/mds/SessionMap.cc | 75 +++++++++++++++++++++++++++------- src/mds/SessionMap.h | 7 +++- 4 files changed, 77 insertions(+), 16 deletions(-) diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index d320ce139db..5a06326f9fc 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1577,3 +1577,13 @@ options: - mds flags: - runtime +- name: mds_session_metadata_threshold + type: size + level: advanced + desc: Evict non-advancing client-tid sessions exceeding the config size. + long_desc: Evict clients which are not advancing their request tids which causes a large buildup of session metadata (`completed_requests`) in the MDS causing the MDS to go read-only since the RADOS operation exceeds the size threashold. This config is the maximum size (in bytes) that a session metadata (encoded) can grow. + default: 16_M + services: + - mds + flags: + - runtime diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 31ea7f6710f..cc810d7e39a 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -3867,6 +3867,7 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const "mds_session_cap_acquisition_throttle", "mds_session_max_caps_throttle_ratio", "mds_symlink_recovery", + "mds_session_metadata_threshold", NULL }; return KEYS; diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc index c21ac469356..873dedbe68d 100644 --- a/src/mds/SessionMap.cc +++ b/src/mds/SessionMap.cc @@ -45,6 +45,11 @@ class SessionMapIOContext : public MDSIOContextBase }; }; +SessionMap::SessionMap(MDSRank *m) + : mds(m), + mds_session_metadata_threshold(g_conf().get_val("mds_session_metadata_threshold")) { +} + void SessionMap::register_perfcounters() { PerfCountersBuilder plb(g_ceph_context, "mds_sessions", @@ -375,6 +380,11 @@ public: }; } +bool SessionMap::validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl) { + session->info.encode(bl, mds->mdsmap->get_up_features()); + return bl.length() < mds_session_metadata_threshold; +} + void SessionMap::save(MDSContext *onsave, version_t needv) { dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl; @@ -410,6 +420,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv) dout(20) << " updating keys:" << dendl; map to_set; + std::set to_blocklist; for(std::set::iterator i = dirty_sessions.begin(); i != dirty_sessions.end(); ++i) { const entity_name_t name = *i; @@ -420,13 +431,19 @@ void SessionMap::save(MDSContext *onsave, version_t needv) session->is_stale() || session->is_killing()) { dout(20) << " " << name << dendl; - // Serialize K - CachedStackStringStream css; - *css << name; // Serialize V bufferlist bl; - session->info.encode(bl, mds->mdsmap->get_up_features()); + if (!validate_and_encode_session(mds, session, bl)) { + derr << __func__ << ": session (" << name << ") exceeds" + << " sesion metadata threshold - blocklisting" << dendl; + to_blocklist.emplace(name); + continue; + } + + // Serialize K + CachedStackStringStream css; + *css << name; // Add to RADOS op to_set[std::string(css->strv())] = bl; @@ -461,6 +478,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv) 0, new C_OnFinisher(new C_IO_SM_Save(this, version), mds->finisher)); + apply_blocklist(to_blocklist); } void SessionMap::_save_finish(version_t v) @@ -826,7 +844,8 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, { ceph_assert(gather_bld != NULL); - std::vector write_sessions; + std::set to_blocklist; + std::map write_sessions; // Decide which sessions require a write for (std::set::iterator i = tgt_sessions.begin(); @@ -851,13 +870,24 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, // need to pre-empt that. continue; } + + // Serialize V + bufferlist bl; + if (!validate_and_encode_session(mds, session, bl)) { + derr << __func__ << ": session (" << session_id << ") exceeds" + << " sesion metadata threshold - blocklisting" << dendl; + to_blocklist.emplace(session_id); + continue; + } + // Okay, passed all our checks, now we write // this session out. The version we write // into the OMAP may now be higher-versioned // than the version in the header, but that's // okay because it's never a problem to have // an overly-fresh copy of a session. - write_sessions.push_back(*i); + write_sessions.emplace(session_id, std::move(bl)); + session->clear_dirty_completed_requests(); } dout(4) << __func__ << ": writing " << write_sessions.size() << dendl; @@ -865,21 +895,15 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, // Batch writes into mds_sessionmap_keys_per_op const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op; map to_set; - for (uint32_t i = 0; i < write_sessions.size(); ++i) { - const entity_name_t &session_id = write_sessions[i]; - Session *session = session_map[session_id]; - session->clear_dirty_completed_requests(); + uint32_t i = 0; + for (auto &[session_id, bl] : write_sessions) { // Serialize K CachedStackStringStream css; *css << session_id; - // Serialize V - bufferlist bl; - session->info.encode(bl, mds->mdsmap->get_up_features()); - // Add to RADOS op - to_set[css->str()] = bl; + to_set[css->str()] = std::move(bl); // Complete this write transaction? if (i == write_sessions.size() - 1 @@ -898,7 +922,10 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, new C_IO_SM_Save_One(this, on_safe), mds->finisher)); } + ++i; } + + apply_blocklist(to_blocklist); } // ================= @@ -1112,6 +1139,10 @@ void SessionMap::handle_conf_change(const std::set& changed) }; apply_to_open_sessions(mut); } + + if (changed.count("mds_session_metadata_threshold")) { + mds_session_metadata_threshold = g_conf().get_val("mds_session_metadata_threshold"); + } } void SessionMap::update_average_session_age() { @@ -1123,6 +1154,20 @@ void SessionMap::update_average_session_age() { logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime); } +void SessionMap::apply_blocklist(const std::set& victims) { + if (victims.empty()) { + return; + } + + C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop); + for (auto &victim : victims) { + CachedStackStringStream css; + mds->evict_client(victim.num(), false, g_conf()->mds_session_blocklist_on_evict, *css, + gather.new_sub()); + } + gather.activate(); +} + int SessionFilter::parse( const std::vector &args, std::ostream *ss) diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h index 067e1474cc3..c1c28a8455b 100644 --- a/src/mds/SessionMap.h +++ b/src/mds/SessionMap.h @@ -594,7 +594,7 @@ protected: class SessionMap : public SessionMapStore { public: SessionMap() = delete; - explicit SessionMap(MDSRank *m) : mds(m) {} + explicit SessionMap(MDSRank *m); ~SessionMap() override { @@ -843,6 +843,11 @@ private: } time avg_birth_time = clock::zero(); + + size_t mds_session_metadata_threshold; + + bool validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl); + void apply_blocklist(const std::set& victims); }; std::ostream& operator<<(std::ostream &out, const Session &s); -- 2.39.5