From: Dhairya Parmar Date: Tue, 21 Mar 2023 12:02:37 +0000 (+0530) Subject: mds: do not evict client on laggy osds X-Git-Tag: v18.2.4~418^2~8 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a8f56f01a2c72527910f07239a564806d2114017;p=ceph.git mds: do not evict client on laggy osds A client might become unresponsive/laggy due to laggy OSD(s). This change provides a way to defer client eviction in such scenarios. It also adds the helpers get_laggy_clients() and clear_laggy_clients(), and calls clear_laggy_clients() before calling the related Server methods. Fixes: https://tracker.ceph.com/issues/58023 Signed-off-by: Dhairya Parmar (cherry picked from commit 31a8d03a2ea3f65dc946c2ba0be9921a2087801f) --- diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index d7f1d41226e..d53abb21927 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -744,6 +744,7 @@ void MDSRankDispatcher::tick() // ... if (is_clientreplay() || is_active() || is_stopping()) { + server->clear_laggy_clients(); server->find_idle_sessions(); server->evict_cap_revoke_non_responders(); locker->tick(); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 102b2f468da..5855923ecfe 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1136,10 +1136,12 @@ void Server::find_idle_sessions() return; } - std::vector to_evict; - bool defer_session_stale = g_conf().get_val("mds_defer_session_stale"); const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN); + bool defer_client_eviction = + g_conf().get_val("defer_client_eviction_on_laggy_osds") + && mds->objecter->with_osdmap([](const OSDMap &map) { + return map.any_osd_laggy(); }); if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) { std::vector new_stale; @@ -1164,7 +1166,7 @@ void Server::find_idle_sessions() dout(20) << "evicting session " << session->info.inst << " since autoclose " "has arrived" << dendl; // evict session without marking it stale - 
to_evict.push_back(session); + laggy_clients.insert(session->get_client()); continue; } @@ -1193,7 +1195,7 @@ void Server::find_idle_sessions() } // do not go through stale, evict it directly. - to_evict.push_back(session); + laggy_clients.insert(session->get_client()); } else { dout(10) << "new stale session " << session->info.inst << " last renewed caps " << last_cap_renew_span << "s ago" << dendl; @@ -1209,7 +1211,7 @@ void Server::find_idle_sessions() auto m = make_message(CEPH_SESSION_STALE); mds->send_message_client(m, session); } else { - to_evict.push_back(session); + laggy_clients.insert(session->get_client()); } } } @@ -1228,11 +1230,21 @@ void Server::find_idle_sessions() << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl; break; } - to_evict.push_back(session); + laggy_clients.insert(session->get_client()); } } - for (auto session: to_evict) { + // don't evict client(s) if osds are laggy + if(defer_client_eviction && !laggy_clients.empty()) { + dout(5) << "Detected " << laggy_clients.size() + << " laggy clients, possibly due to laggy OSDs." + " Eviction is skipped until the OSDs return to normal." 
+ << dendl; + return; + } + + for (auto client: laggy_clients) { + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v)); if (session->is_importing()) { dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl; continue; @@ -1259,6 +1271,20 @@ void Server::evict_cap_revoke_non_responders() { } auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout); + // don't evict client(s) if osds are laggy + bool defer_client_eviction = + g_conf().get_val("defer_client_eviction_on_laggy_osds") + && mds->objecter->with_osdmap([](const OSDMap &map) { + return map.any_osd_laggy(); }) + && to_evict.size(); + if(defer_client_eviction) { + laggy_clients.insert(to_evict.begin(), to_evict.end()); + dout(0) << "Detected " << to_evict.size() + << " unresponsive clients, possibly due to laggy OSDs." + " Eviction is skipped until the OSDs return to normal." + << dendl; + return; + } for (auto const &client: to_evict) { mds->clog->warn() << "client id " << client << " has not responded to" diff --git a/src/mds/Server.h b/src/mds/Server.h index ef24b27fcfb..98c4aa27bbf 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -335,6 +335,13 @@ public: std::set client_reclaim_gather; + std::set get_laggy_clients() const { + return laggy_clients; + } + void clear_laggy_clients() { + laggy_clients.clear(); + } + const bufferlist& get_snap_trace(Session *session, SnapRealm *realm) const; const bufferlist& get_snap_trace(client_t client, SnapRealm *realm) const; @@ -560,6 +567,9 @@ private: size_t alternate_name_max = g_conf().get_val("mds_alternate_name_max"); size_t fscrypt_last_block_max_size = g_conf().get_val("mds_fscrypt_last_block_max_size"); + + // record laggy clients due to laggy OSDs + std::set laggy_clients; }; static inline constexpr auto operator|(Server::RecallFlags a, Server::RecallFlags b) {