From 5ade254c15d0b9f96cdd5cd2d5f7861ef2232812 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 2 Aug 2021 11:53:37 +0800 Subject: [PATCH] mds: switch mds_lock to fair mutex The implementations of the Mutex (e.g. std::mutex in C++) do not guarantee fairness, they do not guarantee that the lock will be acquired by threads in the order that they called the lock(). In most case this works well, but in corner case in the Finisher thread in mds daemon, which may call more than one complete()s once the mdlog flushing succeeds, after the mdlog flushing is done it will call the queued complete callbacks and the Finisher thread could always successfully acquire the mds_lock in successive complete callbakcs even there may have other threads already being stuck waiting the mds_lock. This will make the other threads starve and if they are client's requests, it will cause several or even tens of seconds long delay for user's operations. This will switch the mds_lock to fair mutex and it could make sure that the all the mds_lock waiters are in FIFO order and the Finisher thread won't hold the mds_lock that long. At the same time, if the finisher thread has many completes needed to run the fair mutex could guarantee that the finisher won't be scheduled out due to fair mutex unlock() if no any other mds_lock waiter queued. Fixes: https://tracker.ceph.com/issues/51722 Signed-off-by: Xiubo Li --- src/mds/MDSDaemon.h | 5 +++-- src/mds/MDSRank.cc | 8 ++++---- src/mds/MDSRank.h | 16 +++++++++------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/mds/MDSDaemon.h b/src/mds/MDSDaemon.h index b0491f61caf70..1fe872ba9675b 100644 --- a/src/mds/MDSDaemon.h +++ b/src/mds/MDSDaemon.h @@ -25,6 +25,7 @@ #include "common/LogClient.h" #include "common/ceph_mutex.h" +#include "common/fair_mutex.h" #include "common/Timer.h" #include "include/Context.h" #include "include/types.h" @@ -72,10 +73,10 @@ class MDSDaemon : public Dispatcher { * also check the `stopping` flag. If stopping is true, you * must either do nothing and immediately drop the lock, or * never drop the lock again (i.e. call respawn()) */ - ceph::mutex mds_lock = ceph::make_mutex("MDSDaemon::mds_lock"); + ceph::fair_mutex mds_lock{"MDSDaemon::mds_lock"}; bool stopping = false; - SafeTimer timer; + class CommonSafeTimer timer; std::string gss_ktfile_client{}; int orig_argc; diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 2b40cfb338950..91d58a19df5db 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -482,9 +482,9 @@ private: MDSRank::MDSRank( mds_rank_t whoami_, - ceph::mutex &mds_lock_, + ceph::fair_mutex &mds_lock_, LogChannelRef &clog_, - SafeTimer &timer_, + CommonSafeTimer &timer_, Beacon &beacon_, std::unique_ptr& mdsmap_, Messenger *msgr, @@ -3622,9 +3622,9 @@ bool MDSRank::evict_client(int64_t session_id, MDSRankDispatcher::MDSRankDispatcher( mds_rank_t whoami_, - ceph::mutex &mds_lock_, + ceph::fair_mutex &mds_lock_, LogChannelRef &clog_, - SafeTimer &timer_, + CommonSafeTimer &timer_, Beacon &beacon_, std::unique_ptr &mdsmap_, Messenger *msgr, diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h index acdf11ee4c048..6ba2001932867 100644 --- a/src/mds/MDSRank.h +++ b/src/mds/MDSRank.h @@ -22,7 +22,9 @@ #include "common/DecayCounter.h" #include "common/LogClient.h" #include "common/Timer.h" +#include "common/fair_mutex.h" #include "common/TrackedOp.h" +#include "common/ceph_mutex.h" #include "include/common_fwd.h" @@ -164,9 +166,9 @@ class MDSRank { MDSRank( mds_rank_t whoami_, - ceph::mutex &mds_lock_, + ceph::fair_mutex &mds_lock_, LogChannelRef &clog_, - SafeTimer &timer_, + CommonSafeTimer &timer_, Beacon &beacon_, std::unique_ptr & mdsmap_, Messenger *msgr, @@ -367,7 +369,7 @@ class MDSRank { // Reference to global MDS::mds_lock, so that users of MDSRank don't // carry around references to the outer MDS, and we can substitute // a separate lock here in future potentially. - ceph::mutex &mds_lock; + ceph::fair_mutex &mds_lock; // Reference to global cluster log client, just to avoid initialising // a separate one here. @@ -376,7 +378,7 @@ class MDSRank { // Reference to global timer utility, because MDSRank and MDSDaemon // currently both use the same mds_lock, so it makes sense for them // to share a timer. - SafeTimer &timer; + CommonSafeTimer &timer; std::unique_ptr &mdsmap; /* MDSDaemon::mdsmap */ @@ -429,7 +431,7 @@ class MDSRank { void signal() {cond.notify_all();} private: MDSRank *mds; - ceph::condition_variable cond; + std::condition_variable_any cond; } progress_thread; class C_MDS_StandbyReplayRestart; @@ -657,9 +659,9 @@ class MDSRankDispatcher : public MDSRank, public md_config_obs_t public: MDSRankDispatcher( mds_rank_t whoami_, - ceph::mutex &mds_lock_, + ceph::fair_mutex &mds_lock_, LogChannelRef &clog_, - SafeTimer &timer_, + CommonSafeTimer &timer_, Beacon &beacon_, std::unique_ptr &mdsmap_, Messenger *msgr, -- 2.39.5