From: Patrick Donnelly Date: Wed, 21 Feb 2024 15:03:14 +0000 (-0500) Subject: mds: add counter to throttle quiesce X-Git-Tag: v20.0.0~2328^2~28 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f4eec857fc8bda4980b9046cde6314fa005122c2;p=ceph.git mds: add counter to throttle quiesce So a storm of quiesce operations does not affect normal MDS operations. Signed-off-by: Patrick Donnelly --- diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 6b7ef89080a0..d25e7b52edb0 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -163,6 +163,33 @@ options: - mds flags: - runtime +- name: mds_cache_quiesce_decay_rate + type: float + level: advanced + desc: decay rate for quiescing inodes throttle + default: 1 + services: + - mds + flags: + - runtime +- name: mds_cache_quiesce_threshold + type: size + level: advanced + desc: threshold for number of inodes that can be quiesced + default: 512_K + services: + - mds + flags: + - runtime +- name: mds_cache_quiesce_sleep + type: millisecs + level: advanced + desc: sleep time for request after passing quiesce threshold + default: 200 + services: + - mds + flags: + - runtime - name: mds_max_file_recover type: uint level: advanced diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 2d2b0273e2c4..8a3dd21d57be 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -122,7 +122,10 @@ MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) : filer(m->objecter, m->finisher), stray_manager(m, purge_queue_), recovery_queue(m), - trim_counter(g_conf().get_val("mds_cache_trim_decay_rate")) + trim_counter(g_conf().get_val("mds_cache_trim_decay_rate")), + quiesce_counter(g_conf().get_val("mds_cache_quiesce_decay_rate")), + quiesce_threshold(g_conf().get_val("mds_cache_quiesce_threshold")), + quiesce_sleep(g_conf().get_val("mds_cache_quiesce_sleep")) { migrator.reset(new Migrator(mds, this)); @@ -195,6 +198,15 @@ void MDCache::handle_conf_change(const
std::set& changed, const MDS cache_health_threshold = g_conf().get_val("mds_health_cache_threshold"); if (changed.count("mds_cache_mid")) lru.lru_set_midpoint(g_conf().get_val("mds_cache_mid")); + if (changed.count("mds_cache_quiesce_decay_rate")) { + quiesce_counter = DecayCounter(g_conf().get_val("mds_cache_quiesce_decay_rate")); + } + if (changed.count("mds_cache_quiesce_threshold")) { + quiesce_threshold = g_conf().get_val("mds_cache_quiesce_threshold"); + } + if (changed.count("mds_cache_quiesce_sleep")) { + quiesce_sleep = g_conf().get_val("mds_cache_quiesce_sleep"); + } if (changed.count("mds_cache_trim_decay_rate")) { trim_counter = DecayCounter(g_conf().get_val("mds_cache_trim_decay_rate")); } @@ -13530,6 +13542,15 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr) dout(20) << __func__ << " " << *mdr << " quiescing " << *in << dendl; + if (quiesce_counter.get() > quiesce_threshold) { /* over threshold: re-queue this request after quiesce_sleep instead of quiescing now */ + dout(20) << __func__ + << " quiesce counter " << quiesce_counter + << " threshold (" << quiesce_threshold + << ") reached: scheduling retry" << dendl; + mds->timer.add_event_after(quiesce_sleep, new C_MDS_RetryRequest(this, mdr)); + return; + } + quiesce_counter.hit(); /* record this dispatch in the throttle counter */ { /* Acquire authpins on `in` to prevent migrations after this rank considers diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index ba91a24e1451..ab34d238eafb 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -1489,6 +1489,9 @@ private: uint64_t kill_shutdown_at = 0; std::map quiesced_subvolumes; + DecayCounter quiesce_counter; /* hit by dispatch_quiesce_inode; decay rate from mds_cache_quiesce_decay_rate */ + uint64_t quiesce_threshold; /* from mds_cache_quiesce_threshold; counter value above which requests are deferred */ + std::chrono::milliseconds quiesce_sleep; /* from mds_cache_quiesce_sleep; delay before a deferred request is retried */ }; class C_MDS_RetryRequest : public MDSInternalContext { diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 209de7cb86e3..c9e50d79fa20 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -4020,6 +4020,9 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const "mds_cache_memory_limit", "mds_cache_mid", "mds_cache_reservation", + "mds_cache_quiesce_decay_rate", +
"mds_cache_quiesce_threshold", + "mds_cache_quiesce_sleep", "mds_cache_trim_decay_rate", "mds_cap_acquisition_throttle_retry_request_time", "mds_cap_revoke_eviction_timeout",