From: Oguzhan Ozmen
Date: Tue, 28 Apr 2026 19:44:02 +0000 (+0000)
Subject: rgw/multisite: log concurrency state transitions in adj_concurrency
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1d52ad327538d18b19e27fcd7e339a11e9d333c3;p=ceph.git

rgw/multisite: log concurrency state transitions in adj_concurrency

Replace the timer-based "OSD cluster is overloaded" warning with
state-transition logging. Also, log when concurrency is halved and
eventually recovered.

Signed-off-by: Oguzhan Ozmen
---

diff --git a/src/rgw/driver/rados/rgw_data_sync.h b/src/rgw/driver/rados/rgw_data_sync.h
index 00fbb2cc0522..d26532f9971c 100644
--- a/src/rgw/driver/rados/rgw_data_sync.h
+++ b/src/rgw/driver/rados/rgw_data_sync.h
@@ -318,7 +318,10 @@ void pretty_print(const RGWDataSyncEnv* env, const S& fmt, T&& ...t) {
 /// down when latency rises.
 class LatencyConcurrencyControl : public LatencyMonitor {
   static constexpr auto dout_subsys = ceph_subsys_rgw;
-  ceph::coarse_mono_time last_warning;
+
+  enum class State { Normal, Throttled, Overloaded };
+  State state = State::Normal;
+
 public:
   CephContext* cct;
 
@@ -331,23 +334,42 @@ public:
   /// bucket), accept a number of concurrent operations to spawn and,
   /// if latency is high, cut it in half. If latency is really high,
   /// cut it to 1.
+  ///
+  /// State transitions are logged once so users can see when
+  /// concurrency is reduced and when it recovers.
   int64_t adj_concurrency(int64_t concurrency) {
     using namespace std::literals;
     auto threshold = (cct->_conf->rgw_sync_lease_period * 1s) / 12;
 
     if (avg_latency() >= 2 * threshold) [[unlikely]] {
-      auto now = ceph::coarse_mono_clock::now();
-      if (now - last_warning > 5min) {
+      if (state != State::Overloaded) [[unlikely]] {
         ldout(cct, -1)
-          << "WARNING: The OSD cluster is overloaded and struggling to "
-          << "complete ops. You need more capacity to serve this level "
-          << "of demand." << dendl;
-        last_warning = now;
+          << "WARNING: sync lock latency is critically high, reducing concurrency."
+          << " avg_latency_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(avg_latency()).count()
+          << " threshold_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(threshold).count()
+          << " concurrency=1" << dendl;
+        state = State::Overloaded;
       }
       return 1;
     } else if (avg_latency() >= threshold) [[unlikely]] {
+      if (state != State::Throttled) [[unlikely]] {
+        ldout(cct, -1)
+          << "WARNING: sync lock latency elevated, halving concurrency."
+          << " avg_latency_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(avg_latency()).count()
+          << " threshold_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(threshold).count()
+          << " concurrency=" << (concurrency / 2) << dendl;
+        state = State::Throttled;
+      }
       return concurrency / 2;
     } else [[likely]] {
+      if (state != State::Normal) [[unlikely]] {
+        ldout(cct, 1)
+          << "sync lock latency recovered, restoring full concurrency."
+          << " avg_latency_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(avg_latency()).count()
+          << " threshold_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(threshold).count()
+          << " concurrency=" << concurrency << dendl;
+        state = State::Normal;
+      }
       return concurrency;
     }
   }