From: Oguzhan Ozmen Date: Tue, 28 Apr 2026 00:09:16 +0000 (+0000) Subject: rgw/multisite: fix uninitialized LatencyMonitor average and use exponentially weighte... X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=824514e49a4acc505b23a4ce18921f5ad0fa46d9;p=ceph.git rgw/multisite: fix uninitialized LatencyMonitor average and use exponentially weighted moving average LatencyMonitor::total was declared without an initializer. Since std::chrono::duration's default constructor leaves the value indeterminate, the very first add_latency() call adds a real sample to garbage, producing a huge average that immediately triggers the "OSD cluster is overloaded" warning within seconds of RGW startup, before any actual slow ops occur. Additionally, the old implementation used a naive lifetime average (total/count) that slowed recovery from a transient slow-ops episode. Once poisoned, the average stayed high for a long time, keeping sync concurrency throttled at 1. So, also replace the naive lifetime average in LatencyMonitor with an exponentially weighted moving average (alpha=0.15). With the weighted average, after a series of normal lock operations a past spike's influence decays faster, allowing concurrency to recover without an RGW restart. Fixes: https://tracker.ceph.com/issues/76308 Signed-off-by: Oguzhan Ozmen --- diff --git a/src/rgw/driver/rados/rgw_cr_rados.h b/src/rgw/driver/rados/rgw_cr_rados.h index 1aa9f17e335f..481a7283e5ad 100644 --- a/src/rgw/driver/rados/rgw_cr_rados.h +++ b/src/rgw/driver/rados/rgw_cr_rados.h @@ -1523,20 +1523,30 @@ public: /// \warning This class is not thread safe. We do not use a mutex /// because all coroutines spawned by RGWDataSyncCR share a single thread. class LatencyMonitor { - ceph::timespan total; - std::uint64_t count = 0; + ceph::timespan avg{ceph::timespan::zero()}; + bool initialized = false; + // Weight for new samples in running average. 
Recent samples matter + // most; after ~20 new samples a past spike decays to <4%. + // Example: if avg is poisoned at 30s but real latency is 0.1s, + // after 20 good samples the avg drops to ~1.3s (fully recovered). + static constexpr double alpha = 0.15; public: LatencyMonitor() = default; void add_latency(ceph::timespan latency) { - total += latency; - ++count; + if (!initialized) { + avg = latency; + initialized = true; + } else { + avg = ceph::timespan( + static_cast<ceph::timespan::rep>( + alpha * latency.count() + (1.0 - alpha) * avg.count())); + } } ceph::timespan avg_latency() { - using namespace std::literals; - return count == 0 ? 0s : total / count; + return avg; } };