git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw/multisite: log concurrency state transitions in adj_concurrency
author Oguzhan Ozmen <oozmen@bloomberg.net>
Tue, 28 Apr 2026 19:44:02 +0000 (19:44 +0000)
committer Oguzhan Ozmen <oozmen@bloomberg.net>
Wed, 29 Apr 2026 21:23:19 +0000 (21:23 +0000)
Replace the timer-based "OSD cluster is overloaded" warning with
state-transition logging. Also log when concurrency is halved and
when it eventually recovers.

Signed-off-by: Oguzhan Ozmen <oozmen@bloomberg.net>
src/rgw/driver/rados/rgw_data_sync.h

index 00fbb2cc052261aa686be04928a9d0a3c4db7b2c..d26532f9971cf43877baf5e46d4801475f5b3205 100644 (file)
@@ -318,7 +318,10 @@ void pretty_print(const RGWDataSyncEnv* env, const S& fmt, T&& ...t) {
 /// down when latency rises.
 class LatencyConcurrencyControl : public LatencyMonitor {
   static constexpr auto dout_subsys = ceph_subsys_rgw;
-  ceph::coarse_mono_time last_warning;
+
+  enum class State { Normal, Throttled, Overloaded };
+  State state = State::Normal;
+
 public:
   CephContext* cct;
 
@@ -331,23 +334,42 @@ public:
   /// bucket), accept a number of concurrent operations to spawn and,
   /// if latency is high, cut it in half. If latency is really high,
   /// cut it to 1.
+  ///
+  /// State transitions are logged once so users can see when
+  /// concurrency is reduced and when it recovers.
   int64_t adj_concurrency(int64_t concurrency) {
     using namespace std::literals;
     auto threshold = (cct->_conf->rgw_sync_lease_period * 1s) / 12;
 
     if (avg_latency() >= 2 * threshold) [[unlikely]] {
-      auto now = ceph::coarse_mono_clock::now();
-      if (now - last_warning > 5min) {
+      if (state != State::Overloaded) [[unlikely]] {
         ldout(cct, -1)
-            << "WARNING: The OSD cluster is overloaded and struggling to "
-            << "complete ops. You need more capacity to serve this level "
-           << "of demand." << dendl;
-       last_warning = now;
+            << "WARNING: sync lock latency is critically high, reducing concurrency."
+            << " avg_latency_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(avg_latency()).count()
+            << " threshold_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(threshold).count()
+            << " concurrency=1" << dendl;
+        state = State::Overloaded;
       }
       return 1;
     } else if (avg_latency() >= threshold) [[unlikely]] {
+      if (state != State::Throttled) [[unlikely]] {
+        ldout(cct, -1)
+            << "WARNING: sync lock latency elevated, halving concurrency."
+            << " avg_latency_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(avg_latency()).count()
+            << " threshold_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(threshold).count()
+            << " concurrency=" << (concurrency / 2) << dendl;
+        state = State::Throttled;
+      }
       return concurrency / 2;
     } else [[likely]] {
+      if (state != State::Normal) [[unlikely]] {
+        ldout(cct, 1)
+            << "sync lock latency recovered, restoring full concurrency."
+            << " avg_latency_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(avg_latency()).count()
+            << " threshold_ms=" << std::chrono::duration_cast<std::chrono::milliseconds>(threshold).count()
+            << " concurrency=" << concurrency << dendl;
+        state = State::Normal;
+      }
       return concurrency;
     }
   }