From f8f95ab5c4601d7413d2f49b7d7d681e0cefd786 Mon Sep 17 00:00:00 2001
From: "J. Eric Ivancich" <ivancich@redhat.com>
Date: Tue, 21 May 2024 14:06:47 -0400
Subject: [PATCH] rgw: provide testing support to dynamic resharding with
 reduction

Adds a config option rgw_reshard_debug_interval that will allow us to
make the resharding algorithms run on a faster schedule by allowing
one day to be simulated by a set number of seconds.

Signed-off-by: J. Eric Ivancich <ivancich@redhat.com>
---
 src/common/options/rgw.yaml.in      | 17 ++++++++++
 src/rgw/driver/rados/rgw_reshard.cc | 51 ++++++++++++++++++++++++-----
 src/rgw/rgw_quota.cc                | 29 +++++++++++++---
 3 files changed, 84 insertions(+), 13 deletions(-)

diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in
index d34a60ba4d7..687431986c8 100644
--- a/src/common/options/rgw.yaml.in
+++ b/src/common/options/rgw.yaml.in
@@ -3301,6 +3301,23 @@ options:
   services:
   - rgw
   min: 10
+- name: rgw_reshard_debug_interval
+  type: int
+  level: dev
+  desc: The number of seconds that simulate one "day" in order to debug RGW dynamic resharding.
+    Do *not* modify for a production cluster.
+  long_desc: For debugging RGW dynamic resharding, the number of seconds that are equivalent to
+    one simulated "day". Values less than 1 are ignored and do not change dynamic resharding behavior.
+    For example, to make every 10 minutes equivalent to one day while debugging, set this to 600,
+    the number of seconds in 10 minutes.
+  default: -1
+  services:
+  - rgw
+  with_legacy: true
+  see_also:
+  - rgw_dynamic_resharding
+  - rgw_reshard_thread_interval
+  - rgw_dynamic_resharding_reduction_wait
 - name: rgw_cache_expiry_interval
   type: uint
   level: advanced
diff --git a/src/rgw/driver/rados/rgw_reshard.cc b/src/rgw/driver/rados/rgw_reshard.cc
index b6b5ca3c4ad..d57821151fe 100644
--- a/src/rgw/driver/rados/rgw_reshard.cc
+++ b/src/rgw/driver/rados/rgw_reshard.cc
@@ -1398,15 +1398,21 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
     ceph::real_time when_queued = entry.time;
     ceph::real_time now = real_clock::now();
 
-    // convert hours to seconds
-    const uint32_t reshard_reduction_wait_period_hours =
+    // use double so we can handle fractions
+    double reshard_reduction_wait_hours =
       uint32_t(store->ctx()->_conf.get_val<uint64_t>("rgw_dynamic_resharding_reduction_wait"));
 
-    auto timespan =
-      ceph::make_timespan(reshard_reduction_wait_period_hours * 60 * 60);
-    // if (now < when_queued + reshard_reduction_wait_period) {
+    // see if we have to reduce the waiting interval due to debug
+    // config
+    int debug_interval = store->ctx()->_conf.get_val<int64_t>("rgw_reshard_debug_interval");
+    if (debug_interval >= 1) {
+      constexpr int secs_per_day = 60 * 60 * 24;
+      reshard_reduction_wait_hours = reshard_reduction_wait_hours * debug_interval / secs_per_day;
+    }
+
+    auto timespan = std::chrono::seconds(int(60 * 60 * reshard_reduction_wait_hours));
     if (now < when_queued + timespan) {
-      // skip for now
+      // too early to reshard; log and skip
       ldpp_dout(dpp, 20) <<  __func__ <<
 	": INFO: reshard reduction for bucket \"" <<
 	entry.bucket_name << "\" will not proceed until " <<
@@ -1415,6 +1421,17 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
       return 0;
     }
 
+    // only if we allow the resharding logic to continue should we log
+    // the fact that the reduction_wait_time was shortened due to
+    // debugging mode
+    if (debug_interval >= 1) {
+      ldpp_dout(dpp, 0) << "DEBUG: since the rgw_reshard_debug_interval is set at " <<
+	debug_interval << " the rgw_dynamic_resharding_reduction_wait is now " <<
+	reshard_reduction_wait_hours << " hours (" <<
+	int(reshard_reduction_wait_hours * 60 * 60) << " seconds) and bucket \"" <<
+	entry.bucket_name << "\" has reached the reduction wait period" << dendl;
+    }
+
     // all checks passed; we can drop through and proceed
   }
 
@@ -1544,6 +1561,17 @@ void RGWReshard::stop_processor()
 }
 
 void *RGWReshard::ReshardWorker::entry() {
+  const auto debug_interval = cct->_conf.get_val<int64_t>("rgw_reshard_debug_interval");
+  double interval_factor = 1.0;
+  if (debug_interval >= 1) {
+    constexpr double secs_per_day = 60 * 60 * 24;
+    interval_factor = debug_interval / secs_per_day;
+
+    ldpp_dout(this, 0) << "DEBUG: since the rgw_reshard_debug_interval is set at " <<
+      debug_interval << " the rgw_reshard_thread_interval will be "
+      "multiplied by a factor of " << interval_factor << dendl;
+  }
+
   do {
     utime_t start = ceph_clock_now();
     reshard->process_all_logshards(this, null_yield);
@@ -1552,14 +1580,19 @@ void *RGWReshard::ReshardWorker::entry() {
       break;
 
     utime_t end = ceph_clock_now();
-    end -= start;
+    utime_t elapsed = end - start;
+
     int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
+    secs = std::max(1, int(secs * interval_factor));
 
-    if (secs <= end.sec())
+    if (secs <= elapsed.sec()) {
       continue; // next round
+    }
 
-    secs -= end.sec();
+    secs -= elapsed.sec();
 
+    // note: wait_for() is called without a predicate, so a notify or
+    // a spurious wakeup can end the wait before secs have elapsed
     std::unique_lock locker{lock};
     cond.wait_for(locker, std::chrono::seconds(secs));
   } while (!reshard->going_down());
diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc
index af7d9734627..377e8c74701 100644
--- a/src/rgw/rgw_quota.cc
+++ b/src/rgw/rgw_quota.cc
@@ -358,6 +358,21 @@ class RGWOwnerStatsCache : public RGWQuotaCache<rgw_owner> {
 
     void *entry() override {
       ldout(cct, 20) << "BucketsSyncThread: start" << dendl;
+
+      // rgw_reshard_debug_interval is a DEV-level option, so assume it
+      // won't change while RGW runs and read it once before looping;
+      // keep it signed so the default of -1 is not seen as a huge value
+      double sync_interval_factor = 1.0;
+      const int64_t debug_interval = cct->_conf->rgw_reshard_debug_interval;
+      if (debug_interval >= 1) {
+	  constexpr double secs_per_day = 60 * 60 * 24;
+	  sync_interval_factor = debug_interval / secs_per_day;
+
+	  ldout(cct, 0) << "DEBUG: since the rgw_reshard_debug_interval is set at " <<
+	    debug_interval << " the rgw_user_quota_bucket_sync_interval will be "
+	    "multiplied by a factor of " << sync_interval_factor << dendl;
+      }
+
       do {
         map<rgw_bucket, rgw_owner> buckets;
 
@@ -372,14 +387,20 @@ class RGWOwnerStatsCache : public RGWQuotaCache<rgw_owner> {
           }
         }
 
-        if (stats->going_down())
+        if (stats->going_down()) {
           break;
+	}
+
+	uint64_t wait_secs = cct->_conf->rgw_user_quota_bucket_sync_interval;
+	wait_secs = std::max(uint64_t(1),
+			     uint64_t(wait_secs * sync_interval_factor));
 
+	// note: wait_for() is called without a predicate, so a notify or
+	// a spurious wakeup can end the wait before wait_secs have elapsed
 	std::unique_lock locker{lock};
-	cond.wait_for(
-          locker,
-          std::chrono::seconds(cct->_conf->rgw_user_quota_bucket_sync_interval));
+	cond.wait_for(locker, std::chrono::seconds(wait_secs));
       } while (!stats->going_down());
+
       ldout(cct, 20) << "BucketsSyncThread: done" << dendl;
 
       return NULL;
-- 
2.39.5