rgw: provide testing support to dynamic resharding with reduction
author J. Eric Ivancich <ivancich@redhat.com>
Tue, 21 May 2024 18:06:47 +0000 (14:06 -0400)
committer J. Eric Ivancich <ivancich@redhat.com>
Fri, 31 May 2024 21:18:04 +0000 (17:18 -0400)
Adds a config option, rgw_reshard_debug_interval, that lets the
resharding algorithms run on an accelerated schedule by simulating
one day as a configurable number of seconds.

Signed-off-by: J. Eric Ivancich <ivancich@redhat.com>
src/common/options/rgw.yaml.in
src/rgw/driver/rados/rgw_reshard.cc
src/rgw/rgw_quota.cc
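
For context, a minimal standalone sketch (not part of the patch; names and values are illustrative) of the day-compression arithmetic this commit introduces: with rgw_reshard_debug_interval set to N seconds, every interval derived from a one-day unit is scaled by N / 86400.

// build: g++ -std=c++17 debug_interval_sketch.cc
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t debug_interval = 600;            // one simulated "day" = 600 seconds
  constexpr double secs_per_day = 60 * 60 * 24;  // 86400

  double interval_factor = 1.0;
  if (debug_interval >= 1) {                     // values < 1 leave behavior unchanged
    interval_factor = debug_interval / secs_per_day;
  }

  // e.g. a 2-hour reduction wait shrinks to 7200 * 600/86400 = 50 seconds
  const double wait_hours = 2.0;
  const int wait_secs = std::max(1, int(wait_hours * 60 * 60 * interval_factor));
  std::cout << "scaled reduction wait: " << wait_secs << " seconds" << std::endl;
  return 0;
}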

index d34a60ba4d7cdf42e358c4871a1a63d2d8e273d1..687431986c88380bb9e725b0189f75be4304c14e 100644 (file)
@@ -3301,6 +3301,23 @@ options:
   services:
   - rgw
   min: 10
+- name: rgw_reshard_debug_interval
+  type: int
+  level: dev
+  desc: The number of seconds that simulate one "day" in order to debug RGW dynamic resharding.
+    Do *not* modify for a production cluster.
+  long_desc: For debugging RGW dynamic resharding, the number of seconds that are equivalent to
+    one simulated "day". Values less than 1 are ignored and do not change dynamic resharding behavior.
+    For example, during debugging, to make every 10 minutes equivalent to one day,
+    set this to 600, the number of seconds in 10 minutes.
+  default: -1
+  services:
+  - rgw
+  with_legacy: true
+  see_also:
+  - rgw_dynamic_resharding
+  - rgw_reshard_thread_interval
+  - rgw_dynamic_resharding_reduction_wait
 - name: rgw_cache_expiry_interval
   type: uint
   level: advanced
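
Usage note (an assumption on my part, using the standard "ceph config set" CLI rather than anything shown in this commit): during a test one would typically enable this at runtime with something like "ceph config set global rgw_reshard_debug_interval 600", making every 600 seconds count as one simulated day, and remove the setting afterwards, since the option is explicitly not meant for production clusters.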
index b6b5ca3c4ad61e6f3cb3f7c9dab56885487c52a6..d57821151fee69a9da20dde1120258c9a3d771e8 100644 (file)
@@ -1398,15 +1398,21 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
     ceph::real_time when_queued = entry.time;
     ceph::real_time now = real_clock::now();
 
-    // convert hours to seconds
-    const uint32_t reshard_reduction_wait_period_hours =
+    // use double so we can handle fractional hours
+    double reshard_reduction_wait_hours =
       uint32_t(store->ctx()->_conf.get_val<uint64_t>("rgw_dynamic_resharding_reduction_wait"));
 
-    auto timespan =
-      ceph::make_timespan(reshard_reduction_wait_period_hours * 60 * 60);
-    // if (now < when_queued + reshard_reduction_wait_period) {
+    // see if we need to shorten the waiting interval due to the
+    // debug config
+    const int64_t debug_interval = store->ctx()->_conf.get_val<int64_t>("rgw_reshard_debug_interval");
+    if (debug_interval >= 1) {
+      constexpr int secs_per_day = 60 * 60 * 24;
+      reshard_reduction_wait_hours = reshard_reduction_wait_hours * debug_interval / secs_per_day;
+    }
+
+    auto timespan = std::chrono::seconds(int(60 * 60 * reshard_reduction_wait_hours));
     if (now < when_queued + timespan) {
-      // skip for now
+      // too early to reshard; log and skip
       ldpp_dout(dpp, 20) <<  __func__ <<
        ": INFO: reshard reduction for bucket \"" <<
        entry.bucket_name << "\" will not proceed until " <<
@@ -1415,6 +1421,17 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
       return 0;
     }
 
+    // log that the reduction wait time was shortened by debugging
+    // mode, but only once the resharding logic is actually allowed
+    // to continue
+    if (debug_interval >= 1) {
+      ldpp_dout(dpp, 0) << "DEBUG: since the rgw_reshard_debug_interval is set to " <<
+       debug_interval << " the rgw_dynamic_resharding_reduction_wait is now " <<
+       reshard_reduction_wait_hours << " hours (" <<
+       int(reshard_reduction_wait_hours * 60 * 60) << " seconds) and bucket \"" <<
+       entry.bucket_name << "\" has reached the reduction wait period" << dendl;
+    }
+
     // all checks passed; we can drop through and proceed
   }
 
@@ -1544,6 +1561,17 @@ void RGWReshard::stop_processor()
 }
 
 void *RGWReshard::ReshardWorker::entry() {
+  const auto debug_interval = cct->_conf.get_val<int64_t>("rgw_reshard_debug_interval");
+  double interval_factor = 1.0;
+  if (debug_interval >= 1) {
+    constexpr double secs_per_day = 60 * 60 * 24;
+    interval_factor = debug_interval / secs_per_day;
+
+    ldpp_dout(this, 0) << "DEBUG: since the rgw_reshard_debug_interval is set to " <<
+      debug_interval << " the rgw_reshard_thread_interval will be "
+      "multiplied by a factor of " << interval_factor << dendl;
+  }
+
   do {
     utime_t start = ceph_clock_now();
     reshard->process_all_logshards(this, null_yield);
@@ -1552,14 +1580,19 @@ void *RGWReshard::ReshardWorker::entry() {
       break;
 
     utime_t end = ceph_clock_now();
-    end -= start;
+    utime_t elapsed = end - start;
+
     int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
+    secs = std::max(1, int(secs * interval_factor));
 
-    if (secs <= end.sec())
+    if (secs <= elapsed.sec()) {
       continue; // next round
+    }
 
-    secs -= end.sec();
+    secs -= elapsed.sec();
 
+    // note: this will likely wait for the intended period of
+    // time, but could wait for less
     std::unique_lock locker{lock};
     cond.wait_for(locker, std::chrono::seconds(secs));
   } while (!reshard->going_down());
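
As a side note on the worker loop above, here is a self-contained sketch (hypothetical Worker type, std::chrono in place of Ceph's utime_t) of the timing pattern: the elapsed processing time is subtracted from the (possibly debug-scaled) interval, and the remainder is spent in cond.wait_for(), which may return early on a notify or spurious wakeup; the outer loop's going_down() check makes an early return harmless.

// build: g++ -std=c++17 -pthread worker_wait_sketch.cc
#include <algorithm>
#include <chrono>
#include <condition_variable>
#include <mutex>

// hypothetical stand-in for the worker loop's sleep logic
struct Worker {
  std::mutex lock;
  std::condition_variable cond;

  void sleep_between_rounds(int configured_secs, double interval_factor,
                            std::chrono::seconds elapsed) {
    // scale the configured interval, never dropping below one second
    int secs = std::max(1, int(configured_secs * interval_factor));
    if (secs <= elapsed.count()) {
      return;  // processing overran the interval; start the next round now
    }
    secs -= int(elapsed.count());

    // may wake early (shutdown notify or spurious wakeup); the caller's
    // loop re-checks its shutdown flag, so waiting for less is safe
    std::unique_lock locker{lock};
    cond.wait_for(locker, std::chrono::seconds(secs));
  }
};

int main() {
  Worker w;
  // e.g. a 600s interval scaled by 600/86400, with 2s spent processing
  w.sleep_between_rounds(600, 600.0 / 86400, std::chrono::seconds(2));
  return 0;
}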
index af7d973462759410b9ffc4c816944b6027af5568..377e8c7470147e66eab18d12baa3920d3733c7cd 100644 (file)
@@ -358,6 +358,21 @@ class RGWOwnerStatsCache : public RGWQuotaCache<rgw_owner> {
 
     void *entry() override {
       ldout(cct, 20) << "BucketsSyncThread: start" << dendl;
+
+      // rgw_reshard_debug_interval is a DEV-level configuration
+      // option, so we can assume it won't change while the RGW server
+      // is running; we therefore read it once before entering the loop
+      double sync_interval_factor = 1.0;
+      const int64_t debug_interval = cct->_conf->rgw_reshard_debug_interval;
+      if (debug_interval >= 1) {
+         constexpr double secs_per_day = 60 * 60 * 24;
+         sync_interval_factor = debug_interval / secs_per_day;
+
+         ldout(cct, 0) << "DEBUG: since the rgw_reshard_debug_interval is set to " <<
+           debug_interval << " the rgw_user_quota_bucket_sync_interval will be "
+           "multiplied by a factor of " << sync_interval_factor << dendl;
+      }
+
       do {
         map<rgw_bucket, rgw_owner> buckets;
 
@@ -372,14 +387,20 @@ class RGWOwnerStatsCache : public RGWQuotaCache<rgw_owner> {
           }
         }
 
-        if (stats->going_down())
+        if (stats->going_down()) {
           break;
+       }
+
+       uint64_t wait_secs = cct->_conf->rgw_user_quota_bucket_sync_interval;
+       wait_secs = std::max(uint64_t(1),
+                            uint64_t(wait_secs * sync_interval_factor));
 
+       // note: this will likely wait for the intended period of
+       // time, but could wait for less
        std::unique_lock locker{lock};
-       cond.wait_for(
-          locker,
-          std::chrono::seconds(cct->_conf->rgw_user_quota_bucket_sync_interval));
+       cond.wait_for(locker, std::chrono::seconds(wait_secs));
       } while (!stats->going_down());
+
       ldout(cct, 20) << "BucketsSyncThread: done" << dendl;
 
       return NULL;
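
One detail worth calling out in the rgw_quota.cc hunk: the option defaults to -1, so it must be read into a signed type. A standalone sketch (illustrative values only) of what goes wrong if the value is stored in a uint64_t instead:

// build: g++ -std=c++17 signed_check_sketch.cc
#include <cstdint>
#include <iostream>

int main() {
  const int64_t configured = -1;                 // the option's default value
  const uint64_t wrapped = uint64_t(configured); // wraps to 18446744073709551615

  std::cout << std::boolalpha
            << "signed   check (debug_interval >= 1): " << (configured >= 1) << '\n'  // false
            << "unsigned check (debug_interval >= 1): " << (wrapped >= 1)    << '\n'; // true!
  return 0;
}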