rgw: provide testing support to dynamic resharding with reduction
author J. Eric Ivancich <ivancich@redhat.com>
Tue, 21 May 2024 18:06:47 +0000 (14:06 -0400)
committer J. Eric Ivancich <ivancich@redhat.com>
Fri, 31 May 2024 21:18:04 +0000 (17:18 -0400)
Adds a config option, rgw_reshard_debug_interval, that lets the
resharding algorithms run on an accelerated schedule by simulating
one day as a configurable number of seconds.

Signed-off-by: J. Eric Ivancich <ivancich@redhat.com>
src/common/options/rgw.yaml.in
src/rgw/driver/rados/rgw_reshard.cc
src/rgw/rgw_quota.cc
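
For context, a minimal standalone sketch (not part of the patch; names and values are illustrative) of the day-compression arithmetic this commit introduces: with rgw_reshard_debug_interval set to N seconds, every interval derived from a one-day unit is scaled by N / 86400.

// build: g++ -std=c++17 debug_interval_sketch.cc
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t debug_interval = 600;            // one simulated "day" = 600 seconds
  constexpr double secs_per_day = 60 * 60 * 24;  // 86400

  double interval_factor = 1.0;
  if (debug_interval >= 1) {                     // values < 1 leave behavior unchanged
    interval_factor = debug_interval / secs_per_day;
  }

  // e.g. a 2-hour reduction wait shrinks to 7200 * 600/86400 = 50 seconds
  const double wait_hours = 2.0;
  const int wait_secs = std::max(1, int(wait_hours * 60 * 60 * interval_factor));
  std::cout << "scaled reduction wait: " << wait_secs << " seconds" << std::endl;
  return 0;
}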

index d34a60ba4d7cdf42e358c4871a1a63d2d8e273d1..687431986c88380bb9e725b0189f75be4304c14e 100644 (file)
@@ -3301,6 +3301,23 @@ options:
   services:
   - rgw
   min: 10
+- name: rgw_reshard_debug_interval
+  type: int
+  level: dev
+  desc: The number of seconds that simulate one "day" in order to debug RGW dynamic resharding.
+    Do *not* modify for a production cluster.
+  long_desc: For debugging RGW dynamic resharding, the number of seconds that are equivalent to
+    one simulated "day". Values less than 1 are ignored and do not change dynamic resharding behavior.
+    For example, during debugging, to make every 10 minutes equivalent to one day,
+    set this to 600, the number of seconds in 10 minutes.
+  default: -1
+  services:
+  - rgw
+  with_legacy: true
+  see_also:
+  - rgw_dynamic_resharding
+  - rgw_reshard_thread_interval
+  - rgw_dynamic_resharding_reduction_wait
 - name: rgw_cache_expiry_interval
   type: uint
   level: advanced
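
Usage note (an assumption on my part, using the standard "ceph config set" CLI rather than anything shown in this commit): during a test one would typically enable this at runtime with something like "ceph config set global rgw_reshard_debug_interval 600", making every 600 seconds count as one simulated day, and remove the setting afterwards, since the option is explicitly not meant for production clusters.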
index b6b5ca3c4ad61e6f3cb3f7c9dab56885487c52a6..d57821151fee69a9da20dde1120258c9a3d771e8 100644 (file)
@@ -1398,15 +1398,21 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
     ceph::real_time when_queued = entry.time;
     ceph::real_time now = real_clock::now();
 
-    // convert hours to seconds
-    const uint32_t reshard_reduction_wait_period_hours =
+    // use double so we can handle fractional hours
+    double reshard_reduction_wait_hours =
       uint32_t(store->ctx()->_conf.get_val<uint64_t>("rgw_dynamic_resharding_reduction_wait"));
 
-    auto timespan =
-      ceph::make_timespan(reshard_reduction_wait_period_hours * 60 * 60);
-    // if (now < when_queued + reshard_reduction_wait_period) {
+    // see if we need to shorten the waiting interval due to the
+    // debug config
+    const int64_t debug_interval = store->ctx()->_conf.get_val<int64_t>("rgw_reshard_debug_interval");
+    if (debug_interval >= 1) {
+      constexpr int secs_per_day = 60 * 60 * 24;
+      reshard_reduction_wait_hours = reshard_reduction_wait_hours * debug_interval / secs_per_day;
+    }
+
+    auto timespan = std::chrono::seconds(int(60 * 60 * reshard_reduction_wait_hours));
     if (now < when_queued + timespan) {
-      // skip for now
+      // too early to reshard; log and skip
       ldpp_dout(dpp, 20) <<  __func__ <<
        ": INFO: reshard reduction for bucket \"" <<
        entry.bucket_name << "\" will not proceed until " <<
@@ -1415,6 +1421,17 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
       return 0;
     }
 
+    // log that the reduction wait time was shortened by debugging
+    // mode, but only once the resharding logic is actually allowed
+    // to continue
+    if (debug_interval >= 1) {
+      ldpp_dout(dpp, 0) << "DEBUG: since the rgw_reshard_debug_interval is set to " <<
+       debug_interval << " the rgw_dynamic_resharding_reduction_wait is now " <<
+       reshard_reduction_wait_hours << " hours (" <<
+       int(reshard_reduction_wait_hours * 60 * 60) << " seconds) and bucket \"" <<
+       entry.bucket_name << "\" has reached the reduction wait period" << dendl;
+    }
+
     // all checks passed; we can drop through and proceed
   }
 
@@ -1544,6 +1561,17 @@ void RGWReshard::stop_processor()
 }
 
 void *RGWReshard::ReshardWorker::entry() {
+  const auto debug_interval = cct->_conf.get_val<int64_t>("rgw_reshard_debug_interval");
+  double interval_factor = 1.0;
+  if (debug_interval >= 1) {
+    constexpr double secs_per_day = 60 * 60 * 24;
+    interval_factor = debug_interval / secs_per_day;
+
+    ldpp_dout(this, 0) << "DEBUG: since the rgw_reshard_debug_interval is set to " <<
+      debug_interval << " the rgw_reshard_thread_interval will be "
+      "multiplied by a factor of " << interval_factor << dendl;
+  }
+
   do {
     utime_t start = ceph_clock_now();
     reshard->process_all_logshards(this, null_yield);
@@ -1552,14 +1580,19 @@ void *RGWReshard::ReshardWorker::entry() {
       break;
 
     utime_t end = ceph_clock_now();
-    end -= start;
+    utime_t elapsed = end - start;
+
     int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
+    secs = std::max(1, int(secs * interval_factor));
 
-    if (secs <= end.sec())
+    if (secs <= elapsed.sec()) {
       continue; // next round
+    }
 
-    secs -= end.sec();
+    secs -= elapsed.sec();
 
+    // note: this will likely wait for the intended period of
+    // time, but could wait for less
     std::unique_lock locker{lock};
     cond.wait_for(locker, std::chrono::seconds(secs));
   } while (!reshard->going_down());
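
As a side note on the worker loop above, here is a self-contained sketch (hypothetical Worker type, std::chrono in place of Ceph's utime_t) of the timing pattern: the elapsed processing time is subtracted from the (possibly debug-scaled) interval, and the remainder is spent in cond.wait_for(), which may return early on a notify or spurious wakeup; the outer loop's going_down() check makes an early return harmless.

// build: g++ -std=c++17 -pthread worker_wait_sketch.cc
#include <algorithm>
#include <chrono>
#include <condition_variable>
#include <mutex>

// hypothetical stand-in for the worker loop's sleep logic
struct Worker {
  std::mutex lock;
  std::condition_variable cond;

  void sleep_between_rounds(int configured_secs, double interval_factor,
                            std::chrono::seconds elapsed) {
    // scale the configured interval, never dropping below one second
    int secs = std::max(1, int(configured_secs * interval_factor));
    if (secs <= elapsed.count()) {
      return;  // processing overran the interval; start the next round now
    }
    secs -= int(elapsed.count());

    // may wake early (shutdown notify or spurious wakeup); the caller's
    // loop re-checks its shutdown flag, so waiting for less is safe
    std::unique_lock locker{lock};
    cond.wait_for(locker, std::chrono::seconds(secs));
  }
};

int main() {
  Worker w;
  // e.g. a 600s interval scaled by 600/86400, with 2s spent processing
  w.sleep_between_rounds(600, 600.0 / 86400, std::chrono::seconds(2));
  return 0;
}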
index af7d973462759410b9ffc4c816944b6027af5568..377e8c7470147e66eab18d12baa3920d3733c7cd 100644 (file)
@@ -358,6 +358,21 @@ class RGWOwnerStatsCache : public RGWQuotaCache<rgw_owner> {
 
     void *entry() override {
       ldout(cct, 20) << "BucketsSyncThread: start" << dendl;
+
+      // rgw_reshard_debug_interval is a DEV-level configuration
+      // option, so we can assume it won't change while the RGW server
+      // is running; we therefore read it once before entering the loop
+      double sync_interval_factor = 1.0;
+      const int64_t debug_interval = cct->_conf->rgw_reshard_debug_interval;
+      if (debug_interval >= 1) {
+         constexpr double secs_per_day = 60 * 60 * 24;
+         sync_interval_factor = debug_interval / secs_per_day;
+
+         ldout(cct, 0) << "DEBUG: since the rgw_reshard_debug_interval is set to " <<
+           debug_interval << " the rgw_user_quota_bucket_sync_interval will be "
+           "multiplied by a factor of " << sync_interval_factor << dendl;
+      }
+
       do {
         map<rgw_bucket, rgw_owner> buckets;
 
@@ -372,14 +387,20 @@ class RGWOwnerStatsCache : public RGWQuotaCache<rgw_owner> {
           }
         }
 
-        if (stats->going_down())
+        if (stats->going_down()) {
           break;
+       }
+
+       uint64_t wait_secs = cct->_conf->rgw_user_quota_bucket_sync_interval;
+       wait_secs = std::max(uint64_t(1),
+                            uint64_t(wait_secs * sync_interval_factor));
 
+       // note: this will likely wait for the intended period of
+       // time, but could wait for less
        std::unique_lock locker{lock};
-       cond.wait_for(
-          locker,
-          std::chrono::seconds(cct->_conf->rgw_user_quota_bucket_sync_interval));
+       cond.wait_for(locker, std::chrono::seconds(wait_secs));
       } while (!stats->going_down());
+
       ldout(cct, 20) << "BucketsSyncThread: done" << dendl;
 
       return NULL;
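
One detail worth calling out in the rgw_quota.cc hunk: the option defaults to -1, so it must be read into a signed type. A standalone sketch (illustrative values only) of what goes wrong if the value is stored in a uint64_t instead:

// build: g++ -std=c++17 signed_check_sketch.cc
#include <cstdint>
#include <iostream>

int main() {
  const int64_t configured = -1;                 // the option's default value
  const uint64_t wrapped = uint64_t(configured); // wraps to 18446744073709551615

  std::cout << std::boolalpha
            << "signed   check (debug_interval >= 1): " << (configured >= 1) << '\n'  // false
            << "unsigned check (debug_interval >= 1): " << (wrapped >= 1)    << '\n'; // true!
  return 0;
}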