From f8f95ab5c4601d7413d2f49b7d7d681e0cefd786 Mon Sep 17 00:00:00 2001 From: "J. Eric Ivancich" Date: Tue, 21 May 2024 14:06:47 -0400 Subject: [PATCH] rgw: provide testing support to dynamic resharding with reduction Adds a config option rgw_reshard_debug_interval that will allow us to make the resharding algorithms run on a faster schedule by allowing one day to be simulated by a set number of seconds. Signed-off-by: J. Eric Ivancich --- src/common/options/rgw.yaml.in | 17 ++++++++++ src/rgw/driver/rados/rgw_reshard.cc | 51 ++++++++++++++++++++++++----- src/rgw/rgw_quota.cc | 29 +++++++++++++--- 3 files changed, 84 insertions(+), 13 deletions(-) diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in index d34a60ba4d7..687431986c8 100644 --- a/src/common/options/rgw.yaml.in +++ b/src/common/options/rgw.yaml.in @@ -3301,6 +3301,23 @@ options: services: - rgw min: 10 +- name: rgw_reshard_debug_interval + type: int + level: dev + desc: The number of seconds that simulate one "day" in order to debug RGW dynamic resharding. + Do *not* modify for a production cluster. + long_desc: For debugging RGW dynamic resharding, the number of seconds that are equivalent to + one simulated "day". Values less than 1 are ignored and do not change dynamic resharding behavior. + For example, during debugging if one wanted every 10 minutes to be equivalent to one day, + then this would be set to 600, the number of seconds in 10 minutes. 
+ default: -1 + services: + - rgw + with_legacy: true + see_also: + - rgw_dynamic_resharding + - rgw_reshard_thread_interval + - rgw_dynamic_resharding_reduction_wait - name: rgw_cache_expiry_interval type: uint level: advanced diff --git a/src/rgw/driver/rados/rgw_reshard.cc b/src/rgw/driver/rados/rgw_reshard.cc index b6b5ca3c4ad..d57821151fe 100644 --- a/src/rgw/driver/rados/rgw_reshard.cc +++ b/src/rgw/driver/rados/rgw_reshard.cc @@ -1398,15 +1398,21 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry, ceph::real_time when_queued = entry.time; ceph::real_time now = real_clock::now(); - // convert hours to seconds - const uint32_t reshard_reduction_wait_period_hours = + // use double so we can handle fractions + double reshard_reduction_wait_hours = uint32_t(store->ctx()->_conf.get_val("rgw_dynamic_resharding_reduction_wait")); - auto timespan = - ceph::make_timespan(reshard_reduction_wait_period_hours * 60 * 60); - // if (now < when_queued + reshard_reduction_wait_period) { + // see if we have to reduce the waiting interval due to debug + // config + int debug_interval = store->ctx()->_conf.get_val("rgw_reshard_debug_interval"); + if (debug_interval >= 1) { + constexpr int secs_per_day = 60 * 60 * 24; + reshard_reduction_wait_hours = reshard_reduction_wait_hours * debug_interval / secs_per_day; + } + + auto timespan = std::chrono::seconds(int(60 * 60 * reshard_reduction_wait_hours)); if (now < when_queued + timespan) { - // skip for now + // too early to reshard; log and skip ldpp_dout(dpp, 20) << __func__ << ": INFO: reshard reduction for bucket \"" << entry.bucket_name << "\" will not proceed until " << @@ -1415,6 +1421,17 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry, return 0; } + // only if we allow the resharding logic to continue should we log + // the fact that the reduction_wait_time was shortened due to + // debugging mode + if (debug_interval >= 1) { + ldpp_dout(dpp, 0) << "DEBUG: since the 
rgw_reshard_debug_interval is set at " << + debug_interval << " the rgw_dynamic_resharding_reduction_wait is now " << + reshard_reduction_wait_hours << " hours (" << + int(reshard_reduction_wait_hours * 60 * 60) << " seconds) and bucket \"" << + entry.bucket_name << "\" has reached the reduction wait period" << dendl; + } + // all checks passed; we can drop through and proceed } @@ -1544,6 +1561,17 @@ void RGWReshard::stop_processor() } void *RGWReshard::ReshardWorker::entry() { + const auto debug_interval = cct->_conf.get_val("rgw_reshard_debug_interval"); + double interval_factor = 1.0; + if (debug_interval >= 1) { + constexpr double secs_per_day = 60 * 60 * 24; + interval_factor = debug_interval / secs_per_day; + + ldpp_dout(this, 0) << "DEBUG: since the rgw_reshard_debug_interval is set at " << + debug_interval << " the rgw_reshard_thread_interval will be " + "multiplied by a factor of " << interval_factor << dendl; + } + do { utime_t start = ceph_clock_now(); reshard->process_all_logshards(this, null_yield); @@ -1552,14 +1580,19 @@ void *RGWReshard::ReshardWorker::entry() { break; utime_t end = ceph_clock_now(); - end -= start; + utime_t elapsed = end - start; + int secs = cct->_conf.get_val("rgw_reshard_thread_interval"); + secs = std::max(1, int(secs * interval_factor)); - if (secs <= end.sec()) + if (secs <= elapsed.sec()) { continue; // next round + } - secs -= end.sec(); + secs -= elapsed.sec(); + // note: this will likely wait for the intended period of + // time, but could wait for less std::unique_lock locker{lock}; cond.wait_for(locker, std::chrono::seconds(secs)); } while (!reshard->going_down()); diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc index af7d9734627..377e8c74701 100644 --- a/src/rgw/rgw_quota.cc +++ b/src/rgw/rgw_quota.cc @@ -358,6 +358,21 @@ class RGWOwnerStatsCache : public RGWQuotaCache { void *entry() override { ldout(cct, 20) << "BucketsSyncThread: start" << dendl; + + // rgw_reshard_debug_interval is a DEV level 
configuration + // option, so we can assume it won't change while the RGW server + // is running, so we'll handle it once before we loop; the option is declared as a signed int with default -1, so it must be held in a signed type or -1 would wrap to a huge unsigned value and pass the >= 1 check below + double sync_interval_factor = 1.0; + const int64_t debug_interval = cct->_conf->rgw_reshard_debug_interval; + if (debug_interval >= 1) { + constexpr double secs_per_day = 60 * 60 * 24; + sync_interval_factor = debug_interval / secs_per_day; + + ldout(cct, 0) << "DEBUG: since the rgw_reshard_debug_interval is set at " << + debug_interval << " the rgw_user_quota_bucket_sync_interval will be " + "multiplied by a factor of " << sync_interval_factor << dendl; + } + do { map buckets; @@ -372,14 +387,20 @@ class RGWOwnerStatsCache : public RGWQuotaCache { } } - if (stats->going_down()) + if (stats->going_down()) { break; + } + + uint64_t wait_secs = cct->_conf->rgw_user_quota_bucket_sync_interval; + wait_secs = std::max(uint64_t(1), + uint64_t(wait_secs * sync_interval_factor)); + // note: this will likely wait for the intended period of + // time, but could wait for less std::unique_lock locker{lock}; - cond.wait_for( - locker, - std::chrono::seconds(cct->_conf->rgw_user_quota_bucket_sync_interval)); + cond.wait_for(locker, std::chrono::seconds(wait_secs)); } while (!stats->going_down()); + ldout(cct, 20) << "BucketsSyncThread: done" << dendl; return NULL; -- 2.39.5