git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw: reshard improvements 25003/head
author J. Eric Ivancich <ivancich@redhat.com>
Fri, 9 Nov 2018 00:40:48 +0000 (19:40 -0500)
committer J. Eric Ivancich <ivancich@redhat.com>
Fri, 9 Nov 2018 15:21:56 +0000 (10:21 -0500)
Improve error log message when an expired reshard lock is renewed.

Add two new configurable options to manage resharding:
* rgw_reshard_batch_size : number of reshard entries to batch together
  before sending the operations to the CLS back-end.
* rgw_reshard_max_aio : maximum number of outstanding asynchronous i/o
  operations to allow at a time.

Alter rgw_reshard_bucket_lock duration default from 2 minutes to 6
minutes.

Add documentation, minimum values, tags, and service to a few rgw
reshard configuration options. Change some rgw_reshard_* options from
LEVEL_DEV to LEVEL_ADVANCED.

Signed-off-by: J. Eric Ivancich <ivancich@redhat.com>
src/common/options.cc
src/rgw/rgw_reshard.cc

index 2a090c4cac601e6b699daf8c535ae0c9583ac34f..8865ca42390d9263c5f4ce8fe576d6d744cfd668 100644 (file)
@@ -6261,13 +6261,32 @@ std::vector<Option> get_rgw_options() {
     .set_default(true)
     .set_description("Enable stats on bucket listing in Swift"),
 
-    Option("rgw_reshard_num_logs", Option::TYPE_INT, Option::LEVEL_DEV)
+    Option("rgw_reshard_num_logs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(16)
-    .set_description(""),
+    .set_min(1)
+    .set_description("")
+    .add_service("rgw"),
 
-    Option("rgw_reshard_bucket_lock_duration", Option::TYPE_INT, Option::LEVEL_DEV)
-    .set_default(120)
-    .set_description(""),
+    Option("rgw_reshard_bucket_lock_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(360)
+    .set_min(30)
+    .set_description("Number of seconds the timeout on the reshard locks (bucket reshard lock and reshard log lock) are set to. As a reshard proceeds these locks can be renewed/extended. If too short, reshards cannot complete and will fail, causing a future reshard attempt. If too long a hung or crashed reshard attempt will keep the bucket locked for an extended period, not allowing RGW to detect the failed reshard attempt and recover.")
+    .add_tag("performance")
+    .add_service("rgw"),
+    
+    Option("rgw_reshard_batch_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(64)
+    .set_min(8)
+    .set_description("Number of reshard entries to batch together before sending the operations to the CLS back-end")
+    .add_tag("performance")
+    .add_service("rgw"),
+
+    Option("rgw_reshard_max_aio", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(128)
+    .set_min(16)
+    .set_description("Maximum number of outstanding asynchronous I/O operations to allow at a time during resharding")
+    .add_tag("performance")
+    .add_service("rgw"),
 
     Option("rgw_trust_forwarded_https", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
@@ -6357,7 +6376,8 @@ std::vector<Option> get_rgw_options() {
 
     Option("rgw_reshard_thread_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(10_min)
-    .set_description(""),
+    .set_min(10_min)
+    .set_description("Number of seconds between processing of reshard log entries"),
 
     Option("rgw_cache_expiry_interval", Option::TYPE_UINT,
           Option::LEVEL_ADVANCED)
index a9f7d900e8b1a2ded09810cc319550fae42cf41b..4c84d8dff4be34b96200ce4f367c23fa592d4642 100644 (file)
@@ -2,6 +2,7 @@
 // vim: ts=8 sw=2 smarttab
 
 #include <limits>
+#include <sstream>
 
 #include "rgw_rados.h"
 #include "rgw_bucket.h"
@@ -21,10 +22,6 @@ const string reshard_lock_name = "reshard_process";
 const string bucket_instance_lock_name = "bucket_instance_lock";
 
 
-#define RESHARD_SHARD_WINDOW 64
-#define RESHARD_MAX_AIO 128
-
-
 class BucketReshardShard {
   RGWRados *store;
   const RGWBucketInfo& bucket_info;
@@ -33,6 +30,8 @@ class BucketReshardShard {
   vector<rgw_cls_bi_entry> entries;
   map<uint8_t, rgw_bucket_category_stats> stats;
   deque<librados::AioCompletion *>& aio_completions;
+  uint64_t max_aio_completions;
+  uint64_t reshard_shard_batch_size;
 
   int wait_next_completion() {
     librados::AioCompletion *c = aio_completions.front();
@@ -52,7 +51,7 @@ class BucketReshardShard {
   }
 
   int get_completion(librados::AioCompletion **c) {
-    if (aio_completions.size() >= RESHARD_MAX_AIO) {
+    if (aio_completions.size() >= max_aio_completions) {
       int ret = wait_next_completion();
       if (ret < 0) {
         return ret;
@@ -74,6 +73,11 @@ public:
   {
     num_shard = (bucket_info.num_shards > 0 ? _num_shard : -1);
     bs.init(bucket_info.bucket, num_shard, nullptr /* no RGWBucketInfo */);
+
+    max_aio_completions =
+      store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_max_aio");
+    reshard_shard_batch_size =
+      store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_batch_size");
   }
 
   int get_num_shard() {
@@ -90,7 +94,7 @@ public:
       target.total_size_rounded += entry_stats.total_size_rounded;
       target.actual_size += entry_stats.actual_size;
     }
-    if (entries.size() >= RESHARD_SHARD_WINDOW) {
+    if (entries.size() >= reshard_shard_batch_size) {
       int ret = flush();
       if (ret < 0) {
         return ret;
@@ -401,7 +405,8 @@ RGWBucketReshardLock::RGWBucketReshardLock(RGWRados* _store,
   ephemeral(_ephemeral),
   internal_lock(reshard_lock_name)
 {
-  const int lock_dur_secs = store->ctx()->_conf->rgw_reshard_bucket_lock_duration;
+  const int lock_dur_secs = store->ctx()->_conf.get_val<uint64_t>(
+    "rgw_reshard_bucket_lock_duration");
   duration = std::chrono::seconds(lock_dur_secs);
 
 #define COOKIE_LEN 16
@@ -450,8 +455,14 @@ int RGWBucketReshardLock::renew(const Clock::time_point& now) {
     ret = internal_lock.lock_exclusive(&store->reshard_pool_ctx, lock_oid);
   }
   if (ret < 0) { /* expired or already locked by another processor */
+    std::stringstream error_s;
+    if (-ENOENT == ret) {
+      error_s << "ENOENT (lock expired or never initially locked)";
+    } else {
+      error_s << ret << " (" << cpp_strerror(-ret) << ")";
+    }
     ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
-      lock_oid << " with " << cpp_strerror(-ret) << dendl;
+      lock_oid << " with error " << error_s.str() << dendl;
     return ret;
   }
   internal_lock.set_must_renew(false);
@@ -1093,7 +1104,7 @@ void *RGWReshard::ReshardWorker::entry() {
 
     utime_t end = ceph_clock_now();
     end -= start;
-    int secs = cct->_conf->rgw_reshard_thread_interval;
+    int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
 
     if (secs <= end.sec())
       continue; // next round