rgw: allow per-bucket minimum number of shards

author J. Eric Ivancich <ivancich@redhat.com>

Wed, 15 Jan 2025 16:26:59 +0000 (11:26 -0500)

committer J. Eric Ivancich <ivancich@redhat.com>

Fri, 28 Feb 2025 19:31:34 +0000 (14:31 -0500)
author J. Eric Ivancich <ivancich@redhat.com>
Wed, 15 Jan 2025 16:26:59 +0000 (11:26 -0500)
committer J. Eric Ivancich <ivancich@redhat.com>
Fri, 28 Feb 2025 19:31:34 +0000 (14:31 -0500)
diff --git a/src/rgw/driver/rados/rgw_bucket.cc b/src/rgw/driver/rados/rgw_bucket.cc

index e460739ac9f6ff3723c562783f8084d0db54dd02..e444521d395c94c68f17eef8a1f360fbc9373e9f 100644 (file)
--- a/src/rgw/driver/rados/rgw_bucket.cc
+++ b/src/rgw/driver/rados/rgw_bucket.cc
@@ -2815,6 +2815,7 @@ void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
  
    if (shards) {
      layout.current_index.layout.normal.num_shards = *shards;
+    layout.current_index.layout.normal.min_num_shards = *shards;
    } else if (cct->_conf->rgw_override_bucket_index_max_shards > 0) {
      layout.current_index.layout.normal.num_shards =
        cct->_conf->rgw_override_bucket_index_max_shards;
diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc

index 5eb2f9d0a075a51f33d680a4c042ad17952cc468..959e2f7d9943a9f67d72fe69e0e33be5cefa7bad 100644 (file)
--- a/src/rgw/driver/rados/rgw_rados.cc
+++ b/src/rgw/driver/rados/rgw_rados.cc
@@ -10823,6 +10823,7 @@ int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBuck
  void RGWRados::calculate_preferred_shards(const DoutPrefixProvider* dpp,
                                           const uint64_t num_objs,
                                           const uint32_t num_source_shards,
+                                         const uint32_t min_layout_shards,
                                           bool& need_resharding,
                                           uint32_t* suggested_num_shards)
  {
@@ -10834,6 +10835,7 @@ void RGWRados::calculate_preferred_shards(const DoutPrefixProvider* dpp,
  
    RGWBucketReshard::calculate_preferred_shards(dpp,
                                                max_dynamic_shards,
+                                              min_layout_shards,
                                                max_objs_per_shard,
                                                is_multisite,
                                                num_objs,
@@ -10867,8 +10869,11 @@ int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
    uint32_t suggested_num_shards = 0;
    const uint32_t num_source_shards =
      rgw::current_num_shards(bucket_info.layout);
+  const uint32_t min_layout_shards =
+    rgw::current_min_layout_shards(bucket_info.layout);
  
-  calculate_preferred_shards(dpp, num_objs, num_source_shards,
+  calculate_preferred_shards(dpp, num_objs,
+                            num_source_shards, min_layout_shards,
                              need_resharding, &suggested_num_shards);
    if (! need_resharding) {
      return 0;
diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h

index 868e5c95eacb62ef7499291712667f26950bb3d7..22bc71690d194cd2d15a4a344fd6d3cf46b20637 100644 (file)
--- a/src/rgw/driver/rados/rgw_rados.h
+++ b/src/rgw/driver/rados/rgw_rados.h
@@ -1630,6 +1630,7 @@ public:
    void calculate_preferred_shards(const DoutPrefixProvider* dpp,
                                   const uint64_t num_objs,
                                   const uint32_t current_shard_count,
+                                 const uint32_t min_layout_shards,
                                   bool& need_resharding,
                                   uint32_t* suggested_num_shard_count = nullptr);
  
diff --git a/src/rgw/driver/rados/rgw_reshard.cc b/src/rgw/driver/rados/rgw_reshard.cc

index 82080c4a06f278f2db52f3f8a87a7c143f8e2e1e..429b209bbac271ffe8fce4ed544c02cc7b152e12 100644 (file)
--- a/src/rgw/driver/rados/rgw_reshard.cc
+++ b/src/rgw/driver/rados/rgw_reshard.cc
@@ -69,23 +69,13 @@ const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
  };
  
  
-uint32_t RGWBucketReshard::get_prime_shard_count(
-  uint32_t shard_count,
-  uint32_t max_dynamic_shards,
-  uint32_t min_dynamic_shards)
-{
+uint32_t RGWBucketReshard::nearest_prime(uint32_t shard_count)  {
    uint32_t prime_shard_count =
      get_prime_shards_greater_or_equal(shard_count);
  
    // if we cannot find a larger prime number, then just use what was
    // passed in
-  if (! prime_shard_count) {
-    prime_shard_count = shard_count;
-  }
-
-  // keep within min/max bounds
-  return std::min(max_dynamic_shards,
-                 std::max(min_dynamic_shards, prime_shard_count));
+  return prime_shard_count ? prime_shard_count : shard_count;
  }
  
  
@@ -96,6 +86,7 @@ uint32_t RGWBucketReshard::get_prime_shard_count(
  void RGWBucketReshard::calculate_preferred_shards(
    const DoutPrefixProvider* dpp,
    const uint32_t max_dynamic_shards,
+  const uint32_t min_layout_shards,
    const uint64_t max_objs_per_shard,
    const bool is_multisite,
    const uint64_t num_objs,
@@ -139,10 +130,13 @@ void RGWBucketReshard::calculate_preferred_shards(
    }
  
    if (prefer_prime) {
-    calculated_num_shards = get_prime_shard_count(
-      calculated_num_shards, max_dynamic_shards, min_dynamic_shards);
+    calculated_num_shards = nearest_prime(calculated_num_shards);
    }
  
+  calculated_num_shards =
+    std::min(max_dynamic_shards,
+            std::max({ calculated_num_shards, min_dynamic_shards, min_layout_shards }));
+
    ldpp_dout(dpp, 20) << __func__ << ": reshard " << verb <<
      " suggested; current average (objects/shard) is " <<
      float(num_objs) / current_num_shards << ", which is not within " <<
@@ -461,6 +455,7 @@ static int init_target_layout(rgw::sal::RadosStore* store,
    rgw::bucket_index_layout_generation target;
    target.layout.type = rgw::BucketIndexType::Normal;
    target.layout.normal.num_shards = new_num_shards;
+  target.layout.normal.min_num_shards = current.layout.normal.min_num_shards;
    target.gen = current.gen + 1;
  
    if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) {
@@ -1256,7 +1251,7 @@ int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& curr
      // block the client op and complete the resharding
      ceph_assert(bucket_info.layout.resharding == rgw::BucketReshardState::InProgress);
      ret = reshard_process(current, max_op_entries, target_shards_mgr, verbose_json_out, out,
-                              formatter, bucket_info.layout.resharding, dpp, y);
+                         formatter, bucket_info.layout.resharding, dpp, y);
      if (ret < 0) {
        ldpp_dout(dpp, 0) << __func__ << ": failed in progress state of reshard ret = " << ret << dendl;
        return ret;
@@ -1637,6 +1632,9 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
      ret = store->getRados()->get_bucket_stats(dpp, bucket_info,
                                               bucket_info.layout.current_index,
                                               -1, nullptr, nullptr, stats, nullptr, nullptr);
+    if (ret < 0) {
+      return clean_up("unable to access buckets current stats");
+    }
  
      // determine current number of bucket entries across shards
      uint64_t num_entries = 0;
@@ -1645,7 +1643,9 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
      }
  
      const uint32_t current_shard_count =
-      rgw::num_shards(bucket_info.get_current_index().layout.normal);
+      rgw::current_num_shards(bucket_info.layout);
+    const uint32_t min_layout_shards =
+      rgw::current_min_layout_shards(bucket_info.layout);
  
      bool needs_resharding { false };
      uint32_t suggested_shard_count { 0 };
@@ -1653,7 +1653,7 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
      // needed to perform the calculation before calling
      // calculating_preferred_shards() in this class
      store->getRados()->calculate_preferred_shards(
-      dpp, num_entries, current_shard_count,
+      dpp, num_entries, current_shard_count, min_layout_shards,
        needs_resharding, &suggested_shard_count);
  
      // if we no longer need resharding or currently need to expand
@@ -1711,7 +1711,6 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
    }
  
    // all checkes passed; we can reshard...
-
    RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr);
  
    ReshardFaultInjector f; // no fault injected
diff --git a/src/rgw/driver/rados/rgw_reshard.h b/src/rgw/driver/rados/rgw_reshard.h

index 3d056e50f468d9860a4fcef9536ed710b75e7fce..bdd7f67873540a443609eff441d4f6553307eb24 100644 (file)
--- a/src/rgw/driver/rados/rgw_reshard.h
+++ b/src/rgw/driver/rados/rgw_reshard.h
@@ -160,14 +160,12 @@ public:
      }
    }
  
-  // returns a preferred number of shards given a calculated number of
-  // shards based on max_dynamic_shards and the list of prime values
-  static uint32_t get_prime_shard_count(uint32_t suggested_shards,
-                                       uint32_t max_dynamic_shards,
-                                       uint32_t min_dynamic_shards);
+  // returns the smallest listed prime >= suggested_shards, or suggested_shards itself if no such prime is found
+  static uint32_t nearest_prime(uint32_t suggested_shards);
  
    static void calculate_preferred_shards(const DoutPrefixProvider* dpp,
                                          const uint32_t max_dynamic_shards,
+                                        const uint32_t min_layout_shards,
                                          const uint64_t max_objs_per_shard,
                                          const bool is_multisite,
                                          const uint64_t num_objs,
diff --git a/src/rgw/rgw_bucket_layout.cc b/src/rgw/rgw_bucket_layout.cc

index 1f8db396a0d13487a96cd3ba825390ca80bb3853..10ff7200eaa30045078cf6e47f755c420b13214b 100644 (file)
--- a/src/rgw/rgw_bucket_layout.cc
+++ b/src/rgw/rgw_bucket_layout.cc
@@ -81,16 +81,20 @@ void decode_json_obj(BucketHashType& t, JSONObj *obj)
  // bucket_index_normal_layout
  void encode(const bucket_index_normal_layout& l, bufferlist& bl, uint64_t f)
  {
-  ENCODE_START(1, 1, bl);
+  ENCODE_START(2, 1, bl);
    encode(l.num_shards, bl);
    encode(l.hash_type, bl);
+  encode(l.min_num_shards, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bucket_index_normal_layout& l, bufferlist::const_iterator& bl)
  {
-  DECODE_START(1, bl);
+  DECODE_START(2, bl);
    decode(l.num_shards, bl);
    decode(l.hash_type, bl);
+  if (struct_v >= 2) {
+    decode(l.min_num_shards, bl);
+  }
    DECODE_FINISH(bl);
  }
  void encode_json_impl(const char *name, const bucket_index_normal_layout& l, ceph::Formatter *f)
@@ -98,12 +102,16 @@ void encode_json_impl(const char *name, const bucket_index_normal_layout& l, cep
    f->open_object_section(name);
    encode_json("num_shards", l.num_shards, f);
    encode_json("hash_type", l.hash_type, f);
+  encode_json("min_num_shards", l.min_num_shards, f);
    f->close_section();
  }
  void decode_json_obj(bucket_index_normal_layout& l, JSONObj *obj)
  {
    JSONDecoder::decode_json("num_shards", l.num_shards, obj);
    JSONDecoder::decode_json("hash_type", l.hash_type, obj);
+
+  // if not set in json, set to default value of 1
+  JSONDecoder::decode_json("min_num_shards", l.min_num_shards, obj, 1);
  }
  
  // bucket_index_layout
diff --git a/src/rgw/rgw_bucket_layout.h b/src/rgw/rgw_bucket_layout.h

index b360dd32c3715fa70a4e7e91b6b18a06d16cf626..08bacc81b306b511cdc707315f0b6a5e3440e36a 100644 (file)
--- a/src/rgw/rgw_bucket_layout.h
+++ b/src/rgw/rgw_bucket_layout.h
@@ -54,10 +54,15 @@ void decode_json_obj(BucketHashType& t, JSONObj *obj);
  struct bucket_index_normal_layout {
    uint32_t num_shards = 1;
  
+  // the minimum number of shards this bucket layout allows
+  uint32_t min_num_shards = 1;
+
    BucketHashType hash_type = BucketHashType::Mod;
  
-  friend std::ostream& operator<<(std::ostream& out, const bucket_index_normal_layout& l) {
-    out << "num_shards=" << l.num_shards << ", hash_type=" << to_string(l.hash_type);
+  friend std::ostream& operator<<(std::ostream& out,
+                                 const bucket_index_normal_layout& l) {
+    out << "num_shards=" << l.num_shards << ", min_num_shards=" <<
+      l.min_num_shards << ", hash_type=" << to_string(l.hash_type);
      return out;
    }
  };
@@ -278,9 +283,13 @@ inline uint32_t num_shards(const bucket_index_layout& index) {
  inline uint32_t num_shards(const bucket_index_layout_generation& index) {
    return num_shards(index.layout);
  }
+
  inline uint32_t current_num_shards(const BucketLayout& layout) {
    return num_shards(layout.current_index);
  }
+inline uint32_t current_min_layout_shards(const BucketLayout& layout) {
+  return layout.current_index.layout.normal.min_num_shards;
+}
  inline bool is_layout_indexless(const bucket_index_layout_generation& layout) {
    return layout.layout.type == BucketIndexType::Indexless;
  }
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h

index 51e4f385ea4be0856fb0c154cbac73e98ff25ea8..73f4923b84036bb22318b1e791eea4c2efe3c4fe 100644 (file)
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -1045,6 +1045,9 @@ class RGWSI_Zone;
  
  #include "rgw_cksum.h"
  
+
+// this represents the at-rest bucket instance object and is stored as
+// a system object
  struct RGWBucketInfo {
    rgw_bucket bucket;
    rgw_owner owner;
diff --git a/src/test/rgw/test_rgw_reshard.cc b/src/test/rgw/test_rgw_reshard.cc

index 3513e644aa88dab5d234184f0334c5eaf0731f20..d014fb33aa1a619fee2c27165092aa7d85707c15 100644 (file)
--- a/src/test/rgw/test_rgw_reshard.cc
+++ b/src/test/rgw/test_rgw_reshard.cc
@@ -16,12 +16,15 @@
  #include <gtest/gtest.h>
  
  
-TEST(TestRGWReshard, dynamic_reshard_shard_count)
+TEST(TestRGWReshard, max_prime_shards)
  {
    // assuming we have prime numbers up to 1999
    ASSERT_EQ(1999u, RGWBucketReshard::get_max_prime_shards()) <<
      "initial list has primes up to 1999";
+}
  
+TEST(TestRGWReshard, prime_lookups)
+{
    ASSERT_EQ(1u, RGWBucketReshard::get_prime_shards_greater_or_equal(1)) <<
      "we allow for 1 shard even though it's not prime";
    ASSERT_EQ(809u, RGWBucketReshard::get_prime_shards_greater_or_equal(808)) <<
@@ -47,24 +50,72 @@ TEST(TestRGWReshard, dynamic_reshard_shard_count)
      "811 is prime";
    ASSERT_EQ(811u, RGWBucketReshard::get_prime_shards_less_or_equal(812)) <<
      "821 is prime";
+}
  
+TEST(TestRGWReshard, nearest_prime)
+{
    // tests when max dynamic shards is equal to end of prime list
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1998, 1999, 11));
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1999, 1999, 11));
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(2000, 1999, 11));
-
-  // tests when max dynamic shards is above end of prime list
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1998, 3000, 11));
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1999, 3000, 11));
-  ASSERT_EQ(2000u, RGWBucketReshard::get_prime_shard_count(2000, 3000, 11));
-  ASSERT_EQ(2001u, RGWBucketReshard::get_prime_shard_count(2001, 3000, 11));
-
-  // tests when max dynamic shards is below end of prime list
-  ASSERT_EQ(500u, RGWBucketReshard::get_prime_shard_count(1998, 500, 11));
-  ASSERT_EQ(500u, RGWBucketReshard::get_prime_shard_count(2001, 500, 11));
-
-  // tests when max dynamic shards is below end of prime list
-  ASSERT_EQ(499u, RGWBucketReshard::get_prime_shard_count(498, 1999, 499));
-  ASSERT_EQ(499u, RGWBucketReshard::get_prime_shard_count(499, 1999, 499));
-  ASSERT_EQ(503u, RGWBucketReshard::get_prime_shard_count(500, 1999, 499));
+
+  ASSERT_EQ(239u, RGWBucketReshard::nearest_prime(238));
+  ASSERT_EQ(239u, RGWBucketReshard::nearest_prime(239));
+  ASSERT_EQ(241u, RGWBucketReshard::nearest_prime(240));
+  ASSERT_EQ(241u, RGWBucketReshard::nearest_prime(241));
+  ASSERT_EQ(251u, RGWBucketReshard::nearest_prime(242));
+
+  ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1995));
+  ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1996));
+  ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1997));
+  ASSERT_EQ(1999u, RGWBucketReshard::nearest_prime(1998));
+  ASSERT_EQ(1999u, RGWBucketReshard::nearest_prime(1999));
+  ASSERT_EQ(2000u, RGWBucketReshard::nearest_prime(2000));
+}
+
+TEST(TestRGWReshard, calculate_preferred_shards)
+{
+  bool needs_resharding;
+  uint32_t suggested_shard_count = 0;
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 10000000, 200,
+                                              needs_resharding, &suggested_shard_count);
+
+  ASSERT_EQ(false, needs_resharding) << "no need to reshard when shards are half-used";
+
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 20200000, 200,
+                                              needs_resharding, &suggested_shard_count, false);
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(404, suggested_shard_count) << "number of shards when primes are not preferred";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 20200000, 200,
+                                              needs_resharding, &suggested_shard_count, true);
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(409, suggested_shard_count) << "number of shards when primes are preferred";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, true, 20200000, 200,
+                                              needs_resharding, &suggested_shard_count, true);
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(1619, suggested_shard_count) <<
+    "number of shards under multisite with primes preferred since "
+    "multisite quadruples number of shards to reduce need to reshaard";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 3, 100000, false, 650000, 700,
+                                              needs_resharding, &suggested_shard_count, true);
+  // 650,000 objs across 700 shards -> <1000 objs per shard; 650000 /
+  // 50000 = 13
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(13, suggested_shard_count) << "shard reduction without hitting min_layout_shards";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 3, 100000, false, 350000, 400,
+                                              needs_resharding, &suggested_shard_count, true);
+  // 350,000 objs across 400 shards -> <1000 objs per shard; 350000 /
+  // 50000 = 7, but hard-coded minimum of 11
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(11, suggested_shard_count) << "shard reduction and hitting hard-coded minimum of 11";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 51, 100000, false, 650000, 700,
+                                              needs_resharding, &suggested_shard_count, true);
+  // 650,000 objs across 700 shards -> <1000 objs per shard; 650000 /
+  // 50000 = 13, but bucket min of 51
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(51, suggested_shard_count) << "shard reduction and hitting min_layout_shards";
  }
author	J. Eric Ivancich <ivancich@redhat.com>
	Wed, 15 Jan 2025 16:26:59 +0000 (11:26 -0500)
committer	J. Eric Ivancich <ivancich@redhat.com>
	Fri, 28 Feb 2025 19:31:34 +0000 (14:31 -0500)
src/rgw/driver/rados/rgw_bucket.cc		patch \| blob \| history
src/rgw/driver/rados/rgw_rados.cc		patch \| blob \| history
src/rgw/driver/rados/rgw_rados.h		patch \| blob \| history
src/rgw/driver/rados/rgw_reshard.cc		patch \| blob \| history
src/rgw/driver/rados/rgw_reshard.h		patch \| blob \| history
src/rgw/rgw_bucket_layout.cc		patch \| blob \| history
src/rgw/rgw_bucket_layout.h		patch \| blob \| history
src/rgw/rgw_common.h		patch \| blob \| history
src/test/rgw/test_rgw_reshard.cc		patch \| blob \| history