git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw: allow per-bucket minimum number of shards
author: J. Eric Ivancich <ivancich@redhat.com>
Wed, 15 Jan 2025 16:26:59 +0000 (11:26 -0500)
committer: J. Eric Ivancich <ivancich@redhat.com>
Fri, 28 Feb 2025 19:31:34 +0000 (14:31 -0500)
Dynamic resharding can now reduce the number of shards. The code
currently has a hard-coded value of 11 as the minimum number of shards
dynamic resharding can reshard to. There may be cases where the user
wants to set an alternate minimum, such as when they have a sense of
how many objects the bucket will eventually hold.

This PR builds off of https://github.com/ceph/ceph/pull/61269 .

That PR allows the user to specify an initial number of shards during
bucket creation. This PR then takes that number to be the minimum and
saves it in the layout field of the bucket instance object
(RGWBucketInfo).

When dynamic resharding is triggered, it will use that stored value as
a minimum number of shards for resharding.

Signed-off-by: J. Eric Ivancich <ivancich@redhat.com>
src/rgw/driver/rados/rgw_bucket.cc
src/rgw/driver/rados/rgw_rados.cc
src/rgw/driver/rados/rgw_rados.h
src/rgw/driver/rados/rgw_reshard.cc
src/rgw/driver/rados/rgw_reshard.h
src/rgw/rgw_bucket_layout.cc
src/rgw/rgw_bucket_layout.h
src/rgw/rgw_common.h
src/test/rgw/test_rgw_reshard.cc

index e460739ac9f6ff3723c562783f8084d0db54dd02..e444521d395c94c68f17eef8a1f360fbc9373e9f 100644 (file)
@@ -2815,6 +2815,7 @@ void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
 
   if (shards) {
     layout.current_index.layout.normal.num_shards = *shards;
+    layout.current_index.layout.normal.min_num_shards = *shards;
   } else if (cct->_conf->rgw_override_bucket_index_max_shards > 0) {
     layout.current_index.layout.normal.num_shards =
       cct->_conf->rgw_override_bucket_index_max_shards;
index 5eb2f9d0a075a51f33d680a4c042ad17952cc468..959e2f7d9943a9f67d72fe69e0e33be5cefa7bad 100644 (file)
@@ -10823,6 +10823,7 @@ int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBuck
 void RGWRados::calculate_preferred_shards(const DoutPrefixProvider* dpp,
                                          const uint64_t num_objs,
                                          const uint32_t num_source_shards,
+                                         const uint32_t min_layout_shards,
                                          bool& need_resharding,
                                          uint32_t* suggested_num_shards)
 {
@@ -10834,6 +10835,7 @@ void RGWRados::calculate_preferred_shards(const DoutPrefixProvider* dpp,
 
   RGWBucketReshard::calculate_preferred_shards(dpp,
                                               max_dynamic_shards,
+                                              min_layout_shards,
                                               max_objs_per_shard,
                                               is_multisite,
                                               num_objs,
@@ -10867,8 +10869,11 @@ int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
   uint32_t suggested_num_shards = 0;
   const uint32_t num_source_shards =
     rgw::current_num_shards(bucket_info.layout);
+  const uint32_t min_layout_shards =
+    rgw::current_min_layout_shards(bucket_info.layout);
 
-  calculate_preferred_shards(dpp, num_objs, num_source_shards,
+  calculate_preferred_shards(dpp, num_objs,
+                            num_source_shards, min_layout_shards,
                             need_resharding, &suggested_num_shards);
   if (! need_resharding) {
     return 0;
index 868e5c95eacb62ef7499291712667f26950bb3d7..22bc71690d194cd2d15a4a344fd6d3cf46b20637 100644 (file)
@@ -1630,6 +1630,7 @@ public:
   void calculate_preferred_shards(const DoutPrefixProvider* dpp,
                                  const uint64_t num_objs,
                                  const uint32_t current_shard_count,
+                                 const uint32_t min_layout_shards,
                                  bool& need_resharding,
                                  uint32_t* suggested_num_shard_count = nullptr);
 
index 82080c4a06f278f2db52f3f8a87a7c143f8e2e1e..429b209bbac271ffe8fce4ed544c02cc7b152e12 100644 (file)
@@ -69,23 +69,13 @@ const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
 };
 
 
-uint32_t RGWBucketReshard::get_prime_shard_count(
-  uint32_t shard_count,
-  uint32_t max_dynamic_shards,
-  uint32_t min_dynamic_shards)
-{
+uint32_t RGWBucketReshard::nearest_prime(uint32_t shard_count)  {
   uint32_t prime_shard_count =
     get_prime_shards_greater_or_equal(shard_count);
 
   // if we cannot find a larger prime number, then just use what was
   // passed in
-  if (! prime_shard_count) {
-    prime_shard_count = shard_count;
-  }
-
-  // keep within min/max bounds
-  return std::min(max_dynamic_shards,
-                 std::max(min_dynamic_shards, prime_shard_count));
+  return prime_shard_count ? prime_shard_count : shard_count;
 }
 
 
@@ -96,6 +86,7 @@ uint32_t RGWBucketReshard::get_prime_shard_count(
 void RGWBucketReshard::calculate_preferred_shards(
   const DoutPrefixProvider* dpp,
   const uint32_t max_dynamic_shards,
+  const uint32_t min_layout_shards,
   const uint64_t max_objs_per_shard,
   const bool is_multisite,
   const uint64_t num_objs,
@@ -139,10 +130,13 @@ void RGWBucketReshard::calculate_preferred_shards(
   }
 
   if (prefer_prime) {
-    calculated_num_shards = get_prime_shard_count(
-      calculated_num_shards, max_dynamic_shards, min_dynamic_shards);
+    calculated_num_shards = nearest_prime(calculated_num_shards);
   }
 
+  calculated_num_shards =
+    std::min(max_dynamic_shards,
+            std::max({ calculated_num_shards, min_dynamic_shards, min_layout_shards }));
+
   ldpp_dout(dpp, 20) << __func__ << ": reshard " << verb <<
     " suggested; current average (objects/shard) is " <<
     float(num_objs) / current_num_shards << ", which is not within " <<
@@ -461,6 +455,7 @@ static int init_target_layout(rgw::sal::RadosStore* store,
   rgw::bucket_index_layout_generation target;
   target.layout.type = rgw::BucketIndexType::Normal;
   target.layout.normal.num_shards = new_num_shards;
+  target.layout.normal.min_num_shards = current.layout.normal.min_num_shards;
   target.gen = current.gen + 1;
 
   if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) {
@@ -1256,7 +1251,7 @@ int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& curr
     // block the client op and complete the resharding
     ceph_assert(bucket_info.layout.resharding == rgw::BucketReshardState::InProgress);
     ret = reshard_process(current, max_op_entries, target_shards_mgr, verbose_json_out, out,
-                              formatter, bucket_info.layout.resharding, dpp, y);
+                         formatter, bucket_info.layout.resharding, dpp, y);
     if (ret < 0) {
       ldpp_dout(dpp, 0) << __func__ << ": failed in progress state of reshard ret = " << ret << dendl;
       return ret;
@@ -1637,6 +1632,9 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
     ret = store->getRados()->get_bucket_stats(dpp, bucket_info,
                                              bucket_info.layout.current_index,
                                              -1, nullptr, nullptr, stats, nullptr, nullptr);
+    if (ret < 0) {
+      return clean_up("unable to access buckets current stats");
+    }
 
     // determine current number of bucket entries across shards
     uint64_t num_entries = 0;
@@ -1645,7 +1643,9 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
     }
 
     const uint32_t current_shard_count =
-      rgw::num_shards(bucket_info.get_current_index().layout.normal);
+      rgw::current_num_shards(bucket_info.layout);
+    const uint32_t min_layout_shards =
+      rgw::current_min_layout_shards(bucket_info.layout);
 
     bool needs_resharding { false };
     uint32_t suggested_shard_count { 0 };
@@ -1653,7 +1653,7 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
     // needed to perform the calculation before calling
     // calculating_preferred_shards() in this class
     store->getRados()->calculate_preferred_shards(
-      dpp, num_entries, current_shard_count,
+      dpp, num_entries, current_shard_count, min_layout_shards,
       needs_resharding, &suggested_shard_count);
 
     // if we no longer need resharding or currently need to expand
@@ -1711,7 +1711,6 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
   }
 
   // all checkes passed; we can reshard...
-
   RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr);
 
   ReshardFaultInjector f; // no fault injected
index 3d056e50f468d9860a4fcef9536ed710b75e7fce..bdd7f67873540a443609eff441d4f6553307eb24 100644 (file)
@@ -160,14 +160,12 @@ public:
     }
   }
 
-  // returns a preferred number of shards given a calculated number of
-  // shards based on max_dynamic_shards and the list of prime values
-  static uint32_t get_prime_shard_count(uint32_t suggested_shards,
-                                       uint32_t max_dynamic_shards,
-                                       uint32_t min_dynamic_shards);
+  // returns a preferred number of shards as a prime value
+  static uint32_t nearest_prime(uint32_t suggested_shards);
 
   static void calculate_preferred_shards(const DoutPrefixProvider* dpp,
                                         const uint32_t max_dynamic_shards,
+                                        const uint32_t min_layout_shards,
                                         const uint64_t max_objs_per_shard,
                                         const bool is_multisite,
                                         const uint64_t num_objs,
index 1f8db396a0d13487a96cd3ba825390ca80bb3853..10ff7200eaa30045078cf6e47f755c420b13214b 100644 (file)
@@ -81,16 +81,20 @@ void decode_json_obj(BucketHashType& t, JSONObj *obj)
 // bucket_index_normal_layout
 void encode(const bucket_index_normal_layout& l, bufferlist& bl, uint64_t f)
 {
-  ENCODE_START(1, 1, bl);
+  ENCODE_START(2, 1, bl);
   encode(l.num_shards, bl);
   encode(l.hash_type, bl);
+  encode(l.min_num_shards, bl);
   ENCODE_FINISH(bl);
 }
 void decode(bucket_index_normal_layout& l, bufferlist::const_iterator& bl)
 {
-  DECODE_START(1, bl);
+  DECODE_START(2, bl);
   decode(l.num_shards, bl);
   decode(l.hash_type, bl);
+  if (struct_v >= 2) {
+    decode(l.min_num_shards, bl);
+  }
   DECODE_FINISH(bl);
 }
 void encode_json_impl(const char *name, const bucket_index_normal_layout& l, ceph::Formatter *f)
@@ -98,12 +102,16 @@ void encode_json_impl(const char *name, const bucket_index_normal_layout& l, cep
   f->open_object_section(name);
   encode_json("num_shards", l.num_shards, f);
   encode_json("hash_type", l.hash_type, f);
+  encode_json("min_num_shards", l.min_num_shards, f);
   f->close_section();
 }
 void decode_json_obj(bucket_index_normal_layout& l, JSONObj *obj)
 {
   JSONDecoder::decode_json("num_shards", l.num_shards, obj);
   JSONDecoder::decode_json("hash_type", l.hash_type, obj);
+
+  // if not set in json, set to default value of 1
+  JSONDecoder::decode_json("min_num_shards", l.min_num_shards, obj, 1);
 }
 
 // bucket_index_layout
index b360dd32c3715fa70a4e7e91b6b18a06d16cf626..08bacc81b306b511cdc707315f0b6a5e3440e36a 100644 (file)
@@ -54,10 +54,15 @@ void decode_json_obj(BucketHashType& t, JSONObj *obj);
 struct bucket_index_normal_layout {
   uint32_t num_shards = 1;
 
+  // the fewest number of shards this bucket layout allows
+  uint32_t min_num_shards = 1;
+
   BucketHashType hash_type = BucketHashType::Mod;
 
-  friend std::ostream& operator<<(std::ostream& out, const bucket_index_normal_layout& l) {
-    out << "num_shards=" << l.num_shards << ", hash_type=" << to_string(l.hash_type);
+  friend std::ostream& operator<<(std::ostream& out,
+                                 const bucket_index_normal_layout& l) {
+    out << "num_shards=" << l.num_shards << ", min_num_shards=" <<
+      l.min_num_shards << ", hash_type=" << to_string(l.hash_type);
     return out;
   }
 };
@@ -278,9 +283,13 @@ inline uint32_t num_shards(const bucket_index_layout& index) {
 inline uint32_t num_shards(const bucket_index_layout_generation& index) {
   return num_shards(index.layout);
 }
+
 inline uint32_t current_num_shards(const BucketLayout& layout) {
   return num_shards(layout.current_index);
 }
+inline uint32_t current_min_layout_shards(const BucketLayout& layout) {
+  return layout.current_index.layout.normal.min_num_shards;
+}
 inline bool is_layout_indexless(const bucket_index_layout_generation& layout) {
   return layout.layout.type == BucketIndexType::Indexless;
 }
index 51e4f385ea4be0856fb0c154cbac73e98ff25ea8..73f4923b84036bb22318b1e791eea4c2efe3c4fe 100644 (file)
@@ -1045,6 +1045,9 @@ class RGWSI_Zone;
 
 #include "rgw_cksum.h"
 
+
+// this represents the at-rest bucket instance object and is stored as
+// a system object
 struct RGWBucketInfo {
   rgw_bucket bucket;
   rgw_owner owner;
index 3513e644aa88dab5d234184f0334c5eaf0731f20..d014fb33aa1a619fee2c27165092aa7d85707c15 100644 (file)
 #include <gtest/gtest.h>
 
 
-TEST(TestRGWReshard, dynamic_reshard_shard_count)
+TEST(TestRGWReshard, max_prime_shards)
 {
   // assuming we have prime numbers up to 1999
   ASSERT_EQ(1999u, RGWBucketReshard::get_max_prime_shards()) <<
     "initial list has primes up to 1999";
+}
 
+TEST(TestRGWReshard, prime_lookups)
+{
   ASSERT_EQ(1u, RGWBucketReshard::get_prime_shards_greater_or_equal(1)) <<
     "we allow for 1 shard even though it's not prime";
   ASSERT_EQ(809u, RGWBucketReshard::get_prime_shards_greater_or_equal(808)) <<
@@ -47,24 +50,72 @@ TEST(TestRGWReshard, dynamic_reshard_shard_count)
     "811 is prime";
   ASSERT_EQ(811u, RGWBucketReshard::get_prime_shards_less_or_equal(812)) <<
     "821 is prime";
+}
 
+TEST(TestRGWReshard, nearest_prime)
+{
   // tests when max dynamic shards is equal to end of prime list
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1998, 1999, 11));
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1999, 1999, 11));
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(2000, 1999, 11));
-
-  // tests when max dynamic shards is above end of prime list
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1998, 3000, 11));
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1999, 3000, 11));
-  ASSERT_EQ(2000u, RGWBucketReshard::get_prime_shard_count(2000, 3000, 11));
-  ASSERT_EQ(2001u, RGWBucketReshard::get_prime_shard_count(2001, 3000, 11));
-
-  // tests when max dynamic shards is below end of prime list
-  ASSERT_EQ(500u, RGWBucketReshard::get_prime_shard_count(1998, 500, 11));
-  ASSERT_EQ(500u, RGWBucketReshard::get_prime_shard_count(2001, 500, 11));
-
-  // tests when max dynamic shards is below end of prime list
-  ASSERT_EQ(499u, RGWBucketReshard::get_prime_shard_count(498, 1999, 499));
-  ASSERT_EQ(499u, RGWBucketReshard::get_prime_shard_count(499, 1999, 499));
-  ASSERT_EQ(503u, RGWBucketReshard::get_prime_shard_count(500, 1999, 499));
+
+  ASSERT_EQ(239u, RGWBucketReshard::nearest_prime(238));
+  ASSERT_EQ(239u, RGWBucketReshard::nearest_prime(239));
+  ASSERT_EQ(241u, RGWBucketReshard::nearest_prime(240));
+  ASSERT_EQ(241u, RGWBucketReshard::nearest_prime(241));
+  ASSERT_EQ(251u, RGWBucketReshard::nearest_prime(242));
+
+  ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1995));
+  ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1996));
+  ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1997));
+  ASSERT_EQ(1999u, RGWBucketReshard::nearest_prime(1998));
+  ASSERT_EQ(1999u, RGWBucketReshard::nearest_prime(1999));
+  ASSERT_EQ(2000u, RGWBucketReshard::nearest_prime(2000));
+}
+
+TEST(TestRGWReshard, calculate_preferred_shards)
+{
+  bool needs_resharding;
+  uint32_t suggested_shard_count = 0;
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 10000000, 200,
+                                              needs_resharding, &suggested_shard_count);
+
+  ASSERT_EQ(false, needs_resharding) << "no need to reshard when shards are half-used";
+
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 20200000, 200,
+                                              needs_resharding, &suggested_shard_count, false);
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(404, suggested_shard_count) << "number of shards when primes are not preferred";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 20200000, 200,
+                                              needs_resharding, &suggested_shard_count, true);
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(409, suggested_shard_count) << "number of shards when primes are preferred";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, true, 20200000, 200,
+                                              needs_resharding, &suggested_shard_count, true);
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(1619, suggested_shard_count) <<
+    "number of shards under multisite with primes preferred since "
+    "multisite quadruples number of shards to reduce need to reshaard";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 3, 100000, false, 650000, 700,
+                                              needs_resharding, &suggested_shard_count, true);
+  // 650,000 objs across 700 shards -> <1000 objs per shard; 650000 /
+  // 50000 = 13
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(13, suggested_shard_count) << "shard reduction without hitting min_layout_shards";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 3, 100000, false, 350000, 400,
+                                              needs_resharding, &suggested_shard_count, true);
+  // 350,000 objs across 400 shards -> <1000 objs per shard; 350000 /
+  // 50000 = 7, but hard-coded minimum of 11
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(11, suggested_shard_count) << "shard reduction and hitting hard-coded minimum of 11";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 51, 100000, false, 650000, 700,
+                                              needs_resharding, &suggested_shard_count, true);
+  // 650,000 objs across 700 shards -> <1000 objs per shard; 650000 /
+  // 50000 = 13, but bucket min of 51
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(51, suggested_shard_count) << "shard reduction and hitting min_layout_shards";
 }