From 5ee4a92380ee351dec501b09e251cb9c86d023f8 Mon Sep 17 00:00:00 2001
From: "J. Eric Ivancich" <ivancich@redhat.com>
Date: Wed, 15 Jan 2025 11:26:59 -0500
Subject: [PATCH] rgw: allow per-bucket minimum number of shards

Dynamic resharding can now reduce the number of shards. The code
currently has a hard-coded value of 11 as the minimum number of shards
dynamic resharding can reshard to. There may be cases where the user
wants to set an alternate minimum, such as when they have a sense of
how many objects the bucket will eventually hold.

This PR builds off of https://github.com/ceph/ceph/pull/61269 .

That PR allows the user to specify an initial number of shards during
bucket creation. This PR then takes that number to be the minimum and
saves it in the layout field of the bucket instance object
(RGWBucketInfo).

When dynamic resharding is triggered, it will use that stored value as
a minimum number of shards for resharding.

Signed-off-by: J. Eric Ivancich <ivancich@redhat.com>
---
 src/rgw/driver/rados/rgw_bucket.cc  |  1 +
 src/rgw/driver/rados/rgw_rados.cc   |  7 ++-
 src/rgw/driver/rados/rgw_rados.h    |  1 +
 src/rgw/driver/rados/rgw_reshard.cc | 35 ++++++------
 src/rgw/driver/rados/rgw_reshard.h  |  8 +--
 src/rgw/rgw_bucket_layout.cc        | 12 +++-
 src/rgw/rgw_bucket_layout.h         | 13 ++++-
 src/rgw/rgw_common.h                |  3 +
 src/test/rgw/test_rgw_reshard.cc    | 89 +++++++++++++++++++++++------
 9 files changed, 122 insertions(+), 47 deletions(-)

diff --git a/src/rgw/driver/rados/rgw_bucket.cc b/src/rgw/driver/rados/rgw_bucket.cc
index e460739ac9f..e444521d395 100644
--- a/src/rgw/driver/rados/rgw_bucket.cc
+++ b/src/rgw/driver/rados/rgw_bucket.cc
@@ -2815,6 +2815,7 @@ void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
 
   if (shards) {
     layout.current_index.layout.normal.num_shards = *shards;
+    layout.current_index.layout.normal.min_num_shards = *shards;
   } else if (cct->_conf->rgw_override_bucket_index_max_shards > 0) {
     layout.current_index.layout.normal.num_shards =
       cct->_conf->rgw_override_bucket_index_max_shards;
diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc
index 5eb2f9d0a07..959e2f7d994 100644
--- a/src/rgw/driver/rados/rgw_rados.cc
+++ b/src/rgw/driver/rados/rgw_rados.cc
@@ -10823,6 +10823,7 @@ int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBuck
 void RGWRados::calculate_preferred_shards(const DoutPrefixProvider* dpp,
 					  const uint64_t num_objs,
 					  const uint32_t num_source_shards,
+					  const uint32_t min_layout_shards,
 					  bool& need_resharding,
 					  uint32_t* suggested_num_shards)
 {
@@ -10834,6 +10835,7 @@ void RGWRados::calculate_preferred_shards(const DoutPrefixProvider* dpp,
 
   RGWBucketReshard::calculate_preferred_shards(dpp,
 					       max_dynamic_shards,
+					       min_layout_shards,
 					       max_objs_per_shard,
 					       is_multisite,
 					       num_objs,
@@ -10867,8 +10869,11 @@ int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
   uint32_t suggested_num_shards = 0;
   const uint32_t num_source_shards =
     rgw::current_num_shards(bucket_info.layout);
+  const uint32_t min_layout_shards =
+    rgw::current_min_layout_shards(bucket_info.layout);
 
-  calculate_preferred_shards(dpp, num_objs, num_source_shards,
+  calculate_preferred_shards(dpp, num_objs,
+			     num_source_shards, min_layout_shards,
 			     need_resharding, &suggested_num_shards);
   if (! need_resharding) {
     return 0;
diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h
index 868e5c95eac..22bc71690d1 100644
--- a/src/rgw/driver/rados/rgw_rados.h
+++ b/src/rgw/driver/rados/rgw_rados.h
@@ -1630,6 +1630,7 @@ public:
   void calculate_preferred_shards(const DoutPrefixProvider* dpp,
 				  const uint64_t num_objs,
 				  const uint32_t current_shard_count,
+				  const uint32_t min_layout_shards,
 				  bool& need_resharding,
 				  uint32_t* suggested_num_shard_count = nullptr);
 
diff --git a/src/rgw/driver/rados/rgw_reshard.cc b/src/rgw/driver/rados/rgw_reshard.cc
index 82080c4a06f..429b209bbac 100644
--- a/src/rgw/driver/rados/rgw_reshard.cc
+++ b/src/rgw/driver/rados/rgw_reshard.cc
@@ -69,23 +69,13 @@ const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
 };
 
 
-uint32_t RGWBucketReshard::get_prime_shard_count(
-  uint32_t shard_count,
-  uint32_t max_dynamic_shards,
-  uint32_t min_dynamic_shards)
-{
+uint32_t RGWBucketReshard::nearest_prime(uint32_t shard_count)  {
   uint32_t prime_shard_count =
     get_prime_shards_greater_or_equal(shard_count);
 
   // if we cannot find a larger prime number, then just use what was
   // passed in
-  if (! prime_shard_count) {
-    prime_shard_count = shard_count;
-  }
-
-  // keep within min/max bounds
-  return std::min(max_dynamic_shards,
-		  std::max(min_dynamic_shards, prime_shard_count));
+  return prime_shard_count ? prime_shard_count : shard_count;
 }
 
 
@@ -96,6 +86,7 @@ uint32_t RGWBucketReshard::get_prime_shard_count(
 void RGWBucketReshard::calculate_preferred_shards(
   const DoutPrefixProvider* dpp,
   const uint32_t max_dynamic_shards,
+  const uint32_t min_layout_shards,
   const uint64_t max_objs_per_shard,
   const bool is_multisite,
   const uint64_t num_objs,
@@ -139,10 +130,13 @@ void RGWBucketReshard::calculate_preferred_shards(
   }
 
   if (prefer_prime) {
-    calculated_num_shards = get_prime_shard_count(
-      calculated_num_shards, max_dynamic_shards, min_dynamic_shards);
+    calculated_num_shards = nearest_prime(calculated_num_shards);
   }
 
+  calculated_num_shards =
+    std::min(max_dynamic_shards,
+	     std::max({ calculated_num_shards, min_dynamic_shards, min_layout_shards }));
+
   ldpp_dout(dpp, 20) << __func__ << ": reshard " << verb <<
     " suggested; current average (objects/shard) is " <<
     float(num_objs) / current_num_shards << ", which is not within " <<
@@ -461,6 +455,7 @@ static int init_target_layout(rgw::sal::RadosStore* store,
   rgw::bucket_index_layout_generation target;
   target.layout.type = rgw::BucketIndexType::Normal;
   target.layout.normal.num_shards = new_num_shards;
+  target.layout.normal.min_num_shards = current.layout.normal.min_num_shards;
   target.gen = current.gen + 1;
 
   if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) {
@@ -1256,7 +1251,7 @@ int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& curr
     // block the client op and complete the resharding
     ceph_assert(bucket_info.layout.resharding == rgw::BucketReshardState::InProgress);
     ret = reshard_process(current, max_op_entries, target_shards_mgr, verbose_json_out, out,
-                              formatter, bucket_info.layout.resharding, dpp, y);
+			  formatter, bucket_info.layout.resharding, dpp, y);
     if (ret < 0) {
       ldpp_dout(dpp, 0) << __func__ << ": failed in progress state of reshard ret = " << ret << dendl;
       return ret;
@@ -1637,6 +1632,9 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
     ret = store->getRados()->get_bucket_stats(dpp, bucket_info,
 					      bucket_info.layout.current_index,
 					      -1, nullptr, nullptr, stats, nullptr, nullptr);
+    if (ret < 0) {
+      return clean_up("unable to access buckets current stats");
+    }
 
     // determine current number of bucket entries across shards
     uint64_t num_entries = 0;
@@ -1645,7 +1643,9 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
     }
 
     const uint32_t current_shard_count =
-      rgw::num_shards(bucket_info.get_current_index().layout.normal);
+      rgw::current_num_shards(bucket_info.layout);
+    const uint32_t min_layout_shards =
+      rgw::current_min_layout_shards(bucket_info.layout);
 
     bool needs_resharding { false };
     uint32_t suggested_shard_count { 0 };
@@ -1653,7 +1653,7 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
     // needed to perform the calculation before calling
     // calculating_preferred_shards() in this class
     store->getRados()->calculate_preferred_shards(
-      dpp, num_entries, current_shard_count,
+      dpp, num_entries, current_shard_count, min_layout_shards,
       needs_resharding, &suggested_shard_count);
 
     // if we no longer need resharding or currently need to expand
@@ -1711,7 +1711,6 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
   }
 
   // all checkes passed; we can reshard...
-
   RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr);
 
   ReshardFaultInjector f; // no fault injected
diff --git a/src/rgw/driver/rados/rgw_reshard.h b/src/rgw/driver/rados/rgw_reshard.h
index 3d056e50f46..bdd7f678735 100644
--- a/src/rgw/driver/rados/rgw_reshard.h
+++ b/src/rgw/driver/rados/rgw_reshard.h
@@ -160,14 +160,12 @@ public:
     }
   }
 
-  // returns a preferred number of shards given a calculated number of
-  // shards based on max_dynamic_shards and the list of prime values
-  static uint32_t get_prime_shard_count(uint32_t suggested_shards,
-					uint32_t max_dynamic_shards,
-					uint32_t min_dynamic_shards);
+  // returns a preferred number of shards as a prime value
+  static uint32_t nearest_prime(uint32_t suggested_shards);
 
   static void calculate_preferred_shards(const DoutPrefixProvider* dpp,
 					 const uint32_t max_dynamic_shards,
+					 const uint32_t min_layout_shards,
 					 const uint64_t max_objs_per_shard,
 					 const bool is_multisite,
 					 const uint64_t num_objs,
diff --git a/src/rgw/rgw_bucket_layout.cc b/src/rgw/rgw_bucket_layout.cc
index 1f8db396a0d..10ff7200eaa 100644
--- a/src/rgw/rgw_bucket_layout.cc
+++ b/src/rgw/rgw_bucket_layout.cc
@@ -81,16 +81,20 @@ void decode_json_obj(BucketHashType& t, JSONObj *obj)
 // bucket_index_normal_layout
 void encode(const bucket_index_normal_layout& l, bufferlist& bl, uint64_t f)
 {
-  ENCODE_START(1, 1, bl);
+  ENCODE_START(2, 1, bl);
   encode(l.num_shards, bl);
   encode(l.hash_type, bl);
+  encode(l.min_num_shards, bl);
   ENCODE_FINISH(bl);
 }
 void decode(bucket_index_normal_layout& l, bufferlist::const_iterator& bl)
 {
-  DECODE_START(1, bl);
+  DECODE_START(2, bl);
   decode(l.num_shards, bl);
   decode(l.hash_type, bl);
+  if (struct_v >= 2) {
+    decode(l.min_num_shards, bl);
+  }
   DECODE_FINISH(bl);
 }
 void encode_json_impl(const char *name, const bucket_index_normal_layout& l, ceph::Formatter *f)
@@ -98,12 +102,16 @@ void encode_json_impl(const char *name, const bucket_index_normal_layout& l, cep
   f->open_object_section(name);
   encode_json("num_shards", l.num_shards, f);
   encode_json("hash_type", l.hash_type, f);
+  encode_json("min_num_shards", l.min_num_shards, f);
   f->close_section();
 }
 void decode_json_obj(bucket_index_normal_layout& l, JSONObj *obj)
 {
   JSONDecoder::decode_json("num_shards", l.num_shards, obj);
   JSONDecoder::decode_json("hash_type", l.hash_type, obj);
+
+  // if not set in json, set to default value of 1
+  JSONDecoder::decode_json("min_num_shards", l.min_num_shards, obj, 1);
 }
 
 // bucket_index_layout
diff --git a/src/rgw/rgw_bucket_layout.h b/src/rgw/rgw_bucket_layout.h
index b360dd32c37..08bacc81b30 100644
--- a/src/rgw/rgw_bucket_layout.h
+++ b/src/rgw/rgw_bucket_layout.h
@@ -54,10 +54,15 @@ void decode_json_obj(BucketHashType& t, JSONObj *obj);
 struct bucket_index_normal_layout {
   uint32_t num_shards = 1;
 
+  // the fewest number of shards this bucket layout allows
+  uint32_t min_num_shards = 1;
+
   BucketHashType hash_type = BucketHashType::Mod;
 
-  friend std::ostream& operator<<(std::ostream& out, const bucket_index_normal_layout& l) {
-    out << "num_shards=" << l.num_shards << ", hash_type=" << to_string(l.hash_type);
+  friend std::ostream& operator<<(std::ostream& out,
+				  const bucket_index_normal_layout& l) {
+    out << "num_shards=" << l.num_shards << ", min_num_shards=" <<
+      l.min_num_shards << ", hash_type=" << to_string(l.hash_type);
     return out;
   }
 };
@@ -278,9 +283,13 @@ inline uint32_t num_shards(const bucket_index_layout& index) {
 inline uint32_t num_shards(const bucket_index_layout_generation& index) {
   return num_shards(index.layout);
 }
+
 inline uint32_t current_num_shards(const BucketLayout& layout) {
   return num_shards(layout.current_index);
 }
+inline uint32_t current_min_layout_shards(const BucketLayout& layout) {
+  return layout.current_index.layout.normal.min_num_shards;
+}
 inline bool is_layout_indexless(const bucket_index_layout_generation& layout) {
   return layout.layout.type == BucketIndexType::Indexless;
 }
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 51e4f385ea4..73f4923b840 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -1045,6 +1045,9 @@ class RGWSI_Zone;
 
 #include "rgw_cksum.h"
 
+
+// this represents the at-rest bucket instance object and is stored as
+// a system object
 struct RGWBucketInfo {
   rgw_bucket bucket;
   rgw_owner owner;
diff --git a/src/test/rgw/test_rgw_reshard.cc b/src/test/rgw/test_rgw_reshard.cc
index 3513e644aa8..d014fb33aa1 100644
--- a/src/test/rgw/test_rgw_reshard.cc
+++ b/src/test/rgw/test_rgw_reshard.cc
@@ -16,12 +16,15 @@
 #include <gtest/gtest.h>
 
 
-TEST(TestRGWReshard, dynamic_reshard_shard_count)
+TEST(TestRGWReshard, max_prime_shards)
 {
   // assuming we have prime numbers up to 1999
   ASSERT_EQ(1999u, RGWBucketReshard::get_max_prime_shards()) <<
     "initial list has primes up to 1999";
+}
 
+TEST(TestRGWReshard, prime_lookups)
+{
   ASSERT_EQ(1u, RGWBucketReshard::get_prime_shards_greater_or_equal(1)) <<
     "we allow for 1 shard even though it's not prime";
   ASSERT_EQ(809u, RGWBucketReshard::get_prime_shards_greater_or_equal(808)) <<
@@ -47,24 +50,72 @@ TEST(TestRGWReshard, dynamic_reshard_shard_count)
     "811 is prime";
   ASSERT_EQ(811u, RGWBucketReshard::get_prime_shards_less_or_equal(812)) <<
     "821 is prime";
+}
 
+TEST(TestRGWReshard, nearest_prime)
+{
   // tests when max dynamic shards is equal to end of prime list
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1998, 1999, 11));
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1999, 1999, 11));
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(2000, 1999, 11));
-
-  // tests when max dynamic shards is above end of prime list
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1998, 3000, 11));
-  ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1999, 3000, 11));
-  ASSERT_EQ(2000u, RGWBucketReshard::get_prime_shard_count(2000, 3000, 11));
-  ASSERT_EQ(2001u, RGWBucketReshard::get_prime_shard_count(2001, 3000, 11));
-
-  // tests when max dynamic shards is below end of prime list
-  ASSERT_EQ(500u, RGWBucketReshard::get_prime_shard_count(1998, 500, 11));
-  ASSERT_EQ(500u, RGWBucketReshard::get_prime_shard_count(2001, 500, 11));
-
-  // tests when max dynamic shards is below end of prime list
-  ASSERT_EQ(499u, RGWBucketReshard::get_prime_shard_count(498, 1999, 499));
-  ASSERT_EQ(499u, RGWBucketReshard::get_prime_shard_count(499, 1999, 499));
-  ASSERT_EQ(503u, RGWBucketReshard::get_prime_shard_count(500, 1999, 499));
+
+  ASSERT_EQ(239u, RGWBucketReshard::nearest_prime(238));
+  ASSERT_EQ(239u, RGWBucketReshard::nearest_prime(239));
+  ASSERT_EQ(241u, RGWBucketReshard::nearest_prime(240));
+  ASSERT_EQ(241u, RGWBucketReshard::nearest_prime(241));
+  ASSERT_EQ(251u, RGWBucketReshard::nearest_prime(242));
+
+  ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1995));
+  ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1996));
+  ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1997));
+  ASSERT_EQ(1999u, RGWBucketReshard::nearest_prime(1998));
+  ASSERT_EQ(1999u, RGWBucketReshard::nearest_prime(1999));
+  ASSERT_EQ(2000u, RGWBucketReshard::nearest_prime(2000));
+}
+
+TEST(TestRGWReshard, calculate_preferred_shards)
+{
+  bool needs_resharding;
+  uint32_t suggested_shard_count = 0;
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 10000000, 200,
+					       needs_resharding, &suggested_shard_count);
+
+  ASSERT_EQ(false, needs_resharding) << "no need to reshard when shards are half-used";
+
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 20200000, 200,
+					       needs_resharding, &suggested_shard_count, false);
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(404, suggested_shard_count) << "number of shards when primes are not preferred";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 20200000, 200,
+					       needs_resharding, &suggested_shard_count, true);
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(409, suggested_shard_count) << "number of shards when primes are preferred";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, true, 20200000, 200,
+					       needs_resharding, &suggested_shard_count, true);
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(1619, suggested_shard_count) <<
+    "number of shards under multisite with primes preferred since "
+    "multisite quadruples number of shards to reduce need to reshaard";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 3, 100000, false, 650000, 700,
+					       needs_resharding, &suggested_shard_count, true);
+  // 650,000 objs across 700 shards -> <1000 objs per shard; 650000 /
+  // 50000 = 13
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(13, suggested_shard_count) << "shard reduction without hitting min_layout_shards";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 3, 100000, false, 350000, 400,
+					       needs_resharding, &suggested_shard_count, true);
+  // 350,000 objs across 400 shards -> <1000 objs per shard; 350000 /
+  // 50000 = 7, but hard-coded minimum of 11
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(11, suggested_shard_count) << "shard reduction and hitting hard-coded minimum of 11";
+
+  RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 51, 100000, false, 650000, 700,
+					       needs_resharding, &suggested_shard_count, true);
+  // 650,000 objs across 700 shards -> <1000 objs per shard; 650000 /
+  // 50000 = 13, but bucket min of 51
+  ASSERT_EQ(true, needs_resharding);
+  ASSERT_EQ(51, suggested_shard_count) << "shard reduction and hitting min_layout_shards";
 }
-- 
2.47.3