git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw/pubsub: fix uninitialized num_shards causing topic deletion hang 67798/head
author Oguzhan Ozmen <oozmen@bloomberg.net>
Fri, 13 Mar 2026 21:56:18 +0000 (21:56 +0000)
committer Oguzhan Ozmen <oozmen@bloomberg.net>
Mon, 16 Mar 2026 15:13:40 +0000 (15:13 +0000)
The num_shards member of rgw_pubsub_dest was not included in JSON
serialization (dump/decode_json), causing garbage values when topic
metadata synced between zones. This resulted in topic deletion
iterating millions of times over non-existent shards, blocking
frontend pause during realm reload for extended periods.

Fixes: https://tracker.ceph.com/issues/75466
Signed-off-by: Oguzhan Ozmen <oozmen@bloomberg.net>
src/rgw/rgw_pubsub.cc
src/rgw/rgw_pubsub.h

index 68e6e0b62aaf33b1e69eb09b1f7b96bf50b46e89..2a0993561ae489606cbeb79b48d148e36d2939d7 100644 (file)
@@ -299,6 +299,7 @@ void rgw_pubsub_dest::dump(Formatter *f) const
   encode_json("time_to_live", time_to_live!=DEFAULT_GLOBAL_VALUE? std::to_string(time_to_live): DEFAULT_CONFIG, f);
   encode_json("max_retries", max_retries!=DEFAULT_GLOBAL_VALUE? std::to_string(max_retries): DEFAULT_CONFIG, f);
   encode_json("retry_sleep_duration", retry_sleep_duration!=DEFAULT_GLOBAL_VALUE? std::to_string(retry_sleep_duration): DEFAULT_CONFIG, f);
+  encode_json("num_shards", num_shards, f);
 }
 
 void rgw_pubsub_dest::dump_xml(Formatter *f) const
@@ -358,6 +359,10 @@ void rgw_pubsub_dest::decode_json(JSONObj* f) {
   retry_sleep_duration = sleep_dur == DEFAULT_CONFIG ? DEFAULT_GLOBAL_VALUE
                                                      : std::stoul(sleep_dur);
 
+  // If the JSON blob has no "num_shards" field, num_shards is meant to keep its
+  // member-initializer default of 1, for backward compatibility with pre-sharding
+  // persistent topics (NOTE(review): confirm decode_json() does not reset it when the key is absent). Non-persistent topics don't use "num_shards".
+  JSONDecoder::decode_json("num_shards", num_shards, f);
 }
 
 ShardNamesView rgw_pubsub_dest::get_shard_names() const {
index d724324b2b06412db6aa0f456534bb23220542d4..e2428dde78a0776bfcb5fad41cccc66f85ba9d6f 100644 (file)
@@ -266,7 +266,7 @@ struct rgw_pubsub_dest {
   uint32_t max_retries;
   uint32_t retry_sleep_duration;
   // naming convention of sharded queues in the 'notif' pool -> persistent_queue, persistent_queue.1, persistent_queue.(num_shards -1)...
-  uint64_t num_shards; //defaults to a single shard for now, for backward compatibility
+  uint64_t num_shards = 1; // Default to 1 shard for backward compatibility with pre-sharding persistent topics.
 
 
   void encode(bufferlist& bl) const {
@@ -304,9 +304,6 @@ struct rgw_pubsub_dest {
     if (struct_v >= 5) {
       decode(persistent, bl);
     }
-    else {
-      num_shards = persistent ? 1 : 0; //defaults to a single shard for backward compatibility
-    }
     if (struct_v >= 6) {
       decode(time_to_live, bl);
       decode(max_retries, bl);
@@ -320,7 +317,8 @@ struct rgw_pubsub_dest {
       // continue to use 'arn_topic' alone as the queue's rados object name
       persistent_queue = arn_topic;
     }
-    if (struct_v >= 8) { 
+    if (struct_v >= 8) {
+      // for struct_v < 8, num_shards defaults to 1 (single shard for pre-sharding persistent topics)
       decode(num_shards, bl);
     }