git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw: refactor selected files for better above- vs below-the-line 49760/head
author: Kaleb S. KEITHLEY <kkeithle@redhat.com>
Sat, 21 Jan 2023 17:37:05 +0000 (12:37 -0500)
committer: Kaleb S. KEITHLEY <kkeithle@redhat.com>
Mon, 23 Jan 2023 14:11:27 +0000 (09:11 -0500)
Move more files into driver/rados for better above- vs below-
the-line separation. Use #pragma once everywhere (versus fixing all the
existing include guards).

Signed-off-by: Kaleb S. KEITHLEY <kkeithle@redhat.com>
138 files changed:
src/rgw/CMakeLists.txt
src/rgw/driver/dbstore/common/dbstore.h
src/rgw/driver/dbstore/common/dbstore_log.h
src/rgw/driver/dbstore/sqlite/sqliteDB.h
src/rgw/driver/rados/cls_fifo_legacy.h
src/rgw/driver/rados/rgw_cr_rados.h
src/rgw/driver/rados/rgw_cr_tools.h
src/rgw/driver/rados/rgw_d3n_datacache.h
src/rgw/driver/rados/rgw_data_sync.h
src/rgw/driver/rados/rgw_datalog.h
src/rgw/driver/rados/rgw_etag_verifier.h
src/rgw/driver/rados/rgw_gc.h
src/rgw/driver/rados/rgw_lc_tier.h
src/rgw/driver/rados/rgw_log_backing.h
src/rgw/driver/rados/rgw_metadata.h
src/rgw/driver/rados/rgw_object_expirer_core.h
src/rgw/driver/rados/rgw_otp.h
src/rgw/driver/rados/rgw_pubsub.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_pubsub.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_pubsub_push.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_pubsub_push.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_putobj_processor.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_putobj_processor.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_rados.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_rados.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_reshard.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_reshard.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_rest_bucket.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_rest_bucket.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_rest_log.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_rest_log.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_service.h
src/rgw/driver/rados/rgw_sync.h
src/rgw/driver/rados/rgw_sync_module.h
src/rgw/driver/rados/rgw_sync_module_aws.h
src/rgw/driver/rados/rgw_sync_module_es.h
src/rgw/driver/rados/rgw_sync_module_log.h
src/rgw/driver/rados/rgw_sync_trace.h
src/rgw/driver/rados/rgw_tools.h
src/rgw/driver/rados/rgw_trim_bilog.h
src/rgw/driver/rados/rgw_user.h
src/rgw/driver/rados/rgw_zone.h
src/rgw/rgw_acl.h
src/rgw/rgw_acl_s3.h
src/rgw/rgw_acl_swift.h
src/rgw/rgw_asio_client.h
src/rgw/rgw_asio_frontend.h
src/rgw/rgw_auth.h
src/rgw/rgw_auth_filters.h
src/rgw/rgw_auth_keystone.h
src/rgw/rgw_auth_registry.h
src/rgw/rgw_auth_s3.h
src/rgw/rgw_b64.h
src/rgw/rgw_basic_types.h
src/rgw/rgw_cache.h
src/rgw/rgw_client_io.h
src/rgw/rgw_client_io_filters.h
src/rgw/rgw_compression.h
src/rgw/rgw_coroutine.h
src/rgw/rgw_cors.h
src/rgw/rgw_cors_s3.h
src/rgw/rgw_cors_swift.h
src/rgw/rgw_crypt.h
src/rgw/rgw_crypt_sanitize.h
src/rgw/rgw_d3n_cacherequest.h
src/rgw/rgw_dmclock.h
src/rgw/rgw_dmclock_async_scheduler.h
src/rgw/rgw_dmclock_scheduler.h
src/rgw/rgw_dmclock_scheduler_ctx.h
src/rgw/rgw_dmclock_sync_scheduler.h
src/rgw/rgw_es_query.h
src/rgw/rgw_file.h
src/rgw/rgw_formats.h
src/rgw/rgw_frontend.h
src/rgw/rgw_http_client.h
src/rgw/rgw_http_client_curl.h
src/rgw/rgw_http_errors.h
src/rgw/rgw_iam_policy.h
src/rgw/rgw_iam_policy_keywords.h
src/rgw/rgw_keystone.h
src/rgw/rgw_kmip_client.h
src/rgw/rgw_kmip_client_impl.h
src/rgw/rgw_kms.h
src/rgw/rgw_lc.h
src/rgw/rgw_lc_s3.h
src/rgw/rgw_ldap.h
src/rgw/rgw_lib.h
src/rgw/rgw_lib_frontend.h
src/rgw/rgw_loadgen.h
src/rgw/rgw_log.h
src/rgw/rgw_meta_sync_status.h
src/rgw/rgw_multi.h
src/rgw/rgw_multi_del.h
src/rgw/rgw_object_lock.h
src/rgw/rgw_oidc_provider.h
src/rgw/rgw_op.h
src/rgw/rgw_opa.h
src/rgw/rgw_os_lib.h
src/rgw/rgw_period_history.h
src/rgw/rgw_period_puller.h
src/rgw/rgw_period_pusher.h
src/rgw/rgw_policy_s3.h
src/rgw/rgw_process.h
src/rgw/rgw_pubsub.cc [deleted file]
src/rgw/rgw_pubsub.h [deleted file]
src/rgw/rgw_pubsub_push.cc [deleted file]
src/rgw/rgw_pubsub_push.h [deleted file]
src/rgw/rgw_putobj_processor.cc [deleted file]
src/rgw/rgw_putobj_processor.h [deleted file]
src/rgw/rgw_quota.h
src/rgw/rgw_rados.cc [deleted file]
src/rgw/rgw_rados.h [deleted file]
src/rgw/rgw_realm_reloader.h
src/rgw/rgw_realm_watcher.h
src/rgw/rgw_request.h
src/rgw/rgw_reshard.cc [deleted file]
src/rgw/rgw_reshard.h [deleted file]
src/rgw/rgw_resolve.h
src/rgw/rgw_rest_bucket.cc [deleted file]
src/rgw/rgw_rest_bucket.h [deleted file]
src/rgw/rgw_rest_log.cc [deleted file]
src/rgw/rgw_rest_log.h [deleted file]
src/rgw/rgw_role.h
src/rgw/rgw_s3select.h
src/rgw/rgw_string.h
src/rgw/rgw_sts.h
src/rgw/rgw_swift_auth.h
src/rgw/rgw_tag.h
src/rgw/rgw_tag_s3.h
src/rgw/rgw_tar.h
src/rgw/rgw_token.h
src/rgw/rgw_torrent.h
src/rgw/rgw_usage.h
src/rgw/rgw_web_idp.h
src/rgw/rgw_website.h
src/rgw/rgw_xml.h
src/rgw/services/svc_bucket_sobj.h
src/rgw/services/svc_user_rados.h

index 7b13abfb57735c1d3c6084ad37c0853e9928af77..3f792bf8e069b5966ae3ccbc3a4f98a69f14dd56 100644 (file)
@@ -84,31 +84,24 @@ set(librgw_common_srcs
   rgw_multipart_meta_filter.cc
   rgw_obj_manifest.cc
   rgw_period.cc
-  rgw_pubsub.cc
   rgw_realm.cc
   rgw_sync.cc
   rgw_sync_policy.cc
-  rgw_pubsub_push.cc
   rgw_notify_event_type.cc
   rgw_period_history.cc
   rgw_period_puller.cc
-  rgw_reshard.cc
   rgw_coroutine.cc
   rgw_cr_rest.cc
   rgw_op.cc
   rgw_policy_s3.cc
   rgw_public_access.cc
   rgw_putobj.cc
-  rgw_putobj_processor.cc
   rgw_quota.cc
-  rgw_rados.cc
   rgw_resolve.cc
   rgw_rest.cc
-  rgw_rest_bucket.cc
   rgw_rest_client.cc
   rgw_rest_config.cc
   rgw_rest_conn.cc
-  rgw_rest_log.cc
   rgw_rest_metadata.cc
   rgw_rest_ratelimit.cc
   rgw_rest_role.cc
@@ -168,6 +161,13 @@ set(librgw_common_srcs
   driver/rados/rgw_object_expirer_core.cc
   driver/rados/rgw_otp.cc
   driver/rados/rgw_period.cc
+  driver/rados/rgw_pubsub.cc
+  driver/rados/rgw_pubsub_push.cc
+  driver/rados/rgw_putobj_processor.cc
+  driver/rados/rgw_rados.cc
+  driver/rados/rgw_reshard.cc
+  driver/rados/rgw_rest_bucket.cc
+  driver/rados/rgw_rest_log.cc
   driver/rados/rgw_rest_pubsub.cc
   driver/rados/rgw_rest_realm.cc
   driver/rados/rgw_rest_user.cc
@@ -345,20 +345,20 @@ set(rgw_a_srcs
   rgw_process.cc
   rgw_realm_reloader.cc
   rgw_realm_watcher.cc
-  rgw_rest_bucket.cc
   rgw_rest_config.cc
   rgw_rest_info.cc
-  rgw_rest_log.cc
   rgw_rest_metadata.cc
   rgw_rest_ratelimit.cc
-  driver/rados/rgw_rest_realm.cc
   rgw_rest_sts.cc
   rgw_rest_swift.cc
   rgw_rest_usage.cc
   rgw_signal.cc
   rgw_swift_auth.cc
   rgw_usage.cc
-  rgw_sts.cc)
+  rgw_sts.cc
+  driver/rados/rgw_rest_bucket.cc
+  driver/rados/rgw_rest_log.cc
+  driver/rados/rgw_rest_realm.cc)
 
 gperf_generate(${CMAKE_SOURCE_DIR}/src/rgw/rgw_iam_policy_keywords.gperf
   rgw_iam_policy_keywords.frag.cc)
index 41aae8d8dfcae8b3491a283a719f3d1611c82d62..b4a7ca057b388908955523a3616f5010ba3f3928 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
-#ifndef DB_STORE_H
-#define DB_STORE_H
+#pragma once
 
 #include <errno.h>
 #include <stdlib.h>
@@ -2016,5 +2015,3 @@ struct db_get_obj_data {
 };
 
 } } // namespace rgw::store
-
-#endif
index 8d981d5adc41980d757677b47f1b9212f12aba90..416508369ef96a8d8106d521e5110fe82234809f 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
-#ifndef DB_STORE_LOG_H
-#define DB_STORE_LOG_H
+#pragma once
 
 #include <cerrno>
 #include <cstdlib>
@@ -14,5 +13,3 @@
 
 #undef dout_prefix
 #define dout_prefix *_dout << "rgw dbstore: "
-
-#endif
index 4f651448a994808788e510bf88b899f4de8478ec..ec0ef2bb28269b2848f5a0b61786a62818cb40d7 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
-#ifndef SQLITE_DB_H
-#define SQLITE_DB_H
+#pragma once
 
 #include <errno.h>
 #include <stdlib.h>
@@ -550,5 +549,3 @@ class SQLGetLCHead : public SQLiteDB, public GetLCHeadOp {
     int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
     int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
 };
-
-#endif
index 3ea54082d8877f32170cb1ca4d71b80210f2ba43..b0a68157e619582bc0e7caa720708a4966400570 100644 (file)
@@ -13,8 +13,7 @@
  *
  */
 
-#ifndef CEPH_RGW_CLS_FIFO_LEGACY_H
-#define CEPH_RGW_CLS_FIFO_LEGACY_H
+#pragma once
 
 #include <cstdint>
 #include <deque>
@@ -333,5 +332,3 @@ public:
 };
 
 }
-
-#endif // CEPH_RGW_CLS_FIFO_LEGACY_H
index 3451376ee6eaa752d7047b8185bef5b7064e3470..f17061b37ea29b1f583df1e1112b97189966cc1f 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_CR_RADOS_H
-#define CEPH_RGW_CR_RADOS_H
+#pragma once
 
 #include <boost/intrusive_ptr.hpp>
 #include "include/ceph_assert.h"
@@ -1640,4 +1639,3 @@ public:
   int operate(const DoutPrefixProvider* dpp) override;
 };
 
-#endif
index ebdbfeb51b7f2b193f680c8e45377d9aeec75056..4cd97aa82f5186e219d81301375c5355de14feb1 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_CR_TOOLS_H
-#define CEPH_RGW_CR_TOOLS_H
+#pragma once
 
 #include "rgw_cr_rados.h"
 #include "rgw_tools.h"
@@ -84,4 +83,3 @@ struct rgw_bucket_get_sync_policy_result {
 
 using RGWBucketGetSyncPolicyHandlerCR = RGWSimpleAsyncCR<rgw_bucket_get_sync_policy_params, rgw_bucket_get_sync_policy_result>;
 
-#endif
index 5d3537f3b148479533835960867f193c2e3ab03e..98e61b63fe303433f5076ffd3b37a0531a79de86 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGWD3NDATACACHE_H
-#define CEPH_RGWD3NDATACACHE_H
+#pragma once
 
 #include "rgw_rados.h"
 #include <curl/curl.h>
@@ -258,4 +257,3 @@ int D3nRGWDataCache<T>::get_obj_iterate_cb(const DoutPrefixProvider *dpp, const
   return 0;
 }
 
-#endif
index ccaa20884eb561a8328fef36553a6c5ecc01bcc8..9059bd14cce7aed8f344db6fa049556bcae07e9a 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_DATA_SYNC_H
-#define CEPH_RGW_DATA_SYNC_H
+#pragma once
 
 #include <fmt/format.h>
 #include <fmt/ostream.h>
@@ -867,5 +866,3 @@ public:
   bool supports_data_export() override { return false; }
   int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
 };
-
-#endif
index 695485bf2e41cb3eb785f0a46571c82226309070..1c9a00c1fffb956fe58d60874d4c2ffbb97f5be6 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_DATALOG_H
-#define CEPH_RGW_DATALOG_H
+#pragma once
 
 #include <cstdint>
 #include <list>
@@ -379,6 +378,3 @@ public:
   // 1 on empty, 0 on non-empty, negative on error.
   virtual int is_empty(const DoutPrefixProvider *dpp) = 0;
 };
-
-
-#endif
index 56a679ebddd31d1a15a31e0991590a379c665e78..18a4f5a3fb6356bdfa03f5e3cca17fd38b14c0c9 100644 (file)
@@ -12,8 +12,8 @@
  * same algorithm used at the source cluster i.e. MD5 sum of the individual ETag
  * on the MPU parts.
  */
-#ifndef CEPH_RGW_ETAG_VERIFIER_H
-#define CEPH_RGW_ETAG_VERIFIER_H
+
+#pragma once
 
 #include "rgw_putobj.h"
 #include "rgw_op.h"
@@ -88,5 +88,3 @@ int create_etag_verifier(const DoutPrefixProvider *dpp,
                          etag_verifier_ptr& verifier);
 
 } // namespace rgw::putobj
-
-#endif /* CEPH_RGW_ETAG_VERIFIER_H */
index 196f2802c1649a2c4d84232695e31147f172bb16..f3df64099a1cdf8c1e6f4d8c7ab07b97af244dee 100644 (file)
@@ -1,9 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_GC_H
-#define CEPH_RGW_GC_H
-
+#pragma once
 
 #include "include/types.h"
 #include "include/rados/librados.hpp"
@@ -82,6 +80,3 @@ public:
   std::ostream& gen_prefix(std::ostream& out) const;
 
 };
-
-
-#endif
index 1b21f262092af8373d30742fdfcaddd856a33d68..729c4c304cd62ba06d7e4840b589cd60e82dc7f4 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_LC_TIER_H
-#define CEPH_RGW_LC_TIER_H
+#pragma once
 
 #include "rgw_lc.h"
 #include "rgw_rest_conn.h"
@@ -50,5 +49,3 @@ struct RGWLCCloudTierCtx {
 
 /* Transition object to cloud endpoint */
 int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets);
-
-#endif
index a431574c46fa89233fad73d4e4fba227dcddecd1..3dfdb8ee4ef11b342a88658bb80b472cd27edc6b 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_LOGBACKING_H
-#define CEPH_RGW_LOGBACKING_H
+#pragma once
 
 #include <optional>
 #include <iostream>
@@ -393,5 +392,3 @@ public:
     return 0;
   }
 };
-
-#endif
index 72283702e7e138baf964ecca5757a3adb9405cba..c83db7c40437bf041eced51fc0de057c75bc6a87 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_METADATA_H
-#define CEPH_RGW_METADATA_H
+#pragma once
 
 #include <string>
 #include <utility>
@@ -297,4 +296,3 @@ void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::s
 void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name);
 void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name);
 
-#endif
index fccd4199e7ea93742fba07f1ebf2dc04ebbabeb5..be63815c19edc44dd631098c25f18248cd7f60ca 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_OBJEXP_H
-#define CEPH_OBJEXP_H
+#pragma once
 
 #include <atomic>
 #include <string>
@@ -145,4 +144,3 @@ public:
   void start_processor();
   void stop_processor();
 };
-#endif /* CEPH_OBJEXP_H */
index eacff15314c21d2fb9c31bcafe4392ee5a7c4f50..885e8abb8e1d74d0f93fba05734f0120d33abddc 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_OTP_H
-#define CEPH_RGW_OTP_H
+#pragma once
 
 #include "rgw_sal_fwd.h"
 #include "cls/otp/cls_otp_types.h"
@@ -109,6 +108,3 @@ public:
                  const rgw_user& user, optional_yield y,
                  const RemoveParams& params = {});
 };
-
-#endif
-
diff --git a/src/rgw/driver/rados/rgw_pubsub.cc b/src/rgw/driver/rados/rgw_pubsub.cc
new file mode 100644 (file)
index 0000000..b9aa54b
--- /dev/null
@@ -0,0 +1,723 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "services/svc_zone.h"
+#include "rgw_b64.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "rgw_pubsub.h"
+#include "rgw_tools.h"
+#include "rgw_xml.h"
+#include "rgw_arn.h"
+#include "rgw_pubsub_push.h"
+#include <regex>
+#include <algorithm>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+void set_event_id(std::string& id, const std::string& hash, const utime_t& ts) {
+  char buf[64];
+  const auto len = snprintf(buf, sizeof(buf), "%010ld.%06ld.%s", (long)ts.sec(), (long)ts.usec(), hash.c_str());
+  if (len > 0) {
+    id.assign(buf, len);
+  }
+}
+
+bool rgw_s3_key_filter::decode_xml(XMLObj* obj) {
+  XMLObjIter iter = obj->find("FilterRule");
+  XMLObj *o;
+
+  const auto throw_if_missing = true;
+  auto prefix_not_set = true;
+  auto suffix_not_set = true;
+  auto regex_not_set = true;
+  std::string name;
+
+  while ((o = iter.get_next())) {
+    RGWXMLDecoder::decode_xml("Name", name, o, throw_if_missing);
+    if (name == "prefix" && prefix_not_set) {
+        prefix_not_set = false;
+        RGWXMLDecoder::decode_xml("Value", prefix_rule, o, throw_if_missing);
+    } else if (name == "suffix" && suffix_not_set) {
+        suffix_not_set = false;
+        RGWXMLDecoder::decode_xml("Value", suffix_rule, o, throw_if_missing);
+    } else if (name == "regex" && regex_not_set) {
+        regex_not_set = false;
+        RGWXMLDecoder::decode_xml("Value", regex_rule, o, throw_if_missing);
+    } else {
+        throw RGWXMLDecoder::err("invalid/duplicate S3Key filter rule name: '" + name + "'");
+    }
+  }
+  return true;
+}
+
+void rgw_s3_key_filter::dump_xml(Formatter *f) const {
+  if (!prefix_rule.empty()) {
+    f->open_object_section("FilterRule");
+    ::encode_xml("Name", "prefix", f);
+    ::encode_xml("Value", prefix_rule, f);
+    f->close_section();
+  }
+  if (!suffix_rule.empty()) {
+    f->open_object_section("FilterRule");
+    ::encode_xml("Name", "suffix", f);
+    ::encode_xml("Value", suffix_rule, f);
+    f->close_section();
+  }
+  if (!regex_rule.empty()) {
+    f->open_object_section("FilterRule");
+    ::encode_xml("Name", "regex", f);
+    ::encode_xml("Value", regex_rule, f);
+    f->close_section();
+  }
+}
+
+bool rgw_s3_key_filter::has_content() const {
+    return !(prefix_rule.empty() && suffix_rule.empty() && regex_rule.empty());
+}
+
+bool rgw_s3_key_value_filter::decode_xml(XMLObj* obj) {
+  kv.clear();
+  XMLObjIter iter = obj->find("FilterRule");
+  XMLObj *o;
+
+  const auto throw_if_missing = true;
+
+  std::string key;
+  std::string value;
+
+  while ((o = iter.get_next())) {
+    RGWXMLDecoder::decode_xml("Name", key, o, throw_if_missing);
+    RGWXMLDecoder::decode_xml("Value", value, o, throw_if_missing);
+    kv.emplace(key, value);
+  }
+  return true;
+}
+
+void rgw_s3_key_value_filter::dump_xml(Formatter *f) const {
+  for (const auto& key_value : kv) {
+    f->open_object_section("FilterRule");
+    ::encode_xml("Name", key_value.first, f);
+    ::encode_xml("Value", key_value.second, f);
+    f->close_section();
+  }
+}
+
+bool rgw_s3_key_value_filter::has_content() const {
+    return !kv.empty();
+}
+
+bool rgw_s3_filter::decode_xml(XMLObj* obj) {
+    RGWXMLDecoder::decode_xml("S3Key", key_filter, obj);
+    RGWXMLDecoder::decode_xml("S3Metadata", metadata_filter, obj);
+    RGWXMLDecoder::decode_xml("S3Tags", tag_filter, obj);
+  return true;
+}
+
+void rgw_s3_filter::dump_xml(Formatter *f) const {
+  if (key_filter.has_content()) {
+      ::encode_xml("S3Key", key_filter, f);
+  }
+  if (metadata_filter.has_content()) {
+      ::encode_xml("S3Metadata", metadata_filter, f);
+  }
+  if (tag_filter.has_content()) {
+      ::encode_xml("S3Tags", tag_filter, f);
+  }
+}
+
+bool rgw_s3_filter::has_content() const {
+    return key_filter.has_content()  ||
+           metadata_filter.has_content() ||
+           tag_filter.has_content();
+}
+
+bool match(const rgw_s3_key_filter& filter, const std::string& key) {
+  const auto key_size = key.size();
+  const auto prefix_size = filter.prefix_rule.size();
+  if (prefix_size != 0) {
+    // prefix rule exists
+    if (prefix_size > key_size) {
+      // if prefix is longer than key, we fail
+      return false;
+    }
+    if (!std::equal(filter.prefix_rule.begin(), filter.prefix_rule.end(), key.begin())) {
+        return false;
+    }
+  }
+  const auto suffix_size = filter.suffix_rule.size();
+  if (suffix_size != 0) {
+    // suffix rule exists
+    if (suffix_size > key_size) {
+      // if suffix is longer than key, we fail
+      return false;
+    }
+    if (!std::equal(filter.suffix_rule.begin(), filter.suffix_rule.end(), (key.end() - suffix_size))) {
+        return false;
+    }
+  }
+  if (!filter.regex_rule.empty()) {
+    // TODO add regex caching in the filter
+    const std::regex base_regex(filter.regex_rule);
+    if (!std::regex_match(key, base_regex)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv) {
+  // all filter pairs must exist with the same value in the object's metadata/tags
+  // object metadata/tags may include items not in the filter
+  return std::includes(kv.begin(), kv.end(), filter.kv.begin(), filter.kv.end());
+}
+
+bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv) {
+  // all filter pairs must exist with the same value in the object's metadata/tags
+  // object metadata/tags may include items not in the filter
+  for (auto& filter : filter.kv) {
+    auto result = kv.equal_range(filter.first);
+    if (std::any_of(result.first, result.second, [&filter](const pair<string,string>& p) { return p.second == filter.second;}))
+      continue;
+    else
+      return false;
+  }
+  return true;
+}
+
+bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event) {
+  // if event list exists, and none of the events in the list matches the event type, filter the message
+  if (!events.empty() && std::find(events.begin(), events.end(), event) == events.end()) {
+    return false;
+  }
+  return true;
+}
+
+void do_decode_xml_obj(rgw::notify::EventTypeList& l, const string& name, XMLObj *obj) {
+  l.clear();
+
+  XMLObjIter iter = obj->find(name);
+  XMLObj *o;
+
+  while ((o = iter.get_next())) {
+    std::string val;
+    decode_xml_obj(val, o);
+    l.push_back(rgw::notify::from_string(val));
+  }
+}
+
+bool rgw_pubsub_s3_notification::decode_xml(XMLObj *obj) {
+  const auto throw_if_missing = true;
+  RGWXMLDecoder::decode_xml("Id", id, obj, throw_if_missing);
+  
+  RGWXMLDecoder::decode_xml("Topic", topic_arn, obj, throw_if_missing);
+  
+  RGWXMLDecoder::decode_xml("Filter", filter, obj);
+
+  do_decode_xml_obj(events, "Event", obj);
+  if (events.empty()) {
+    // if no events are provided, we assume all events
+    events.push_back(rgw::notify::ObjectCreated);
+    events.push_back(rgw::notify::ObjectRemoved);
+  }
+  return true;
+}
+
+void rgw_pubsub_s3_notification::dump_xml(Formatter *f) const {
+  ::encode_xml("Id", id, f);
+  ::encode_xml("Topic", topic_arn.c_str(), f);
+  if (filter.has_content()) {
+      ::encode_xml("Filter", filter, f);
+  }
+  for (const auto& event : events) {
+    ::encode_xml("Event", rgw::notify::to_string(event), f);
+  }
+}
+
+bool rgw_pubsub_s3_notifications::decode_xml(XMLObj *obj) {
+  do_decode_xml_obj(list, "TopicConfiguration", obj);
+  return true;
+}
+
+rgw_pubsub_s3_notification::rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter) :
+    id(topic_filter.s3_id), events(topic_filter.events), topic_arn(topic_filter.topic.arn), filter(topic_filter.s3_filter) {} 
+
+void rgw_pubsub_s3_notifications::dump_xml(Formatter *f) const {
+  do_encode_xml("NotificationConfiguration", list, "TopicConfiguration", f);
+}
+
+void rgw_pubsub_s3_event::dump(Formatter *f) const {
+  encode_json("eventVersion", eventVersion, f);
+  encode_json("eventSource", eventSource, f);
+  encode_json("awsRegion", awsRegion, f);
+  utime_t ut(eventTime);
+  encode_json("eventTime", ut, f);
+  encode_json("eventName", eventName, f);
+  {
+    Formatter::ObjectSection s(*f, "userIdentity");
+    encode_json("principalId", userIdentity, f);
+  }
+  {
+    Formatter::ObjectSection s(*f, "requestParameters");
+    encode_json("sourceIPAddress", sourceIPAddress, f);
+  }
+  {
+    Formatter::ObjectSection s(*f, "responseElements");
+    encode_json("x-amz-request-id", x_amz_request_id, f);
+    encode_json("x-amz-id-2", x_amz_id_2, f);
+  }
+  {
+    Formatter::ObjectSection s(*f, "s3");
+    encode_json("s3SchemaVersion", s3SchemaVersion, f);
+    encode_json("configurationId", configurationId, f);
+    {
+        Formatter::ObjectSection sub_s(*f, "bucket");
+        encode_json("name", bucket_name, f);
+        {
+            Formatter::ObjectSection sub_sub_s(*f, "ownerIdentity");
+            encode_json("principalId", bucket_ownerIdentity, f);
+        }
+        encode_json("arn", bucket_arn, f);
+        encode_json("id", bucket_id, f);
+    }
+    {
+        Formatter::ObjectSection sub_s(*f, "object");
+        encode_json("key", object_key, f);
+        encode_json("size", object_size, f);
+        encode_json("eTag", object_etag, f);
+        encode_json("versionId", object_versionId, f);
+        encode_json("sequencer", object_sequencer, f);
+        encode_json("metadata", x_meta_map, f);
+        encode_json("tags", tags, f);
+    }
+  }
+  encode_json("eventId", id, f);
+  encode_json("opaqueData", opaque_data, f);
+}
+
+void rgw_pubsub_topic::dump(Formatter *f) const
+{
+  encode_json("user", user, f);
+  encode_json("name", name, f);
+  encode_json("dest", dest, f);
+  encode_json("arn", arn, f);
+  encode_json("opaqueData", opaque_data, f);
+}
+
+void rgw_pubsub_topic::dump_xml(Formatter *f) const
+{
+  encode_xml("User", user, f);
+  encode_xml("Name", name, f);
+  encode_xml("EndPoint", dest, f);
+  encode_xml("TopicArn", arn, f);
+  encode_xml("OpaqueData", opaque_data, f);
+}
+
+void encode_xml_key_value_entry(const std::string& key, const std::string& value, Formatter *f) {
+  f->open_object_section("entry");
+  encode_xml("key", key, f);
+  encode_xml("value", value, f);
+  f->close_section(); // entry
+}
+
+void rgw_pubsub_topic::dump_xml_as_attributes(Formatter *f) const
+{
+  f->open_array_section("Attributes");
+  std::string str_user;
+  user.to_str(str_user);
+  encode_xml_key_value_entry("User", str_user, f);
+  encode_xml_key_value_entry("Name", name, f);
+  encode_xml_key_value_entry("EndPoint", dest.to_json_str(), f);
+  encode_xml_key_value_entry("TopicArn", arn, f);
+  encode_xml_key_value_entry("OpaqueData", opaque_data, f);
+  f->close_section(); // Attributes
+}
+
+void encode_json(const char *name, const rgw::notify::EventTypeList& l, Formatter *f)
+{
+  f->open_array_section(name);
+  for (auto iter = l.cbegin(); iter != l.cend(); ++iter) {
+    f->dump_string("obj", rgw::notify::to_string(*iter));
+  }
+  f->close_section();
+}
+
+void rgw_pubsub_topic_filter::dump(Formatter *f) const
+{
+  encode_json("topic", topic, f);
+  encode_json("events", events, f);
+}
+
+void rgw_pubsub_topic_subs::dump(Formatter *f) const
+{
+  encode_json("topic", topic, f);
+  encode_json("subs", subs, f);
+}
+
+void rgw_pubsub_bucket_topics::dump(Formatter *f) const
+{
+  Formatter::ArraySection s(*f, "topics");
+  for (auto& t : topics) {
+    encode_json(t.first.c_str(), t.second, f);
+  }
+}
+
+void rgw_pubsub_topics::dump(Formatter *f) const
+{
+  Formatter::ArraySection s(*f, "topics");
+  for (auto& t : topics) {
+    encode_json(t.first.c_str(), t.second, f);
+  }
+}
+
+void rgw_pubsub_topics::dump_xml(Formatter *f) const
+{
+  for (auto& t : topics) {
+    encode_xml("member", t.second.topic, f);
+  }
+}
+
+void rgw_pubsub_sub_dest::dump(Formatter *f) const
+{
+  encode_json("bucket_name", bucket_name, f);
+  encode_json("oid_prefix", oid_prefix, f);
+  encode_json("push_endpoint", push_endpoint, f);
+  encode_json("push_endpoint_args", push_endpoint_args, f);
+  encode_json("push_endpoint_topic", arn_topic, f);
+  encode_json("stored_secret", stored_secret, f);
+  encode_json("persistent", persistent, f);
+}
+
+void rgw_pubsub_sub_dest::dump_xml(Formatter *f) const
+{
+  // first 2 members are omitted here since they
+  // don't apply to AWS-compliant topics
+  encode_xml("EndpointAddress", push_endpoint, f);
+  encode_xml("EndpointArgs", push_endpoint_args, f);
+  encode_xml("EndpointTopic", arn_topic, f);
+  encode_xml("HasStoredSecret", stored_secret, f);
+  encode_xml("Persistent", persistent, f);
+}
+
+std::string rgw_pubsub_sub_dest::to_json_str() const
+{
+  // first 2 members are omitted here since they
+  // don't apply to AWS-compliant topics
+  JSONFormatter f;
+  f.open_object_section("");
+  encode_json("EndpointAddress", push_endpoint, &f);
+  encode_json("EndpointArgs", push_endpoint_args, &f);
+  encode_json("EndpointTopic", arn_topic, &f);
+  encode_json("HasStoredSecret", stored_secret, &f);
+  encode_json("Persistent", persistent, &f);
+  f.close_section();
+  std::stringstream ss;
+  f.flush(ss);
+  return ss.str();
+}
+
+void rgw_pubsub_sub_config::dump(Formatter *f) const
+{
+  encode_json("user", user, f);
+  encode_json("name", name, f);
+  encode_json("topic", topic, f);
+  encode_json("dest", dest, f);
+  encode_json("s3_id", s3_id, f);
+}
+
+RGWPubSub::RGWPubSub(rgw::sal::RadosStore* _store, const std::string& _tenant)
+  : store(_store), tenant(_tenant), svc_sysobj(store->svc()->sysobj)
+{
+  get_meta_obj(&meta_obj);
+}
+
+int RGWPubSub::remove(const DoutPrefixProvider *dpp, 
+                          const rgw_raw_obj& obj,
+                         RGWObjVersionTracker *objv_tracker,
+                         optional_yield y)
+{
+  int ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, obj.pool, obj.oid, objv_tracker, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return 0;
+}
+
+int RGWPubSub::read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker *objv_tracker)
+{
+  int ret = read(meta_obj, result, objv_tracker);
+  if (ret < 0) {
+    ldout(store->ctx(), 10) << "WARNING: failed to read topics info: ret=" << ret << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+int RGWPubSub::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics,
+                                    RGWObjVersionTracker *objv_tracker, optional_yield y)
+{
+  int ret = write(dpp, meta_obj, topics, objv_tracker, y);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+int RGWPubSub::get_topics(rgw_pubsub_topics *result)
+{
+  return read_topics(result, nullptr);
+}
+
+int RGWPubSub::Bucket::read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker *objv_tracker)
+{
+  int ret = ps->read(bucket_meta_obj, result, objv_tracker);
+  if (ret < 0 && ret != -ENOENT) {
+    ldout(ps->store->ctx(), 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+int RGWPubSub::Bucket::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics,
+                                       RGWObjVersionTracker *objv_tracker,
+                                       optional_yield y)
+{
+  int ret = ps->write(dpp, bucket_meta_obj, topics, objv_tracker, y);
+  if (ret < 0) {
+    ldout(ps->store->ctx(), 1) << "ERROR: failed to write bucket topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+int RGWPubSub::Bucket::get_topics(rgw_pubsub_bucket_topics *result)
+{
+  return read_topics(result, nullptr);
+}
+
+int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic_subs *result)
+{
+  rgw_pubsub_topics topics;
+  int ret = get_topics(&topics);
+  if (ret < 0) {
+    ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  auto iter = topics.topics.find(name);
+  if (iter == topics.topics.end()) {
+    ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl;
+    return -ENOENT;
+  }
+
+  *result = iter->second;
+  return 0;
+}
+
+int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic *result)
+{
+  rgw_pubsub_topics topics;
+  int ret = get_topics(&topics);
+  if (ret < 0) {
+    ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  auto iter = topics.topics.find(name);
+  if (iter == topics.topics.end()) {
+    ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl;
+    return -ENOENT;
+  }
+
+  *result = iter->second.topic;
+  return 0;
+}
+
+// convenience overload: create a notification with no S3 filter and no
+// notification name (non-S3-compatible path)
+int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y) {
+  return create_notification(dpp, topic_name, events, std::nullopt, "", y);
+}
+
+// attach a topic + filter to the bucket. fails with the topic lookup error
+// (-ENOENT) if the topic does not exist at the tenant level.
+int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name,const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y) {
+  rgw_pubsub_topic_subs topic_info;
+
+  // verify the topic exists before associating it with the bucket
+  int ret = ps->get_topic(topic_name, &topic_info);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read topic '" << topic_name << "' info: ret=" << ret << dendl;
+    return ret;
+  }
+  ldpp_dout(dpp, 20) << "successfully read topic '" << topic_name << "' info" << dendl;
+
+  // version tracker makes the read-modify-write below atomic
+  RGWObjVersionTracker objv_tracker;
+  rgw_pubsub_bucket_topics bucket_topics;
+
+  ret = read_topics(&bucket_topics, &objv_tracker);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read topics from bucket '" << 
+      bucket.name << "': ret=" << ret << dendl;
+    return ret;
+  }
+  ldpp_dout(dpp, 20) << "successfully read " << bucket_topics.topics.size() << " topics from bucket '" << 
+    bucket.name << "'" << dendl;
+
+  // upsert: operator[] creates the entry if the topic is new on this bucket,
+  // otherwise the existing filter/event list is overwritten
+  auto& topic_filter = bucket_topics.topics[topic_name];
+  topic_filter.topic = topic_info.topic;
+  topic_filter.events = events;
+  topic_filter.s3_id = notif_name;
+  if (s3_filter) {
+    topic_filter.s3_filter = *s3_filter;
+  }
+
+  ret = write_topics(dpp, bucket_topics, &objv_tracker, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to write topics to bucket '" << bucket.name << "': ret=" << ret << dendl;
+    return ret;
+  }
+    
+  ldpp_dout(dpp, 20) << "successfully wrote " << bucket_topics.topics.size() << " topics to bucket '" << bucket.name << "'" << dendl;
+
+  return 0;
+}
+
+// detach a topic from the bucket. if this was the last topic the bucket's
+// notification meta object is deleted entirely instead of written back empty.
+int RGWPubSub::Bucket::remove_notification(const DoutPrefixProvider *dpp, const string& topic_name, optional_yield y)
+{
+  rgw_pubsub_topic_subs topic_info;
+
+  // fails with -ENOENT if the topic does not exist at the tenant level
+  int ret = ps->get_topic(topic_name, &topic_info);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read topic info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  // version tracker makes the read-modify-write below atomic
+  RGWObjVersionTracker objv_tracker;
+  rgw_pubsub_bucket_topics bucket_topics;
+
+  ret = read_topics(&bucket_topics, &objv_tracker);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  // erase is a no-op if the topic was not attached to this bucket
+  bucket_topics.topics.erase(topic_name);
+
+  if (bucket_topics.topics.empty()) {
+    // no more topics - delete the notification object of the bucket
+    ret = ps->remove(dpp, bucket_meta_obj, &objv_tracker, y);
+    if (ret < 0 && ret != -ENOENT) {
+      ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl;
+      return ret;
+    }
+    return 0;
+  }
+
+  // write back the notifications without the deleted one
+  ret = write_topics(dpp, bucket_topics, &objv_tracker, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// remove every notification from the bucket: best-effort delete of the
+// associated (auto-generated) topics, then delete the bucket's meta object.
+// -ENOENT at any step is treated as already-removed (success).
+int RGWPubSub::Bucket::remove_notifications(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  // get all topics on a bucket
+  rgw_pubsub_bucket_topics bucket_topics;
+  auto ret  = get_topics(&bucket_topics);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to get list of topics from bucket '" << bucket.name << "', ret=" << ret << dendl;
+    return ret ;
+  }
+
+  // remove all auto-generated topics
+  // NOTE(review): this removes the tenant-level topic for every attached
+  // notification; failures are logged as warnings and do not abort the loop
+  for (const auto& topic : bucket_topics.topics) {
+    const auto& topic_name = topic.first;
+    ret = ps->remove_topic(dpp, topic_name, y);
+    if (ret < 0 && ret != -ENOENT) {
+      ldpp_dout(dpp, 5) << "WARNING: failed to remove auto-generated topic '" << topic_name << "', ret=" << ret << dendl;
+    }
+  }
+
+  // delete the notification object of the bucket
+  ret = ps->remove(dpp, bucket_meta_obj, nullptr, y);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// convenience overload: create a topic with a name only (empty destination,
+// ARN and opaque data)
+int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y) {
+  return create_topic(dpp, name, rgw_pubsub_sub_dest(), "", "", y);
+}
+
+// create (or overwrite) a tenant-level topic with destination and ARN.
+// read-modify-write under a version tracker for atomicity.
+int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y) {
+  RGWObjVersionTracker objv_tracker;
+  rgw_pubsub_topics topics;
+
+  int ret = read_topics(&topics, &objv_tracker);
+  if (ret < 0 && ret != -ENOENT) {
+    // it's not an error if no topics exist yet, we create the first one
+    ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
+    return ret;
+  }
+  // upsert: operator[] creates the entry if missing, otherwise it is updated
+  rgw_pubsub_topic_subs& new_topic = topics.topics[name];
+  new_topic.topic.user = rgw_user("", tenant);
+  new_topic.topic.name = name;
+  new_topic.topic.dest = dest;
+  new_topic.topic.arn = arn;
+  new_topic.topic.opaque_data = opaque_data;
+
+  ret = write_topics(dpp, topics, &objv_tracker, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// remove a tenant-level topic by name; a missing topics object (or a missing
+// topic inside it) is a no-op that still returns success.
+int RGWPubSub::remove_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y)
+{
+  RGWObjVersionTracker objv_tracker;
+  rgw_pubsub_topics topics;
+
+  int ret = read_topics(&topics, &objv_tracker);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
+    return ret;
+  } else if (ret == -ENOENT) {
+      // it's not an error if no topics exist, just a no-op
+      ldpp_dout(dpp, 10) << "WARNING: failed to read topics info, deletion is a no-op: ret=" << ret << dendl;
+      return 0;
+  }
+
+  // erase is a no-op if the topic is absent; write back unconditionally
+  topics.topics.erase(name);
+
+  ret = write_topics(dpp, topics, &objv_tracker, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to remove topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// build the raw-object locator for the tenant-level topics object (log pool)
+void RGWPubSub::get_meta_obj(rgw_raw_obj *obj) const {
+  *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, meta_oid());
+}
+
+// build the raw-object locator for a bucket's notification object (log pool)
+void RGWPubSub::get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const {
+  *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, bucket_meta_oid(bucket));
+}
+
+// build the raw-object locator for a subscription's meta object (log pool)
+void RGWPubSub::get_sub_meta_obj(const string& name, rgw_raw_obj *obj) const {
+  *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sub_meta_oid(name));
+}
+
diff --git a/src/rgw/driver/rados/rgw_pubsub.h b/src/rgw/driver/rados/rgw_pubsub.h
new file mode 100644 (file)
index 0000000..08a329e
--- /dev/null
@@ -0,0 +1,713 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "services/svc_sys_obj.h"
+#include "rgw_tools.h"
+#include "rgw_zone.h"
+#include "rgw_notify_event_type.h"
+#include <boost/container/flat_map.hpp>
+
+namespace rgw::sal { class RadosStore; }
+
+class XMLObj;
+
+// S3 notification key filter: prefix/suffix/regex rules matched against
+// object keys. NOTE: encode/decode field order is the wire format - do not
+// reorder; bump the version in ENCODE_START when adding fields.
+struct rgw_s3_key_filter {
+  std::string prefix_rule;
+  std::string suffix_rule;
+  std::string regex_rule;
+
+  // true if at least one rule is set
+  bool has_content() const;
+
+  bool decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+  
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(prefix_rule, bl);
+    encode(suffix_rule, bl);
+    encode(regex_rule, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(prefix_rule, bl);
+    decode(suffix_rule, bl);
+    decode(regex_rule, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_s3_key_filter)
+
+using KeyValueMap = boost::container::flat_map<std::string, std::string>;
+using KeyMultiValueMap = std::multimap<std::string, std::string>;
+
+// key/value filter used for S3 notification metadata filtering
+struct rgw_s3_key_value_filter {
+  KeyValueMap kv;
+  
+  // true if at least one key/value pair is set
+  bool has_content() const;
+  
+  bool decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+  
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(kv, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(kv, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_s3_key_value_filter)
+
+// complete S3 notification filter: key rules + metadata rules + tag rules.
+// wire format v2: tag_filter was added in v2, hence the struct_v guard below.
+struct rgw_s3_filter {
+  rgw_s3_key_filter key_filter;
+  rgw_s3_key_value_filter metadata_filter;
+  rgw_s3_key_value_filter tag_filter;
+
+  // true if any of the sub-filters has content
+  bool has_content() const;
+  
+  bool decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+  
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(key_filter, bl);
+    encode(metadata_filter, bl);
+    encode(tag_filter, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(key_filter, bl);
+    decode(metadata_filter, bl);
+    if (struct_v >= 2) {
+        decode(tag_filter, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_s3_filter)
+
+using OptionalFilter = std::optional<rgw_s3_filter>;
+
+struct rgw_pubsub_topic_filter;
+/* S3 notification configuration
+ * based on: https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTnotification.html
+<NotificationConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+  <TopicConfiguration>
+    <Filter>
+      <S3Key>
+        <FilterRule>
+          <Name>suffix</Name>
+          <Value>jpg</Value>
+        </FilterRule>
+      </S3Key>
+      <S3Metadata>
+        <FilterRule>
+          <Name></Name>
+          <Value></Value>
+        </FilterRule>
+      </S3Metadata>
+      <S3Tags>
+        <FilterRule>
+          <Name></Name>
+          <Value></Value>
+        </FilterRule>
+      </S3Tags>
+    </Filter>
+    <Id>notification1</Id>
+    <Topic>arn:aws:sns:<region>:<account>:<topic></Topic>
+    <Event>s3:ObjectCreated:*</Event>
+    <Event>s3:ObjectRemoved:*</Event>
+  </TopicConfiguration>
+</NotificationConfiguration>
+*/
+// one <TopicConfiguration> entry of an S3 notification configuration
+// (XML-serialized only; not stored in the rados wire format)
+struct rgw_pubsub_s3_notification {
+  // notification id
+  std::string id;
+  // types of events
+  rgw::notify::EventTypeList events;
+  // topic ARN
+  std::string topic_arn;
+  // filter rules
+  rgw_s3_filter filter;
+
+  bool decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+
+  rgw_pubsub_s3_notification() = default;
+  // construct from rgw_pubsub_topic_filter (used by get/list notifications)
+  explicit rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter);
+};
+
+// return true if the key matches the prefix/suffix/regex rules of the key filter
+bool match(const rgw_s3_key_filter& filter, const std::string& key);
+
+// return true if the key matches the metadata rules of the metadata filter
+bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv);
+
+// return true if the key matches the tag rules of the tag filter
+bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv);
+
+// return true if the event type matches (equal or contained in) one of the events in the list
+bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event);
+
+// the full <NotificationConfiguration>: a list of topic configurations
+struct rgw_pubsub_s3_notifications {
+  std::list<rgw_pubsub_s3_notification> list;
+  bool decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+};
+
+/* S3 event records structure
+ * based on: https://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
+{  
+"Records":[  
+  {
+    "eventVersion":""
+    "eventSource":"",
+    "awsRegion":"",
+    "eventTime":"",
+    "eventName":"",
+    "userIdentity":{  
+      "principalId":""
+    },
+    "requestParameters":{
+      "sourceIPAddress":""
+    },
+    "responseElements":{
+      "x-amz-request-id":"",
+      "x-amz-id-2":""
+    },
+    "s3":{
+      "s3SchemaVersion":"1.0",
+      "configurationId":"",
+      "bucket":{
+        "name":"",
+        "ownerIdentity":{
+          "principalId":""
+        },
+        "arn":""
+        "id": ""
+      },
+      "object":{
+        "key":"",
+        "size": ,
+        "eTag":"",
+        "versionId":"",
+        "sequencer": "",
+        "metadata": ""
+        "tags": ""
+      }
+    },
+    "eventId":"",
+  }
+]
+}*/
+
+// a single S3 event record (see the JSON layout documented above).
+// wire format v4: fields added per version are guarded by struct_v in
+// decode(); keep encode/decode order in sync and bump the version when
+// adding fields.
+struct rgw_pubsub_s3_event {
+  constexpr static const char* const json_type_plural = "Records";
+  std::string eventVersion = "2.2";
+  // aws:s3
+  std::string eventSource = "ceph:s3";
+  // zonegroup
+  std::string awsRegion;
+  // time of the request
+  ceph::real_time eventTime;
+  // type of the event
+  std::string eventName;
+  // user that sent the request
+  std::string userIdentity;
+  // IP address of source of the request (not implemented)
+  std::string sourceIPAddress;
+  // request ID (not implemented)
+  std::string x_amz_request_id;
+  // radosgw that received the request
+  std::string x_amz_id_2;
+  std::string s3SchemaVersion = "1.0";
+  // ID received in the notification request
+  std::string configurationId;
+  // bucket name
+  std::string bucket_name;
+  // bucket owner
+  std::string bucket_ownerIdentity;
+  // bucket ARN
+  std::string bucket_arn;
+  // object key
+  std::string object_key;
+  // object size
+  uint64_t object_size = 0;
+  // object etag
+  std::string object_etag;
+  // object version id, if the bucket is versioned
+  std::string object_versionId;
+  // hexadecimal value used to determine event order for specific key
+  std::string object_sequencer;
+  // this is an rgw extension (not S3 standard)
+  // used to store a globally unique identifier of the event
+  // that could be used for acking or any other identification of the event
+  std::string id;
+  // this is an rgw extension holding the internal bucket id
+  std::string bucket_id;
+  // meta data
+  KeyValueMap x_meta_map;
+  // tags
+  KeyMultiValueMap tags;
+  // opaque data received from the topic
+  // could be used to identify the gateway
+  std::string opaque_data;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(4, 1, bl);
+    encode(eventVersion, bl);
+    encode(eventSource, bl);
+    encode(awsRegion, bl);
+    encode(eventTime, bl);
+    encode(eventName, bl);
+    encode(userIdentity, bl);
+    encode(sourceIPAddress, bl);
+    encode(x_amz_request_id, bl);
+    encode(x_amz_id_2, bl);
+    encode(s3SchemaVersion, bl);
+    encode(configurationId, bl);
+    encode(bucket_name, bl);
+    encode(bucket_ownerIdentity, bl);
+    encode(bucket_arn, bl);
+    encode(object_key, bl);
+    encode(object_size, bl);
+    encode(object_etag, bl);
+    encode(object_versionId, bl);
+    encode(object_sequencer, bl);
+    encode(id, bl);
+    encode(bucket_id, bl);
+    encode(x_meta_map, bl);
+    encode(tags, bl);
+    encode(opaque_data, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(4, bl);
+    decode(eventVersion, bl);
+    decode(eventSource, bl);
+    decode(awsRegion, bl);
+    decode(eventTime, bl);
+    decode(eventName, bl);
+    decode(userIdentity, bl);
+    decode(sourceIPAddress, bl);
+    decode(x_amz_request_id, bl);
+    decode(x_amz_id_2, bl);
+    decode(s3SchemaVersion, bl);
+    decode(configurationId, bl);
+    decode(bucket_name, bl);
+    decode(bucket_ownerIdentity, bl);
+    decode(bucket_arn, bl);
+    decode(object_key, bl);
+    decode(object_size, bl);
+    decode(object_etag, bl);
+    decode(object_versionId, bl);
+    decode(object_sequencer, bl);
+    decode(id, bl);
+    // v2 added bucket_id and the metadata map
+    if (struct_v >= 2) {
+      decode(bucket_id, bl);
+      decode(x_meta_map, bl);
+    }
+    // v3 added object tags
+    if (struct_v >= 3) {
+      decode(tags, bl);
+    }
+    // v4 added topic opaque data
+    if (struct_v >= 4) {
+      decode(opaque_data, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_s3_event)
+
+// setting a unique ID for an event based on object hash and timestamp
+void set_event_id(std::string& id, const std::string& hash, const utime_t& ts);
+
+// push destination of a subscription/topic (endpoint URI, args, ARN).
+// wire format v5: each version added one field, guarded in decode().
+struct rgw_pubsub_sub_dest {
+  std::string bucket_name;
+  std::string oid_prefix;
+  std::string push_endpoint;
+  std::string push_endpoint_args;
+  std::string arn_topic;
+  // true if the endpoint URI contains a secret (affects how it is dumped)
+  // NOTE(review): semantics inferred from the name - confirm against callers
+  bool stored_secret = false;
+  bool persistent = false;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(5, 1, bl);
+    encode(bucket_name, bl);
+    encode(oid_prefix, bl);
+    encode(push_endpoint, bl);
+    encode(push_endpoint_args, bl);
+    encode(arn_topic, bl);
+    encode(stored_secret, bl);
+    encode(persistent, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(5, bl);
+    decode(bucket_name, bl);
+    decode(oid_prefix, bl);
+    decode(push_endpoint, bl);
+    if (struct_v >= 2) {
+        decode(push_endpoint_args, bl);
+    }
+    if (struct_v >= 3) {
+        decode(arn_topic, bl);
+    }
+    if (struct_v >= 4) {
+        decode(stored_secret, bl);
+    }
+    if (struct_v >= 5) {
+        decode(persistent, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void dump_xml(Formatter *f) const;
+  std::string to_json_str() const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_sub_dest)
+
+// persisted configuration of a single subscription.
+// wire format v2: s3_id was added in v2.
+struct rgw_pubsub_sub_config {
+  rgw_user user;
+  std::string name;
+  std::string topic;
+  rgw_pubsub_sub_dest dest;
+  std::string s3_id;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(user, bl);
+    encode(name, bl);
+    encode(topic, bl);
+    encode(dest, bl);
+    encode(s3_id, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(user, bl);
+    decode(name, bl);
+    decode(topic, bl);
+    decode(dest, bl);
+    if (struct_v >= 2) {
+      decode(s3_id, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_sub_config)
+
+struct rgw_pubsub_topic {
+  rgw_user user;
+  std::string name;
+  rgw_pubsub_sub_dest dest;
+  std::string arn;
+  std::string opaque_data;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(3, 1, bl);
+    encode(user, bl);
+    encode(name, bl);
+    encode(dest, bl);
+    encode(arn, bl);
+    encode(opaque_data, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(3, bl);
+    decode(user, bl);
+    decode(name, bl);
+    if (struct_v >= 2) {
+      decode(dest, bl);
+      decode(arn, bl);
+    }
+    if (struct_v >= 3) {
+      decode(opaque_data, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  std::string to_str() const {
+    return user.tenant + "/" + name;
+  }
+
+  void dump(Formatter *f) const;
+  void dump_xml(Formatter *f) const;
+  void dump_xml_as_attributes(Formatter *f) const;
+
+  bool operator<(const rgw_pubsub_topic& t) const {
+    return to_str().compare(t.to_str());
+  }
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_topic)
+
+// a topic together with the names of its subscriptions
+struct rgw_pubsub_topic_subs {
+  rgw_pubsub_topic topic;
+  std::set<std::string> subs;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(topic, bl);
+    encode(subs, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(topic, bl);
+    decode(subs, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_topic_subs)
+
+// a topic attached to a bucket, together with the event list and S3 filter
+// that gate its notifications.
+// wire format v3: s3_id added in v2, s3_filter in v3.
+struct rgw_pubsub_topic_filter {
+  rgw_pubsub_topic topic;
+  rgw::notify::EventTypeList events;
+  std::string s3_id;
+  rgw_s3_filter s3_filter;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(3, 1, bl);
+    encode(topic, bl);
+    // events are stored as a vector of std::strings for forward compatibility
+    std::vector<std::string> tmp_events;
+    std::transform(events.begin(), events.end(), std::back_inserter(tmp_events), rgw::notify::to_string);
+    encode(tmp_events, bl);
+    encode(s3_id, bl);
+    encode(s3_filter, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(3, bl);
+    decode(topic, bl);
+    // events are stored as a vector of std::strings
+    events.clear();
+    std::vector<std::string> tmp_events;
+    decode(tmp_events, bl);
+    std::transform(tmp_events.begin(), tmp_events.end(), std::back_inserter(events), rgw::notify::from_string);
+    if (struct_v >= 2) {
+      decode(s3_id, bl);
+    }
+    if (struct_v >= 3) {
+      decode(s3_filter, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_topic_filter)
+
+// per-bucket map: topic name -> topic + filter (the bucket meta object payload)
+struct rgw_pubsub_bucket_topics {
+  std::map<std::string, rgw_pubsub_topic_filter> topics;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(topics, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(topics, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_bucket_topics)
+
+// tenant-level map: topic name -> topic + subscriptions (the tenant meta
+// object payload)
+struct rgw_pubsub_topics {
+  std::map<std::string, rgw_pubsub_topic_subs> topics;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(topics, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(topics, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void dump_xml(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_topics)
+
+static std::string pubsub_oid_prefix = "pubsub.";
+
+// front-end for the rados-backed pubsub metadata: tenant-level topics plus
+// per-bucket notification maps, stored as system objects in the log pool
+class RGWPubSub
+{
+  friend class Bucket;
+
+  rgw::sal::RadosStore* store;
+  const std::string tenant;
+  RGWSI_SysObj* svc_sysobj;
+
+  // NOTE(review): meta_obj is never referenced in this header; get_meta_obj()
+  // builds a fresh object instead - confirm whether this member is dead
+  rgw_raw_obj meta_obj;
+
+  // oid of the tenant-level topics object
+  std::string meta_oid() const {
+    return pubsub_oid_prefix + tenant;
+  }
+
+  // oid of a bucket's notification object (marker disambiguates recreated buckets)
+  std::string bucket_meta_oid(const rgw_bucket& bucket) const {
+    return pubsub_oid_prefix + tenant + ".bucket." + bucket.name + "/" + bucket.marker;
+  }
+
+  // oid of a subscription's meta object
+  std::string sub_meta_oid(const std::string& name) const {
+    return pubsub_oid_prefix + tenant + ".sub." + name;
+  }
+
+  // read and decode a system object into 'data'
+  template <class T>
+  int read(const rgw_raw_obj& obj, T* data, RGWObjVersionTracker* objv_tracker);
+
+  // encode and write 'info' into a system object
+  template <class T>
+  int write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info,
+           RGWObjVersionTracker* obj_tracker, optional_yield y);
+
+  int remove(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, RGWObjVersionTracker* objv_tracker,
+            optional_yield y);
+
+  int read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker* objv_tracker);
+  int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics,
+                       RGWObjVersionTracker* objv_tracker, optional_yield y);
+
+public:
+  RGWPubSub(rgw::sal::RadosStore* _store, const std::string& tenant);
+
+  // per-bucket view over the notification metadata
+  class Bucket {
+    friend class RGWPubSub;
+    RGWPubSub *ps;
+    rgw_bucket bucket;
+    rgw_raw_obj bucket_meta_obj;
+
+    // read the list of topics associated with a bucket and populate into result
+    // use version tracker to enforce atomicity between read/write
+    // return 0 on success or if no topic was associated with the bucket, error code otherwise
+    int read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker* objv_tracker);
+    // set the list of topics associated with a bucket
+    // use version tracker to enforce atomicity between read/write
+    // return 0 on success, error code otherwise
+    int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics,
+                    RGWObjVersionTracker* objv_tracker, optional_yield y);
+  public:
+    Bucket(RGWPubSub *_ps, const rgw_bucket& _bucket) : ps(_ps), bucket(_bucket) {
+      ps->get_bucket_meta_obj(bucket, &bucket_meta_obj);
+    }
+
+    // read the list of topics associated with a bucket and populate into result
+    // return 0 on success or if no topic was associated with the bucket, error code otherwise
+    int get_topics(rgw_pubsub_bucket_topics *result);
+    // adds a topic + filter (event list, and possibly name metadata or tags filters) to a bucket
+    // assigning a notification name is optional (needed for S3 compatible notifications)
+    // if the topic already exist on the bucket, the filter event list may be updated
+    // for S3 compliant notifications the version with: s3_filter and notif_name should be used
+    // return -ENOENT if the topic does not exist
+    // return 0 on success, error code otherwise
+    int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y);
+    int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y);
+    // remove a topic and filter from bucket
+    // if the topic does not exist on the bucket it is a no-op (considered success)
+    // return -ENOENT if the topic does not exist
+    // return 0 on success, error code otherwise
+    int remove_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, optional_yield y);
+    // remove all notifications (and autogenerated topics) associated with the bucket
+    // return 0 on success or if no topic was associated with the bucket, error code otherwise
+    int remove_notifications(const DoutPrefixProvider *dpp, optional_yield y);
+  };
+
+  using BucketRef = std::shared_ptr<Bucket>;
+
+  BucketRef get_bucket(const rgw_bucket& bucket) {
+    return std::make_shared<Bucket>(this, bucket);
+  }
+
+  void get_meta_obj(rgw_raw_obj *obj) const;
+  void get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const;
+
+  void get_sub_meta_obj(const std::string& name, rgw_raw_obj *obj) const;
+
+  // get all topics (per tenant, if used) and populate them into "result"
+  // return 0 on success or if no topics exist, error code otherwise
+  int get_topics(rgw_pubsub_topics *result);
+  // get a topic with its subscriptions by its name and populate it into "result"
+  // return -ENOENT if the topic does not exist
+  // return 0 on success, error code otherwise
+  int get_topic(const std::string& name, rgw_pubsub_topic_subs *result);
+  // get a topic by its name and populate it into "result"
+  // return -ENOENT if the topic does not exist
+  // return 0 on success, error code otherwise
+  int get_topic(const std::string& name, rgw_pubsub_topic *result);
+  // create a topic with a name only
+  // if the topic already exists it is a no-op (considered success)
+  // return 0 on success, error code otherwise
+  int create_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y);
+  // create a topic with push destination information and ARN
+  // if the topic already exists the destination and ARN values may be updated (considered success)
+  // return 0 on success, error code otherwise
+  int create_topic(const DoutPrefixProvider *dpp, const std::string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y);
+  // remove a topic according to its name
+  // if the topic does not exist it is a no-op (considered success)
+  // return 0 on success, error code otherwise
+  int remove_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y);
+};
+
+
+// read a system object and decode it into 'result'.
+// NOTE: uses null_yield, i.e. the read is always synchronous regardless of
+// the caller's coroutine context.
+// returns the rados error on read failure, -EIO on decode failure.
+template <class T>
+int RGWPubSub::read(const rgw_raw_obj& obj, T* result, RGWObjVersionTracker* objv_tracker)
+{
+  bufferlist bl;
+  int ret = rgw_get_system_obj(svc_sysobj,
+                               obj.pool, obj.oid,
+                               bl,
+                               objv_tracker,
+                               nullptr, null_yield, nullptr, nullptr);
+  if (ret < 0) {
+    return ret;
+  }
+
+  auto iter = bl.cbegin();
+  try {
+    decode(*result, iter);
+  } catch (buffer::error& err) {
+    // object exists but its payload is corrupt/unparseable
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// encode 'info' and write it to the given system object (non-exclusive,
+// i.e. overwrites an existing object); objv_tracker guards concurrent writers
+template <class T>
+int RGWPubSub::write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info,
+                        RGWObjVersionTracker* objv_tracker, optional_yield y)
+{
+  bufferlist bl;
+  encode(info, bl);
+
+  return rgw_put_system_obj(dpp, svc_sysobj, obj.pool, obj.oid,
+                            bl, false, objv_tracker, real_time(), y);
+}
diff --git a/src/rgw/driver/rados/rgw_pubsub_push.cc b/src/rgw/driver/rados/rgw_pubsub_push.cc
new file mode 100644 (file)
index 0000000..2f734c2
--- /dev/null
@@ -0,0 +1,463 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_pubsub_push.h"
+#include <string>
+#include <sstream>
+#include <algorithm>
+#include "include/buffer_fwd.h"
+#include "common/Formatter.h"
+#include "common/iso_8601.h"
+#include "common/async/completion.h"
+#include "rgw_common.h"
+#include "rgw_data_sync.h"
+#include "rgw_pubsub.h"
+#include "acconfig.h"
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+#include "rgw_amqp.h"
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+#include "rgw_kafka.h"
+#endif
+#include <boost/asio/yield.hpp>
+#include <boost/algorithm/string.hpp>
+#include <functional>
+#include "rgw_perf_counters.h"
+
+using namespace rgw;
+
+// serialize an event to JSON of the form:
+//   {"<json_type_plural>": [ { ...event... } ]}
+template<typename EventType>
+std::string json_format_pubsub_event(const EventType& event) {
+  JSONFormatter formatter(false);
+  {
+    Formatter::ObjectSection outer(formatter, EventType::json_type_plural);
+    {
+      Formatter::ArraySection inner(formatter, EventType::json_type_plural);
+      encode_json("", event, &formatter);
+    }
+  }
+  std::stringstream out;
+  formatter.flush(out);
+  return out.str();
+}
+  
+// parse boolean argument `name` from `args`; returns `default_value`
+// when the argument is absent. throws configuration_error when the
+// value exists but is not a valid boolean
+bool get_bool(const RGWHTTPArgs& args, const std::string& name, bool default_value) {
+  bool result;
+  bool found;
+  if (args.get_bool(name.c_str(), &result, &found) == -EINVAL) {
+    throw RGWPubSubEndpoint::configuration_error("invalid boolean value for " + name);
+  }
+  return found ? result : default_value;
+}
+
+// push endpoint that delivers S3 notification events via HTTP/S POST
+class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint {
+private:
+  const std::string endpoint;
+  typedef unsigned ack_level_t;
+  ack_level_t ack_level; // TODO: not used for now
+  const bool verify_ssl;
+  const bool cloudevents;
+  // sentinel ack levels; any other accepted value is an HTTP status code
+  static const ack_level_t ACK_LEVEL_ANY = 0;
+  static const ack_level_t ACK_LEVEL_NON_ERROR = 1;
+
+public:
+  // args: verify-ssl (default true), cloudevents (default false),
+  // http-ack-level: "any" (default), "non-error", or an HTTP status in [100,600).
+  // throws configuration_error on an invalid http-ack-level
+  RGWPubSubHTTPEndpoint(const std::string& _endpoint, const RGWHTTPArgs& args) : 
+    endpoint(_endpoint), verify_ssl(get_bool(args, "verify-ssl", true)), cloudevents(get_bool(args, "cloudevents", false)) 
+  {
+    bool exists;
+    const auto& str_ack_level = args.get("http-ack-level", &exists);
+    if (!exists || str_ack_level == "any") {
+      // "any" is default
+      ack_level = ACK_LEVEL_ANY;
+    } else if (str_ack_level == "non-error") {
+      ack_level = ACK_LEVEL_NON_ERROR;
+    } else {
+      ack_level = std::atoi(str_ack_level.c_str());
+      if (ack_level < 100 || ack_level >= 600) {
+        throw configuration_error("HTTP/S: invalid http-ack-level: " + str_ack_level);
+      }
+    }
+  }
+
+  // POST the JSON-encoded event to the endpoint, waiting (yield or block)
+  // for the HTTP request to complete; returns the result of the HTTP call
+  int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
+    bufferlist read_bl;
+    RGWPostHTTPData request(cct, "POST", endpoint, &read_bl, verify_ssl);
+    const auto post_data = json_format_pubsub_event(event);
+    if (cloudevents) {
+      // following: https://github.com/cloudevents/spec/blob/v1.0.1/http-protocol-binding.md
+      // using "Binary Content Mode"
+      request.append_header("ce-specversion", "1.0");
+      request.append_header("ce-type", "com.amazonaws." + event.eventName);
+      request.append_header("ce-time", to_iso_8601(event.eventTime)); 
+      // default output of iso8601 is also RFC3339 compatible
+      request.append_header("ce-id", event.x_amz_request_id + "." + event.x_amz_id_2);
+      request.append_header("ce-source", event.eventSource + "." + event.awsRegion + "." + event.bucket_name);
+      request.append_header("ce-subject", event.object_key);
+    }
+    request.set_post_data(post_data);
+    request.set_send_length(post_data.length());
+    request.append_header("Content-Type", "application/json");
+    // track the number of pending pushes while the request is in flight
+    if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending);
+    const auto rc = RGWHTTP::process(&request, y);
+    if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending);
+    // TODO: use read_bl to process return code and handle according to ack level
+    return rc;
+  }
+
+  // human-readable description of the endpoint configuration
+  std::string to_str() const override {
+    std::string str("HTTP/S Endpoint");
+    str += "\nURI: " + endpoint;
+    str += (verify_ssl ? "\nverify SSL" : "\ndon't verify SSL");
+    return str;
+  }
+};
+
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+// push endpoint that publishes S3 notification events to an AMQP(0.9.1) broker
+class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint {
+private:
+  // how publish acknowledgement is expected from the broker
+  enum class ack_level_t {
+    None,
+    Broker,
+    Routable
+  };
+  CephContext* const cct;
+  const std::string endpoint;
+  const std::string topic;
+  const std::string exchange;
+  ack_level_t ack_level;
+  amqp::connection_ptr_t conn;
+
+  // parse "verify-ssl"; defaults to true when absent.
+  // throws configuration_error for anything other than "true"/"false"
+  bool get_verify_ssl(const RGWHTTPArgs& args) {
+    bool exists;
+    auto str_verify_ssl = args.get("verify-ssl", &exists);
+    if (!exists) {
+      // verify server certificate by default
+      return true;
+    }
+    boost::algorithm::to_lower(str_verify_ssl);
+    if (str_verify_ssl == "true") {
+      return true;
+    }
+    if (str_verify_ssl == "false") {
+      return false;
+    }
+    throw configuration_error("'verify-ssl' must be true/false, not: " + str_verify_ssl);
+  }
+
+  // parse the mandatory "amqp-exchange" arg; throws configuration_error when missing
+  std::string get_exchange(const RGWHTTPArgs& args) {
+    bool exists;
+    const auto exchange = args.get("amqp-exchange", &exists);
+    if (!exists) {
+      throw configuration_error("AMQP: missing amqp-exchange");
+    }
+    return exchange;
+  }
+
+  // parse "amqp-ack-level": "broker" (default), "none", or "routable".
+  // throws configuration_error on any other value
+  ack_level_t get_ack_level(const RGWHTTPArgs& args) {
+    bool exists;
+    const auto& str_ack_level = args.get("amqp-ack-level", &exists);
+    if (!exists || str_ack_level == "broker") {
+      // "broker" is default
+      return ack_level_t::Broker;
+    }
+    if (str_ack_level == "none") {
+      return ack_level_t::None;
+    }
+    if (str_ack_level == "routable") {
+      return ack_level_t::Routable;
+    }
+    throw configuration_error("AMQP: invalid amqp-ack-level: " + str_ack_level);
+  }
+  
+public:
+  // connect to the broker; throws configuration_error if the connection
+  // could not be created
+  RGWPubSubAMQPEndpoint(const std::string& _endpoint,
+      const std::string& _topic,
+      const RGWHTTPArgs& args,
+      CephContext* _cct) : 
+        cct(_cct),
+        endpoint(_endpoint), 
+        topic(_topic),
+        exchange(get_exchange(args)),
+        ack_level(get_ack_level(args)),
+        conn(amqp::connect(endpoint, exchange, (ack_level == ack_level_t::Broker), get_verify_ssl(args), args.get_optional("ca-location"))) {
+    if (!conn) { 
+      throw configuration_error("AMQP: failed to create connection to: " + endpoint);
+    }
+  }
+
+  // this allows waiting until "finish()" is called from a different thread
+  // waiting could be blocking the waiting thread or yielding, depending
+  // on compilation flag support and whether the optional_yield is set
+  class Waiter {
+    using Signature = void(boost::system::error_code);
+    using Completion = ceph::async::Completion<Signature>;
+    std::unique_ptr<Completion> completion = nullptr;
+    int ret;
+
+    mutable std::atomic<bool> done = false;
+    mutable std::mutex lock;
+    mutable std::condition_variable cond;
+
+    // register an asio completion handler to be posted by finish()
+    template <typename ExecutionContext, typename CompletionToken>
+    auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
+      boost::asio::async_completion<CompletionToken, Signature> init(token);
+      auto& handler = init.completion_handler;
+      {
+        std::unique_lock l{lock};
+        completion = Completion::create(ctx.get_executor(), std::move(handler));
+      }
+      return init.result.get();
+    }
+
+  public:
+    // wait for finish() and return the value passed to it; yields on the
+    // io context when optional_yield is set, otherwise blocks on the condvar
+    int wait(optional_yield y) {
+      if (done) {
+        return ret;
+      }
+      if (y) {
+       auto& io_ctx = y.get_io_context();
+        auto& yield_ctx = y.get_yield_context();
+        boost::system::error_code ec;
+        async_wait(io_ctx, yield_ctx[ec]);
+        return -ec.value();
+      }
+      std::unique_lock l(lock);
+      cond.wait(l, [this]{return (done==true);});
+      return ret;
+    }
+
+    // record the result and wake the waiter: post the registered asio
+    // completion if one exists, otherwise notify the blocked thread
+    void finish(int r) {
+      std::unique_lock l{lock};
+      ret = r;
+      done = true;
+      if (completion) {
+        // negate to turn the (negative) return code into a positive
+        // errno-style value for error_code
+        boost::system::error_code ec(-ret, boost::system::system_category());
+        Completion::post(std::move(completion), ec);
+      } else {
+        cond.notify_all();
+      }
+    }
+  };
+
+  // publish the JSON-encoded event; when acks are requested, wait
+  // (yield or block) for the broker's confirmation
+  int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
+    ceph_assert(conn);
+    if (ack_level == ack_level_t::None) {
+      return amqp::publish(conn, topic, json_format_pubsub_event(event));
+    } else {
+      // TODO: currently broker and routable are the same - this will require different flags but the same mechanism
+      // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
+      auto w = std::unique_ptr<Waiter>(new Waiter);
+      const auto rc = amqp::publish_with_confirm(conn, 
+        topic,
+        json_format_pubsub_event(event),
+        std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
+      if (rc < 0) {
+        // failed to publish, does not wait for reply
+        return rc;
+      }
+      return w->wait(y);
+    }
+  }
+
+  // human-readable description of the endpoint configuration
+  std::string to_str() const override {
+    std::string str("AMQP(0.9.1) Endpoint");
+    str += "\nURI: " + endpoint;
+    str += "\nTopic: " + topic;
+    str += "\nExchange: " + exchange;
+    return str;
+  }
+};
+
+// amqp version tokens and URI schema token used by create()/get_schema()
+static const std::string AMQP_0_9_1("0-9-1");
+static const std::string AMQP_1_0("1-0");
+static const std::string AMQP_SCHEMA("amqp");
+#endif // ifdef WITH_RADOSGW_AMQP_ENDPOINT
+
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+// push endpoint that publishes S3 notification events to a Kafka broker
+class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint {
+private:
+  // how publish acknowledgement is expected from the broker
+  enum class ack_level_t {
+    None,
+    Broker,
+  };
+  CephContext* const cct;
+  const std::string topic;
+  kafka::connection_ptr_t conn;
+  const ack_level_t ack_level;
+
+
+  // parse "kafka-ack-level": "broker" (default) or "none".
+  // throws configuration_error on any other value
+  ack_level_t get_ack_level(const RGWHTTPArgs& args) {
+    bool exists;
+    const auto& str_ack_level = args.get("kafka-ack-level", &exists);
+    if (!exists || str_ack_level == "broker") {
+      // "broker" is default
+      return ack_level_t::Broker;
+    }
+    if (str_ack_level == "none") {
+      return ack_level_t::None;
+    }
+    throw configuration_error("Kafka: invalid kafka-ack-level: " + str_ack_level);
+  }
+
+public:
+  // connect to the broker (args: use-ssl default false, verify-ssl default
+  // true, optional ca-location); throws configuration_error if the
+  // connection could not be created
+  RGWPubSubKafkaEndpoint(const std::string& _endpoint,
+      const std::string& _topic,
+      const RGWHTTPArgs& args,
+      CephContext* _cct) : 
+        cct(_cct),
+        topic(_topic),
+        conn(kafka::connect(_endpoint, get_bool(args, "use-ssl", false), get_bool(args, "verify-ssl", true), args.get_optional("ca-location"))) ,
+        ack_level(get_ack_level(args)) {
+    if (!conn) { 
+      throw configuration_error("Kafka: failed to create connection to: " + _endpoint);
+    }
+  }
+
+  // this allows waiting until "finish()" is called from a different thread
+  // waiting could be blocking the waiting thread or yielding, depending
+  // on compilation flag support and whether the optional_yield is set
+  class Waiter {
+    using Signature = void(boost::system::error_code);
+    using Completion = ceph::async::Completion<Signature>;
+    std::unique_ptr<Completion> completion = nullptr;
+    int ret;
+
+    mutable std::atomic<bool> done = false;
+    mutable std::mutex lock;
+    mutable std::condition_variable cond;
+
+    // register an asio completion handler to be posted by finish()
+    template <typename ExecutionContext, typename CompletionToken>
+    auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
+      boost::asio::async_completion<CompletionToken, Signature> init(token);
+      auto& handler = init.completion_handler;
+      {
+        std::unique_lock l{lock};
+        completion = Completion::create(ctx.get_executor(), std::move(handler));
+      }
+      return init.result.get();
+    }
+
+  public:
+    // wait for finish() and return the value passed to it; yields on the
+    // io context when optional_yield is set, otherwise blocks on the condvar
+    int wait(optional_yield y) {
+      if (done) {
+        return ret;
+      }
+      if (y) {
+        auto& io_ctx = y.get_io_context();
+        auto& yield_ctx = y.get_yield_context();
+        boost::system::error_code ec;
+        async_wait(io_ctx, yield_ctx[ec]);
+        return -ec.value();
+      }
+      std::unique_lock l(lock);
+      cond.wait(l, [this]{return (done==true);});
+      return ret;
+    }
+
+    // record the result and wake the waiter: post the registered asio
+    // completion if one exists, otherwise notify the blocked thread
+    void finish(int r) {
+      std::unique_lock l{lock};
+      ret = r;
+      done = true;
+      if (completion) {
+        // negate to turn the (negative) return code into a positive
+        // errno-style value for error_code
+        boost::system::error_code ec(-ret, boost::system::system_category());
+        Completion::post(std::move(completion), ec);
+      } else {
+        cond.notify_all();
+      }
+    }
+  };
+
+  // publish the JSON-encoded event; when broker acks are requested, wait
+  // (yield or block) for the confirmation
+  int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
+    ceph_assert(conn);
+    if (ack_level == ack_level_t::None) {
+      return kafka::publish(conn, topic, json_format_pubsub_event(event));
+    } else {
+      // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
+      auto w = std::unique_ptr<Waiter>(new Waiter);
+      const auto rc = kafka::publish_with_confirm(conn, 
+        topic,
+        json_format_pubsub_event(event),
+        std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
+      if (rc < 0) {
+        // failed to publish, does not wait for reply
+        return rc;
+      }
+      return w->wait(y);
+    }
+  }
+
+  // human-readable description of the endpoint configuration
+  std::string to_str() const override {
+    std::string str("Kafka Endpoint");
+    str += kafka::to_string(conn);
+    str += "\nTopic: " + topic;
+    return str;
+  }
+};
+
+static const std::string KAFKA_SCHEMA("kafka");
+#endif // ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+
+// schema tokens returned by get_schema() below
+static const std::string WEBHOOK_SCHEMA("webhook");
+static const std::string UNKNOWN_SCHEMA("unknown");
+static const std::string NO_SCHEMA("");
+
+const std::string& get_schema(const std::string& endpoint) {
+  if (endpoint.empty()) {
+    return NO_SCHEMA; 
+  }
+  const auto pos = endpoint.find(':');
+  if (pos == std::string::npos) {
+    return UNKNOWN_SCHEMA;
+  }
+  const auto& schema = endpoint.substr(0,pos);
+  if (schema == "http" || schema == "https") {
+    return WEBHOOK_SCHEMA;
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+  } else if (schema == "amqp" || schema == "amqps") {
+    return AMQP_SCHEMA;
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+  } else if (schema == "kafka") {
+    return KAFKA_SCHEMA;
+#endif
+  }
+  return UNKNOWN_SCHEMA;
+}
+
+// factory: instantiate the endpoint type matching the URI schema of
+// `endpoint`. throws configuration_error for unknown schemas,
+// unsupported amqp versions, or any error from an endpoint constructor
+RGWPubSubEndpoint::Ptr RGWPubSubEndpoint::create(const std::string& endpoint, 
+    const std::string& topic, 
+    const RGWHTTPArgs& args,
+    CephContext* cct) {
+  const auto& schema = get_schema(endpoint);
+  if (schema == WEBHOOK_SCHEMA) {
+    return Ptr(new RGWPubSubHTTPEndpoint(endpoint, args));
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+  } else if (schema == AMQP_SCHEMA) {
+    bool exists;
+    // amqp-version defaults to 0-9-1; 1-0 is recognized but unsupported
+    std::string version = args.get("amqp-version", &exists);
+    if (!exists) {
+      version = AMQP_0_9_1;
+    }
+    if (version == AMQP_0_9_1) {
+      return Ptr(new RGWPubSubAMQPEndpoint(endpoint, topic, args, cct));
+    } else if (version == AMQP_1_0) {
+      throw configuration_error("AMQP: v1.0 not supported");
+      return nullptr;
+    } else {
+      throw configuration_error("AMQP: unknown version: " + version);
+      return nullptr;
+    }
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+  } else if (schema == KAFKA_SCHEMA) {
+      return Ptr(new RGWPubSubKafkaEndpoint(endpoint, topic, args, cct));
+#endif
+  }
+
+  throw configuration_error("unknown schema in: " + endpoint);
+  return nullptr;
+}
+
diff --git a/src/rgw/driver/rados/rgw_pubsub_push.h b/src/rgw/driver/rados/rgw_pubsub_push.h
new file mode 100644 (file)
index 0000000..1790593
--- /dev/null
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+#pragma once
+
+#include <string>
+#include <memory>
+#include <stdexcept>
+#include "include/buffer_fwd.h"
+#include "include/common_fwd.h"
+#include "common/async/yield_context.h"
+
+// TODO the env should be used as a template parameter to differentiate the source that triggers the pushes
+class RGWDataSyncEnv;
+class RGWHTTPArgs;
+struct rgw_pubsub_s3_event;
+
+// base class that all notification endpoint types should derive from
+class RGWPubSubEndpoint {
+public:
+  RGWPubSubEndpoint() = default;
+  // endpoint should not be copied
+  RGWPubSubEndpoint(const RGWPubSubEndpoint&) = delete;
+  const RGWPubSubEndpoint& operator=(const RGWPubSubEndpoint&) = delete;
+
+  typedef std::unique_ptr<RGWPubSubEndpoint> Ptr;
+
+  // factory method for the actual notification endpoint
+  // derived class specific arguments are passed in http args format
+  // may throw a configuration_error if creation fails
+  static Ptr create(const std::string& endpoint, const std::string& topic, const RGWHTTPArgs& args, CephContext *cct=nullptr);
+  // this method is used in order to send notification (S3 compliant) and wait for completion 
+  // in async manner via a coroutine when invoked in the frontend environment
+  virtual int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) = 0;
+
+  // present the endpoint configuration as a human-readable string
+  virtual std::string to_str() const { return ""; }
+  
+  virtual ~RGWPubSubEndpoint() = default;
+  
+  // exception object for configuration error
+  struct configuration_error : public std::logic_error {
+    configuration_error(const std::string& what_arg) : 
+      std::logic_error("pubsub endpoint configuration error: " + what_arg) {}
+  };
+};
+
diff --git a/src/rgw/driver/rados/rgw_putobj_processor.cc b/src/rgw/driver/rados/rgw_putobj_processor.cc
new file mode 100644 (file)
index 0000000..8a6a157
--- /dev/null
@@ -0,0 +1,704 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_aio.h"
+#include "rgw_putobj_processor.h"
+#include "rgw_multi.h"
+#include "rgw_compression.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_zone.h"
+#include "rgw_sal_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace rgw::putobj {
+
+// accumulate the first head_chunk_size bytes into head_data and hand the
+// complete (or final partial) first chunk to process_first_chunk(), which
+// selects the processor for all remaining data. a zero-length `data`
+// signals a flush
+int HeadObjectProcessor::process(bufferlist&& data, uint64_t logical_offset)
+{
+  const bool flush = (data.length() == 0);
+
+  // capture the first chunk for special handling
+  if (data_offset < head_chunk_size || data_offset == 0) {
+    if (flush) {
+      // flush partial chunk
+      return process_first_chunk(std::move(head_data), &processor);
+    }
+
+    // move up to `remaining` bytes of this buffer into head_data
+    auto remaining = head_chunk_size - data_offset;
+    auto count = std::min<uint64_t>(data.length(), remaining);
+    data.splice(0, count, &head_data);
+    data_offset += count;
+
+    if (data_offset == head_chunk_size) {
+      // process the first complete chunk
+      ceph_assert(head_data.length() == head_chunk_size);
+      int r = process_first_chunk(std::move(head_data), &processor);
+      if (r < 0) {
+        return r;
+      }
+    }
+    if (data.length() == 0) { // avoid flushing stripe processor
+      return 0;
+    }
+  }
+  ceph_assert(processor); // process_first_chunk() must initialize
+
+  // send everything else through the processor
+  auto write_offset = data_offset;
+  data_offset += data.length();
+  return processor->process(std::move(data), write_offset);
+}
+
+
+// record the raw objects written successfully by a batch of completed aio
+// results, and return the first error code encountered (or 0 if none)
+static int process_completed(const AioResultList& completed, RawObjSet *written)
+{
+  int first_error = 0;
+  for (const auto& r : completed) {
+    if (r.result >= 0) {
+      written->insert(r.obj.get_ref().obj);
+    } else if (first_error == 0) { // keep only the first error code
+      first_error = r.result;
+    }
+  }
+  return first_error;
+}
+
+// attach a rados allocation hint to the write op; data already known to
+// be compressed is hinted incompressible so osds skip re-compression
+void RadosWriter::add_write_hint(librados::ObjectWriteOperation& op) {
+  const rgw_obj obj = head_obj->get_obj();
+  const RGWObjStateManifest *sm = obj_ctx.get_state(obj);
+  const bool compressed = sm->state.compressed;
+  uint32_t alloc_hint_flags = 0;
+  if (compressed) {
+    alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
+  }
+
+  op.set_alloc_hint2(0, 0, alloc_hint_flags);
+}
+
+// retarget the writer at the given raw (stripe) object and open its ioctx
+int RadosWriter::set_stripe_obj(const rgw_raw_obj& raw_obj)
+{
+  stripe_obj = store->svc()->rados->obj(raw_obj);
+  return stripe_obj.open(dpp);
+}
+
+// submit an async write of `bl` at `offset` on the current stripe object.
+// empty buffers are a no-op. returns the first error of any previously
+// completed aio, or 0
+int RadosWriter::process(bufferlist&& bl, uint64_t offset)
+{
+  bufferlist data = std::move(bl);
+  const uint64_t cost = data.length();
+  if (cost == 0) { // no empty writes, use aio directly for creates
+    return 0;
+  }
+  librados::ObjectWriteOperation op;
+  add_write_hint(op);
+  if (offset == 0) {
+    // first write replaces any existing object data
+    op.write_full(data);
+  } else {
+    op.write(offset, data);
+  }
+  constexpr uint64_t id = 0; // unused
+  auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
+  return process_completed(c, &written);
+}
+
+// write the full object with an exclusive create, draining all aio so
+// the result (e.g. -EEXIST) is reported synchronously to the caller
+int RadosWriter::write_exclusive(const bufferlist& data)
+{
+  const uint64_t cost = data.length();
+
+  librados::ObjectWriteOperation op;
+  op.create(true); // exclusive create
+  add_write_hint(op);
+  op.write_full(data);
+
+  constexpr uint64_t id = 0; // unused
+  auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
+  auto d = aio->drain();
+  c.splice(c.end(), d);
+  return process_completed(c, &written);
+}
+
+// wait for all outstanding aio writes; returns the first error seen (or 0)
+int RadosWriter::drain()
+{
+  return process_completed(aio->drain(), &written);
+}
+
+RadosWriter::~RadosWriter()
+{
+  // wait on any outstanding aio completions
+  process_completed(aio->drain(), &written);
+
+  bool need_to_remove_head = false;
+  std::optional<rgw_raw_obj> raw_head;
+  if (!rgw::sal::Object::empty(head_obj.get())) {
+    raw_head.emplace();
+    rgw::sal::RadosObject* obj = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get());
+    obj->get_raw_obj(&*raw_head);
+  }
+
+  /**
+   * We should delete the object in the "multipart" namespace to avoid race condition.
+   * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
+   * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
+   * written by the second upload may be deleted by the first upload.
+   * details is describled on #11749
+   *
+   * The above comment still stands, but instead of searching for a specific object in the multipart
+   * namespace, we just make sure that we remove the object that is marked as the head object after
+   * we remove all the other raw objects. Note that we use different call to remove the head object,
+   * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
+   */
+  for (const auto& obj : written) {
+    if (raw_head && obj == *raw_head) {
+      ldpp_dout(dpp, 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
+      need_to_remove_head = true;
+      continue;
+    }
+
+    int r = store->delete_raw_obj(dpp, obj);
+    if (r < 0 && r != -ENOENT) {
+      ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
+    }
+  }
+
+  if (need_to_remove_head) {
+    std::string version_id;
+    ldpp_dout(dpp, 5) << "NOTE: we are going to process the head obj (" << *raw_head << ")" << dendl;
+    int r = head_obj->delete_object(dpp, null_yield);
+    if (r < 0 && r != -ENOENT) {
+      ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << *raw_head << "), leaked" << dendl;
+    }
+  }
+}
+
+
+// advance to the next stripe
+int ManifestObjectProcessor::next(uint64_t offset, uint64_t *pstripe_size)
+{
+  // advance the manifest
+  int r = manifest_gen.create_next(offset);
+  if (r < 0) {
+    return r;
+  }
+
+  rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+  uint64_t chunk_size = 0;
+  r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size);
+  if (r < 0) {
+    return r;
+  }
+  r = writer.set_stripe_obj(stripe_obj);
+  if (r < 0) {
+    return r;
+  }
+
+  chunk = ChunkProcessor(&writer, chunk_size);
+  *pstripe_size = manifest_gen.cur_stripe_max_size();
+  return 0;
+}
+
+
+
+// stash the first chunk for the head-object write done in complete(),
+// and route all subsequent data through the stripe processor
+int AtomicObjectProcessor::process_first_chunk(bufferlist&& data,
+                                               DataProcessor **processor)
+{
+  *processor = &stripe;
+  first_chunk = std::move(data);
+  return 0;
+}
+
+// initialize the write path for an atomic (non-multipart) object: decide
+// how much data may live in the head object, choose chunk/stripe sizes,
+// begin the manifest, and point the writer at the first stripe object
+int AtomicObjectProcessor::prepare(optional_yield y)
+{
+  uint64_t max_head_chunk_size;
+  uint64_t head_max_size;
+  uint64_t chunk_size = 0;
+  uint64_t alignment;
+
+  int r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size(
+                                      dpp, head_obj->get_bucket()->get_placement_rule(),
+                                      &max_head_chunk_size, &alignment);
+  if (r < 0) {
+    return r;
+  }
+
+  // when head and tail placements resolve to different pools, keep no
+  // data in the head object and size chunks for the tail placement
+  bool same_pool = true;
+  if (head_obj->get_bucket()->get_placement_rule() != tail_placement_rule) {
+    if (!head_obj->placement_rules_match(head_obj->get_bucket()->get_placement_rule(), tail_placement_rule)) {
+      same_pool = false;
+      r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size(dpp, tail_placement_rule, &chunk_size);
+      if (r < 0) {
+        return r;
+      }
+      head_max_size = 0;
+    }
+  }
+
+  if (same_pool) {
+    // data may live in the head object unless the placement explicitly
+    // disables inline data
+    RGWZonePlacementInfo placement_info;
+    if (!store->svc()->zone->get_zone_params().get_placement(head_obj->get_bucket()->get_placement_rule().name, &placement_info) || placement_info.inline_data) {
+      head_max_size = max_head_chunk_size;
+    } else {
+      head_max_size = 0;
+    }
+    chunk_size = max_head_chunk_size;
+  }
+
+  // align the configured stripe size for this placement
+  uint64_t stripe_size;
+  const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
+
+  dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_aligned_size(
+                                       default_stripe_size, alignment, &stripe_size);
+
+  manifest.set_trivial_rule(head_max_size, stripe_size);
+
+  rgw_obj obj = head_obj->get_obj();
+
+  r = manifest_gen.create_begin(store->ctx(), &manifest,
+                                head_obj->get_bucket()->get_placement_rule(),
+                                &tail_placement_rule,
+                                obj.bucket, obj);
+  if (r < 0) {
+    return r;
+  }
+
+  rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+  r = writer.set_stripe_obj(stripe_obj);
+  if (r < 0) {
+    return r;
+  }
+
+  set_head_chunk_size(head_max_size);
+  // initialize the processors
+  chunk = ChunkProcessor(&writer, chunk_size);
+  stripe = StripeProcessor(&chunk, this, head_max_size);
+  return 0;
+}
+
+// finalize the atomic object write: drain all data writes, close the
+// manifest at the actual size, and write the head object's metadata
+// (attrs, manifest, first chunk of data) through the bucket index
+int AtomicObjectProcessor::complete(size_t accounted_size,
+                                    const std::string& etag,
+                                    ceph::real_time *mtime,
+                                    ceph::real_time set_mtime,
+                                    rgw::sal::Attrs& attrs,
+                                    ceph::real_time delete_at,
+                                    const char *if_match,
+                                    const char *if_nomatch,
+                                    const std::string *user_data,
+                                    rgw_zone_set *zones_trace,
+                                    bool *pcanceled, optional_yield y)
+{
+  int r = writer.drain();
+  if (r < 0) {
+    return r;
+  }
+  const uint64_t actual_size = get_actual_size();
+  r = manifest_gen.create_next(actual_size);
+  if (r < 0) {
+    return r;
+  }
+
+  head_obj->set_atomic();
+
+  RGWRados::Object op_target(store->getRados(),
+                 head_obj->get_bucket(),
+                 obj_ctx, head_obj.get());
+  RGWRados::Object::Write obj_op(&op_target);
+
+  /* some object types shouldn't be versioned, e.g., multipart parts */
+  op_target.set_versioning_disabled(!head_obj->get_bucket()->versioning_enabled());
+  obj_op.meta.data = &first_chunk;
+  obj_op.meta.manifest = &manifest;
+  obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
+  obj_op.meta.if_match = if_match;
+  obj_op.meta.if_nomatch = if_nomatch;
+  obj_op.meta.mtime = mtime;
+  obj_op.meta.set_mtime = set_mtime;
+  obj_op.meta.owner = owner;
+  obj_op.meta.flags = PUT_OBJ_CREATE;
+  obj_op.meta.olh_epoch = olh_epoch;
+  obj_op.meta.delete_at = delete_at;
+  obj_op.meta.user_data = user_data;
+  obj_op.meta.zones_trace = zones_trace;
+  obj_op.meta.modify_tail = true;
+
+  r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
+  if (r < 0) {
+    if (r == -ETIMEDOUT) {
+      // The head object write may eventually succeed, clear the set of objects for deletion. if it
+      // doesn't ever succeed, we'll orphan any tail objects as if we'd crashed before that write
+      writer.clear_written();
+    }
+    return r;
+  }
+  if (!obj_op.meta.canceled) {
+    // on success, clear the set of objects for deletion
+    writer.clear_written();
+  }
+  if (pcanceled) {
+    *pcanceled = obj_op.meta.canceled;
+  }
+  return 0;
+}
+
+
+// write the first chunk of the part's head object; on -EEXIST (a previous
+// upload attempt left an object behind), re-prepare with a randomized oid
+// prefix and retry once before routing data to the stripe processor
+int MultipartObjectProcessor::process_first_chunk(bufferlist&& data,
+                                                  DataProcessor **processor)
+{
+  // write the first chunk of the head object as part of an exclusive create,
+  // then drain to wait for the result in case of EEXIST
+  int r = writer.write_exclusive(data);
+  if (r == -EEXIST) {
+    // randomize the oid prefix and reprepare the head/manifest
+    std::string oid_rand = gen_rand_alphanumeric(store->ctx(), 32);
+
+    mp.init(target_obj->get_name(), upload_id, oid_rand);
+    manifest.set_prefix(target_obj->get_name() + "." + oid_rand);
+
+    r = prepare_head();
+    if (r < 0) {
+      return r;
+    }
+    // resubmit the write op on the new head object
+    r = writer.write_exclusive(data);
+  }
+  if (r < 0) {
+    return r;
+  }
+  // all further data goes through the stripe processor
+  *processor = &stripe;
+  return 0;
+}
+
+// begin the manifest for this part and point the writer and head object
+// at the part's first stripe; also initializes chunk/stripe processors
+int MultipartObjectProcessor::prepare_head()
+{
+  const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
+  uint64_t chunk_size;
+  uint64_t stripe_size;
+  uint64_t alignment;
+
+  int r = dynamic_cast<rgw::sal::RadosObject*>(target_obj.get())->get_max_chunk_size(dpp,
+                                         tail_placement_rule, &chunk_size, &alignment);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: unexpected: get_max_chunk_size(): placement_rule=" << tail_placement_rule.to_str() << " obj=" << target_obj << " returned r=" << r << dendl;
+    return r;
+  }
+  dynamic_cast<rgw::sal::RadosObject*>(target_obj.get())->get_max_aligned_size(
+                                       default_stripe_size, alignment, &stripe_size);
+
+  manifest.set_multipart_part_rule(stripe_size, part_num);
+
+  r = manifest_gen.create_begin(store->ctx(), &manifest,
+                               head_obj->get_bucket()->get_placement_rule(),
+                               &tail_placement_rule,
+                               target_obj->get_bucket()->get_key(),
+                               target_obj->get_obj());
+  if (r < 0) {
+    return r;
+  }
+
+  // the part's head object is the manifest's first raw object; hash
+  // placement follows the target object's name
+  rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+  dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->raw_obj_to_obj(stripe_obj);
+  head_obj->set_hash_source(target_obj->get_name());
+
+  r = writer.set_stripe_obj(stripe_obj);
+  if (r < 0) {
+    return r;
+  }
+  stripe_size = manifest_gen.cur_stripe_max_size();
+  set_head_chunk_size(stripe_size);
+
+  chunk = ChunkProcessor(&writer, chunk_size);
+  stripe = StripeProcessor(&chunk, this, stripe_size);
+  return 0;
+}
+
+// derive the manifest prefix from the target object name and upload id,
+// then prepare this part's head object for writing
+int MultipartObjectProcessor::prepare(optional_yield y)
+{
+  const std::string prefix = target_obj->get_name() + "." + upload_id;
+  manifest.set_prefix(prefix);
+
+  return prepare_head();
+}
+
+// Complete one multipart part: flush pending writes, finalize the part's
+// manifest, write the part head's metadata, and register the part (etag,
+// sizes, manifest, compression info) in the multipart meta object's omap so
+// CompleteMultipartUpload can later assemble the full object.
+int MultipartObjectProcessor::complete(size_t accounted_size,
+                                       const std::string& etag,
+                                       ceph::real_time *mtime,
+                                       ceph::real_time set_mtime,
+                                       std::map<std::string, bufferlist>& attrs,
+                                       ceph::real_time delete_at,
+                                       const char *if_match,
+                                       const char *if_nomatch,
+                                       const std::string *user_data,
+                                       rgw_zone_set *zones_trace,
+                                       bool *pcanceled, optional_yield y)
+{
+  // wait for all in-flight stripe writes before touching metadata
+  int r = writer.drain();
+  if (r < 0) {
+    return r;
+  }
+  const uint64_t actual_size = get_actual_size();
+  r = manifest_gen.create_next(actual_size);
+  if (r < 0) {
+    return r;
+  }
+
+  RGWRados::Object op_target(store->getRados(),
+                 head_obj->get_bucket(),
+                 obj_ctx, head_obj.get());
+  RGWRados::Object::Write obj_op(&op_target);
+
+  // parts are not versioned objects; only the assembled object is
+  op_target.set_versioning_disabled(true);
+  op_target.set_meta_placement_rule(&tail_placement_rule);
+  obj_op.meta.set_mtime = set_mtime;
+  obj_op.meta.mtime = mtime;
+  obj_op.meta.owner = owner;
+  obj_op.meta.delete_at = delete_at;
+  obj_op.meta.zones_trace = zones_trace;
+  obj_op.meta.modify_tail = true;
+
+  r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
+  if (r < 0)
+    return r;
+
+  bufferlist bl;
+  RGWUploadPartInfo info;
+  string p = "part.";
+  // v2 upload ids get zero-padded part keys so omap listing is numeric order
+  bool sorted_omap = is_v2_upload_id(upload_id);
+
+  if (sorted_omap) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%08d", part_num);
+    p.append(buf);
+  } else {
+    p.append(part_num_str);
+  }
+  info.num = part_num;
+  info.etag = etag;
+  info.size = actual_size;
+  info.accounted_size = accounted_size;
+  info.modified = real_clock::now();
+  info.manifest = manifest;
+
+  bool compressed;
+  r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info);
+  if (r < 0) {
+    ldpp_dout(dpp, 1) << "cannot get compression info" << dendl;
+    return r;
+  }
+
+  encode(info, bl);
+
+  // record the part under the upload's meta object (extra-data namespace)
+  std::unique_ptr<rgw::sal::Object> meta_obj =
+    head_obj->get_bucket()->get_object(rgw_obj_key(mp.get_meta(), std::string(), RGW_OBJ_NS_MULTIPART));
+  meta_obj->set_in_extra_data(true);
+
+  // NOTE(review): uses null_yield rather than the caller's y — confirm intentional
+  r = meta_obj->omap_set_val_by_key(dpp, p, bl, true, null_yield);
+  if (r < 0) {
+    // missing meta object means the upload was aborted/completed concurrently
+    return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r;
+  }
+
+  if (!obj_op.meta.canceled) {
+    // on success, clear the set of objects for deletion
+    writer.clear_written();
+  }
+  if (pcanceled) {
+    *pcanceled = obj_op.meta.canceled;
+  }
+  return 0;
+}
+
+// First-chunk hook for appends: write the initial data with an exclusive
+// create, then hand the remaining data stream to the stripe processor.
+int AppendObjectProcessor::process_first_chunk(bufferlist &&data, rgw::sal::DataProcessor **processor)
+{
+  int r = writer.write_exclusive(data);
+  if (r < 0) {
+    return r;
+  }
+  *processor = &stripe;
+  return 0;
+}
+
+// Prepare an append write. For a new object, start part 1 with a fresh random
+// manifest prefix; for an existing one, validate it is appendable and that the
+// append position equals the current size, then continue from the recorded
+// part number and reuse the existing manifest prefix.
+int AppendObjectProcessor::prepare(optional_yield y)
+{
+  RGWObjState *astate;
+  int r = head_obj->get_obj_state(dpp, &astate, y);
+  if (r < 0) {
+    return r;
+  }
+  cur_size = astate->size;
+  *cur_accounted_size = astate->accounted_size;
+  if (!astate->exists) {
+    // brand-new object: an append must start at offset 0
+    if (position != 0) {
+      ldpp_dout(dpp, 5) << "ERROR: Append position should be zero" << dendl;
+      return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
+    } else {
+      cur_part_num = 1;
+      //set the prefix
+      char buf[33];
+      gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
+      string oid_prefix = head_obj->get_name();
+      oid_prefix.append(".");
+      oid_prefix.append(buf);
+      oid_prefix.append("_");
+      manifest.set_prefix(oid_prefix);
+    }
+  } else {
+    // check whether the object appendable
+    map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
+    if (iter == astate->attrset.end()) {
+      // objects created by normal PUT lack the part-num attr and can't be appended
+      ldpp_dout(dpp, 5) << "ERROR: The object is not appendable" << dendl;
+      return -ERR_OBJECT_NOT_APPENDABLE;
+    }
+    if (position != *cur_accounted_size) {
+      ldpp_dout(dpp, 5) << "ERROR: Append position should be equal to the obj size" << dendl;
+      return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
+    }
+    try {
+      using ceph::decode;
+      decode(cur_part_num, iter->second);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 5) << "ERROR: failed to decode part num" << dendl;
+      return -EIO;
+    }
+    cur_part_num++;
+    //get the current obj etag
+    iter = astate->attrset.find(RGW_ATTR_ETAG);
+    if (iter != astate->attrset.end()) {
+      // keep only the md5 portion, dropping any existing "-N" part suffix
+      string s = rgw_string_unquote(iter->second.c_str());
+      size_t pos = s.find("-");
+      cur_etag = s.substr(0, pos);
+    }
+
+    // appended data inherits the object's storage class (default STANDARD)
+    iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
+    if (iter != astate->attrset.end()) {
+      tail_placement_rule.storage_class = iter->second.to_str();
+    } else {
+      tail_placement_rule.storage_class = RGW_STORAGE_CLASS_STANDARD;
+    }
+    cur_manifest = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_manifest();
+    manifest.set_prefix(cur_manifest->get_prefix());
+    // keep the existing tail objects alive when the head is rewritten
+    astate->keep_tail = true;
+  }
+  manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, cur_part_num);
+
+  rgw_obj obj = head_obj->get_obj();
+
+  r = manifest_gen.create_begin(store->ctx(), &manifest, head_obj->get_bucket()->get_placement_rule(), &tail_placement_rule, obj.bucket, obj);
+  if (r < 0) {
+    return r;
+  }
+  rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+  uint64_t chunk_size = 0;
+  r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size);
+  if (r < 0) {
+    return r;
+  }
+  r = writer.set_stripe_obj(std::move(stripe_obj));
+  if (r < 0) {
+    return r;
+  }
+
+  uint64_t stripe_size = manifest_gen.cur_stripe_max_size();
+
+  uint64_t max_head_size = std::min(chunk_size, stripe_size);
+  set_head_chunk_size(max_head_size);
+
+  // initialize the processors
+  chunk = ChunkProcessor(&writer, chunk_size);
+  stripe = StripeProcessor(&chunk, this, stripe_size);
+
+  return 0;
+}
+
+// Complete an append: drain the writer, splice the new part's manifest onto
+// any pre-existing manifest, recompute a multipart-style etag
+// ("md5-of-part-md5s" + "-<part count>"), and write the combined head
+// metadata with the cumulative sizes.
+int AppendObjectProcessor::complete(size_t accounted_size, const string &etag, ceph::real_time *mtime,
+                                    ceph::real_time set_mtime, rgw::sal::Attrs& attrs,
+                                    ceph::real_time delete_at, const char *if_match, const char *if_nomatch,
+                                    const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled,
+                                    optional_yield y)
+{
+  int r = writer.drain();
+  if (r < 0)
+    return r;
+  const uint64_t actual_size = get_actual_size();
+  r = manifest_gen.create_next(actual_size);
+  if (r < 0) {
+    return r;
+  }
+  head_obj->set_atomic();
+  RGWRados::Object op_target(store->getRados(),
+                 head_obj->get_bucket(),
+                 obj_ctx, head_obj.get());
+  RGWRados::Object::Write obj_op(&op_target);
+  //For Append obj, disable versioning
+  op_target.set_versioning_disabled(true);
+  if (cur_manifest) {
+    // existing object: merge the newly written part into the old manifest
+    cur_manifest->append(dpp, manifest, store->svc()->zone->get_zonegroup(), store->svc()->zone->get_zone_params());
+    obj_op.meta.manifest = cur_manifest;
+  } else {
+    obj_op.meta.manifest = &manifest;
+  }
+  obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
+  obj_op.meta.mtime = mtime;
+  obj_op.meta.set_mtime = set_mtime;
+  obj_op.meta.owner = owner;
+  obj_op.meta.flags = PUT_OBJ_CREATE;
+  obj_op.meta.delete_at = delete_at;
+  obj_op.meta.user_data = user_data;
+  obj_op.meta.zones_trace = zones_trace;
+  obj_op.meta.modify_tail = true;
+  obj_op.meta.appendable = true;
+  //Add the append part number
+  bufferlist cur_part_num_bl;
+  using ceph::encode;
+  encode(cur_part_num, cur_part_num_bl);
+  attrs[RGW_ATTR_APPEND_PART_NUM] = cur_part_num_bl;
+  //calculate the etag
+  if (!cur_etag.empty()) {
+    MD5 hash;
+    // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+    hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+    char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+    char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+    // extra 16 bytes leave room for the "-<part num>" suffix and NUL
+    char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+    // hash the binary digests of the previous etag and this part's etag
+    hex_to_buf(cur_etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+    hash.Update((const unsigned char *)petag, sizeof(petag));
+    hex_to_buf(etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+    hash.Update((const unsigned char *)petag, sizeof(petag));
+    hash.Final((unsigned char *)final_etag);
+    buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
+    snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],  sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+             "-%lld", (long long)cur_part_num);
+    bufferlist etag_bl;
+    // include the trailing NUL, matching how etags are stored elsewhere
+    etag_bl.append(final_etag_str, strlen(final_etag_str) + 1);
+    attrs[RGW_ATTR_ETAG] = etag_bl;
+  }
+  // sizes are cumulative: previously stored size plus this append
+  r = obj_op.write_meta(dpp, actual_size + cur_size,
+                       accounted_size + *cur_accounted_size,
+                       attrs, y);
+  if (r < 0) {
+    return r;
+  }
+  if (!obj_op.meta.canceled) {
+    // on success, clear the set of objects for deletion
+    writer.clear_written();
+  }
+  if (pcanceled) {
+    *pcanceled = obj_op.meta.canceled;
+  }
+  // report the new total accounted size back to the caller
+  *cur_accounted_size += accounted_size;
+
+  return 0;
+}
+
+} // namespace rgw::putobj
diff --git a/src/rgw/driver/rados/rgw_putobj_processor.h b/src/rgw/driver/rados/rgw_putobj_processor.h
new file mode 100644 (file)
index 0000000..1beb9a7
--- /dev/null
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <optional>
+
+#include "rgw_putobj.h"
+#include "services/svc_rados.h"
+#include "services/svc_tier_rados.h"
+#include "rgw_sal.h"
+#include "rgw_obj_manifest.h"
+
+namespace rgw {
+
+namespace sal {
+  class RadosStore;
+}
+
+class Aio;
+
+namespace putobj {
+
+// an object processor with special handling for the first chunk of the head.
+// the virtual process_first_chunk() function returns a processor to handle the
+// rest of the object
+class HeadObjectProcessor : public rgw::sal::ObjectProcessor {
+  // bytes of the object that live inline in the head rados object
+  uint64_t head_chunk_size;
+  // buffer to capture the first chunk of the head object
+  bufferlist head_data;
+  // initialized after process_first_chunk() to process everything else
+  rgw::sal::DataProcessor *processor = nullptr;
+  uint64_t data_offset = 0; // maximum offset of data written (ie compressed)
+ protected:
+  uint64_t get_actual_size() const { return data_offset; }
+
+  // process the first chunk of data and return a processor for the rest
+  virtual int process_first_chunk(bufferlist&& data,
+                                  rgw::sal::DataProcessor **processor) = 0;
+ public:
+  HeadObjectProcessor(uint64_t head_chunk_size)
+    : head_chunk_size(head_chunk_size)
+  {}
+
+  void set_head_chunk_size(uint64_t size) { head_chunk_size = size; }
+
+  // cache first chunk for process_first_chunk(), then forward everything else
+  // to the returned processor
+  int process(bufferlist&& data, uint64_t logical_offset) final override;
+};
+
+using RawObjSet = std::set<rgw_raw_obj>;
+
+// a data sink that writes to rados objects and deletes them on cancelation
+// a data sink that writes to rados objects and deletes them on cancelation
+class RadosWriter : public rgw::sal::DataProcessor {
+  Aio *const aio;                     // async io throttle/completion handler
+  rgw::sal::RadosStore *const store;
+  RGWObjectCtx& obj_ctx;
+  std::unique_ptr<rgw::sal::Object> head_obj;
+  RGWSI_RADOS::Obj stripe_obj; // current stripe object
+  RawObjSet written; // set of written objects for deletion
+  const DoutPrefixProvider *dpp;
+  optional_yield y;
+
+ public:
+  RadosWriter(Aio *aio, rgw::sal::RadosStore *store,
+              RGWObjectCtx& obj_ctx, std::unique_ptr<rgw::sal::Object> _head_obj,
+              const DoutPrefixProvider *dpp, optional_yield y)
+    : aio(aio), store(store),
+      obj_ctx(obj_ctx), head_obj(std::move(_head_obj)), dpp(dpp), y(y)
+  {}
+  // NOTE(review): move ctor is not noexcept — containers would copy on
+  // reallocation if one existed; confirm whether that matters to callers
+  RadosWriter(RadosWriter&& r)
+    : aio(r.aio), store(r.store),
+      obj_ctx(r.obj_ctx), head_obj(std::move(r.head_obj)), dpp(r.dpp), y(r.y)
+  {}
+
+  // destructor deletes any objects remaining in 'written' (see .cc)
+  ~RadosWriter();
+
+  // add alloc hint to osd
+  void add_write_hint(librados::ObjectWriteOperation& op);
+
+  // change the current stripe object
+  int set_stripe_obj(const rgw_raw_obj& obj);
+
+  // write the data at the given offset of the current stripe object
+  int process(bufferlist&& data, uint64_t stripe_offset) override;
+
+  // write the data as an exclusive create and wait for it to complete
+  int write_exclusive(const bufferlist& data);
+
+  // wait for all in-flight aio writes and return the first error, if any
+  int drain();
+
+  // when the operation completes successfully, clear the set of written objects
+  // so they aren't deleted on destruction
+  void clear_written() { written.clear(); }
+
+};
+
+
+// a rados object processor that stripes according to RGWObjManifest
+// a rados object processor that stripes according to RGWObjManifest
+class ManifestObjectProcessor : public HeadObjectProcessor,
+                                public StripeGenerator {
+ protected:
+  rgw::sal::RadosStore* const store;
+  rgw_placement_rule tail_placement_rule;
+  rgw_user owner;
+  RGWObjectCtx& obj_ctx;
+  std::unique_ptr<rgw::sal::Object> head_obj;
+
+  RadosWriter writer;
+  RGWObjManifest manifest;
+  RGWObjManifest::generator manifest_gen;
+  ChunkProcessor chunk;
+  StripeProcessor stripe;
+  const DoutPrefixProvider *dpp;
+
+  // implements StripeGenerator
+  int next(uint64_t offset, uint64_t *stripe_size) override;
+
+ public:
+  // note: 'writer' is initialized after 'head_obj' (declaration order), so
+  // cloning head_obj here is safe even though head_obj was just moved into
+  ManifestObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
+                          const rgw_placement_rule *ptail_placement_rule,
+                          const rgw_user& owner, RGWObjectCtx& _obj_ctx,
+                          std::unique_ptr<rgw::sal::Object> _head_obj,
+                          const DoutPrefixProvider* dpp, optional_yield y)
+    : HeadObjectProcessor(0),
+      store(store),
+      owner(owner),
+      obj_ctx(_obj_ctx), head_obj(std::move(_head_obj)),
+      writer(aio, store, obj_ctx, head_obj->clone(), dpp, y),
+      chunk(&writer, 0), stripe(&chunk, this, 0), dpp(dpp) {
+        if (ptail_placement_rule) {
+          tail_placement_rule = *ptail_placement_rule;
+        }
+      }
+
+  void set_owner(const rgw_user& _owner) {
+    owner = _owner;
+  }
+
+  void set_tail_placement(const rgw_placement_rule& tpr) {
+    tail_placement_rule = tpr;
+  }
+  // NOTE(review): a const rvalue reference cannot be moved from, so this
+  // overload copies just like the lvalue one — possibly unintentional
+  void set_tail_placement(const rgw_placement_rule&& tpr) {
+    tail_placement_rule = tpr;
+  }
+
+};
+
+
+// a processor that completes with an atomic write to the head object as part of
+// a bucket index transaction
+// a processor that completes with an atomic write to the head object as part
+// of a bucket index transaction
+class AtomicObjectProcessor : public ManifestObjectProcessor {
+  const std::optional<uint64_t> olh_epoch;   // versioning epoch, if any
+  const std::string unique_tag;              // operation tag (req id)
+  bufferlist first_chunk; // written with the head in complete()
+
+  int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+ public:
+  AtomicObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
+                        const rgw_placement_rule *ptail_placement_rule,
+                        const rgw_user& owner,
+                        RGWObjectCtx& obj_ctx,
+                       std::unique_ptr<rgw::sal::Object> _head_obj,
+                        std::optional<uint64_t> olh_epoch,
+                        const std::string& unique_tag,
+                        const DoutPrefixProvider *dpp, optional_yield y)
+    : ManifestObjectProcessor(aio, store, ptail_placement_rule,
+                              owner, obj_ctx, std::move(_head_obj), dpp, y),
+      olh_epoch(olh_epoch), unique_tag(unique_tag)
+  {}
+
+  // prepare a trivial manifest
+  int prepare(optional_yield y) override;
+  // write the head object atomically in a bucket index transaction
+  int complete(size_t accounted_size, const std::string& etag,
+               ceph::real_time *mtime, ceph::real_time set_mtime,
+               std::map<std::string, bufferlist>& attrs,
+               ceph::real_time delete_at,
+               const char *if_match, const char *if_nomatch,
+               const std::string *user_data,
+               rgw_zone_set *zones_trace, bool *canceled,
+               optional_yield y) override;
+
+};
+
+
+// a processor for multipart parts, which don't require atomic completion. the
+// part's head is written with an exclusive create to detect racing uploads of
+// the same part/upload id, which are restarted with a random oid prefix
+// a processor for multipart parts, which don't require atomic completion. the
+// part's head is written with an exclusive create to detect racing uploads of
+// the same part/upload id, which are restarted with a random oid prefix
+class MultipartObjectProcessor : public ManifestObjectProcessor {
+  std::unique_ptr<rgw::sal::Object> target_obj; // target multipart object
+  const std::string upload_id;
+  // NOTE(review): ctor takes uint64_t but stores const int — narrowing;
+  // confirm part numbers can never exceed INT_MAX
+  const int part_num;
+  const std::string part_num_str;
+  RGWMPObj mp;   // multipart meta-object naming helper
+
+  // write the first chunk and wait on aio->drain() for its completion.
+  // on EEXIST, retry with random prefix
+  int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+  // prepare the head stripe and manifest
+  int prepare_head();
+ public:
+  MultipartObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
+                           const rgw_placement_rule *ptail_placement_rule,
+                           const rgw_user& owner, RGWObjectCtx& obj_ctx,
+                           std::unique_ptr<rgw::sal::Object> _head_obj,
+                           const std::string& upload_id, uint64_t part_num,
+                           const std::string& part_num_str,
+                           const DoutPrefixProvider *dpp, optional_yield y)
+    : ManifestObjectProcessor(aio, store, ptail_placement_rule,
+                              owner, obj_ctx, std::move(_head_obj), dpp, y),
+      target_obj(head_obj->clone()), upload_id(upload_id),
+      part_num(part_num), part_num_str(part_num_str),
+      mp(head_obj->get_name(), upload_id)
+  {}
+
+  // prepare a multipart manifest
+  int prepare(optional_yield y) override;
+  // write the head object attributes in a bucket index transaction, then
+  // register the completed part with the multipart meta object
+  int complete(size_t accounted_size, const std::string& etag,
+               ceph::real_time *mtime, ceph::real_time set_mtime,
+               std::map<std::string, bufferlist>& attrs,
+               ceph::real_time delete_at,
+               const char *if_match, const char *if_nomatch,
+               const std::string *user_data,
+               rgw_zone_set *zones_trace, bool *canceled,
+               optional_yield y) override;
+
+};
+
+  // a processor for S3 append writes: continues an existing appendable
+  // object's manifest, adding one new "part" per append operation
+  class AppendObjectProcessor : public ManifestObjectProcessor {
+    // part number of this append; set in prepare() (1 for a new object,
+    // previous part + 1 otherwise) — uninitialized until prepare() runs
+    uint64_t cur_part_num;
+    uint64_t position;            // requested append offset
+    uint64_t cur_size;            // existing object size before the append
+    uint64_t *cur_accounted_size; // in/out: caller's accounted-size total
+    std::string cur_etag;         // md5 portion of the existing etag
+    const std::string unique_tag;
+
+    // manifest of the existing object, or nullptr if it doesn't exist yet
+    RGWObjManifest *cur_manifest;
+
+    int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+
+  public:
+    AppendObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
+                          const rgw_placement_rule *ptail_placement_rule,
+                          const rgw_user& owner, RGWObjectCtx& obj_ctx,
+                         std::unique_ptr<rgw::sal::Object> _head_obj,
+                          const std::string& unique_tag, uint64_t position,
+                          uint64_t *cur_accounted_size,
+                          const DoutPrefixProvider *dpp, optional_yield y)
+            : ManifestObjectProcessor(aio, store, ptail_placement_rule,
+                                      owner, obj_ctx, std::move(_head_obj), dpp, y),
+              position(position), cur_size(0), cur_accounted_size(cur_accounted_size),
+              unique_tag(unique_tag), cur_manifest(nullptr)
+    {}
+    // validate appendability and set up manifest/stripe state
+    int prepare(optional_yield y) override;
+    // merge manifests, recompute the etag, and write the head metadata
+    int complete(size_t accounted_size, const std::string& etag,
+                 ceph::real_time *mtime, ceph::real_time set_mtime,
+                 std::map<std::string, bufferlist>& attrs, ceph::real_time delete_at,
+                 const char *if_match, const char *if_nomatch, const std::string *user_data,
+                 rgw_zone_set *zones_trace, bool *canceled,
+                 optional_yield y) override;
+  };
+
+} // namespace putobj
+} // namespace rgw
+
diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc
new file mode 100644 (file)
index 0000000..6779e51
--- /dev/null
@@ -0,0 +1,9715 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "include/compat.h"
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sstream>
+
+#include <boost/algorithm/string.hpp>
+#include <string_view>
+
+#include <boost/container/flat_set.hpp>
+#include <boost/format.hpp>
+#include <boost/optional.hpp>
+#include <boost/utility/in_place_factory.hpp>
+
+#include "common/ceph_json.h"
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/Throttle.h"
+#include "common/BackTrace.h"
+
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+#include "rgw_cache.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
+#include "rgw_aio_throttle.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_datalog.h"
+#include "rgw_putobj_processor.h"
+
+#include "cls/rgw/cls_rgw_ops.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw/cls_rgw_const.h"
+#include "cls/refcount/cls_refcount_client.h"
+#include "cls/version/cls_version_client.h"
+#include "osd/osd_types.h"
+
+#include "rgw_tools.h"
+#include "rgw_coroutine.h"
+#include "rgw_compression.h"
+#include "rgw_etag_verifier.h"
+#include "rgw_worker.h"
+#include "rgw_notify.h"
+#include "rgw_http_errors.h"
+
+#undef fork // fails to compile RGWPeriod::fork() below
+
+#include "common/Clock.h"
+
+#include <string>
+#include <iostream>
+#include <vector>
+#include <atomic>
+#include <list>
+#include <map>
+#include "include/random.h"
+
+#include "rgw_gc.h"
+#include "rgw_lc.h"
+
+#include "rgw_object_expirer_core.h"
+#include "rgw_sync.h"
+#include "rgw_sync_counters.h"
+#include "rgw_sync_trace.h"
+#include "rgw_trim_datalog.h"
+#include "rgw_trim_mdlog.h"
+#include "rgw_data_sync.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_reshard.h"
+#include "rgw_cr_rados.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_quota.h"
+#include "services/svc_sync_modules.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_sys_obj_cache.h"
+#include "services/svc_bucket.h"
+#include "services/svc_mdlog.h"
+
+#include "compressor/Compressor.h"
+
+#include "rgw_d3n_datacache.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/rgw_rados.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace librados;
+
+#define ldout_bitx(_bitx, _dpp, _level) if(_bitx) { ldpp_dout(_dpp, 0) << "BITX: "
+#define ldout_bitx_c(_bitx, _ctx, _level) if(_bitx) { ldout(_ctx, 0) << "BITX: "
+#define dendl_bitx                      dendl ; }
+
+// file-scope defaults for rados namespaces/pool suffixes used below
+static string shadow_ns = "shadow";
+static string default_bucket_index_pool_suffix = "rgw.buckets.index";
+static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
+
+static RGWObjCategory main_category = RGWObjCategory::Main;
+#define RGW_USAGE_OBJ_PREFIX "usage."
+
+// Resolve this selector to a raw rados object: return the stored raw_obj
+// directly when we already hold one, otherwise ask the driver to map the
+// logical obj/placement pair to its raw location.
+rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RadosStore* driver) const
+{
+  if (is_raw) {
+    return raw_obj;
+  }
+  rgw_raw_obj resolved;
+  driver->get_raw_obj(placement_rule, obj, &resolved);
+  return resolved;
+}
+
+// Stage version handling on a read op: optionally assert the expected
+// version (fails the op on mismatch), and always read the current version
+// into read_version.
+void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation* op)
+{
+  obj_version* check_objv = version_for_check();
+
+  if (check_objv) {
+    cls_version_check(*op, *check_objv, VER_COND_EQ);
+  }
+
+  cls_version_read(*op, &read_version);
+}
+
+// Stage version handling on a write op: optionally assert the expected
+// version, then either set an explicit new version or increment in place.
+void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
+{
+  obj_version* check_objv = version_for_check();
+  obj_version* modify_version = version_for_write();
+
+  if (check_objv) {
+    cls_version_check(*op, *check_objv, VER_COND_EQ);
+  }
+
+  if (modify_version) {
+    cls_version_set(*op, *modify_version);
+  } else {
+    cls_version_inc(*op);
+  }
+}
+
+// Mirror locally what prepare_op_for_write() did on the OSD after the write
+// succeeds, so the next operation can recheck against the up-to-date version.
+void RGWObjVersionTracker::apply_write()
+{
+  // we had a version to check against (read_version was populated) ...
+  const bool checked = (read_version.ver != 0);
+  // ... and no explicit write_version was set, so cls_version_inc() was used
+  const bool incremented = (write_version.ver == 0);
+
+  if (checked && incremented) {
+    // apply cls_version_inc() so our next operation can recheck it
+    ++read_version.ver;
+  } else {
+    // an explicit version was written; adopt it as the current version
+    read_version = write_version;
+  }
+  write_version = obj_version();
+}
+
+// Look up (or lazily create) the cached state entry for obj. Takes the
+// shared lock for the common lookup-hit case and upgrades to the exclusive
+// lock only when a new entry must be inserted.
+RGWObjStateManifest *RGWObjectCtx::get_state(const rgw_obj& obj) {
+  RGWObjStateManifest *result;
+  typename std::map<rgw_obj, RGWObjStateManifest>::iterator iter;
+  lock.lock_shared();
+  assert (!obj.empty());
+  iter = objs_state.find(obj);
+  if (iter != objs_state.end()) {
+    result = &iter->second;
+    lock.unlock_shared();
+  } else {
+    // not found: reacquire exclusively; operator[] inserts if still missing
+    lock.unlock_shared();
+    lock.lock();
+    result = &objs_state[obj];
+    lock.unlock();
+  }
+  return result;
+}
+
+// Mark obj's cached state as compressed (creates the entry if absent).
+void RGWObjectCtx::set_compressed(const rgw_obj& obj) {
+  std::unique_lock wl{lock};
+  assert (!obj.empty());
+  objs_state[obj].state.compressed = true;
+}
+
+// Mark obj's cached state as atomic (creates the entry if absent).
+// NOTE(review): takes non-const rgw_obj& though obj is never modified,
+// unlike the const& siblings — likely just an interface inconsistency.
+void RGWObjectCtx::set_atomic(rgw_obj& obj) {
+  std::unique_lock wl{lock};
+  assert (!obj.empty());
+  objs_state[obj].state.is_atomic = true;
+}
+// Request data prefetch for obj in its cached state (creates entry if absent).
+void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
+  std::unique_lock wl{lock};
+  assert (!obj.empty());
+  objs_state[obj].state.prefetch_data = true;
+}
+
+// Drop obj's cached state, but preserve the caller-set policy flags
+// (is_atomic / prefetch_data / compressed) by re-inserting a fresh entry
+// carrying them when either of the first two was set.
+void RGWObjectCtx::invalidate(const rgw_obj& obj) {
+  std::unique_lock wl{lock};
+  auto iter = objs_state.find(obj);
+  if (iter == objs_state.end()) {
+    return;
+  }
+  // remember the flags before erasing the entry
+  bool is_atomic = iter->second.state.is_atomic;
+  bool prefetch_data = iter->second.state.prefetch_data;
+  bool compressed = iter->second.state.compressed;
+
+  objs_state.erase(iter);
+
+  if (is_atomic || prefetch_data) {
+    auto& sm = objs_state[obj];
+    sm.state.is_atomic = is_atomic;
+    sm.state.prefetch_data = prefetch_data;
+    sm.state.compressed = compressed;
+  }
+}
+
+// Coroutine manager that POSTs metadata-log change notifications to peer
+// zones over their admin REST API.
+class RGWMetaNotifierManager : public RGWCoroutinesManager {
+  RGWRados* store;
+  RGWHTTPManager http_manager;
+
+public:
+  RGWMetaNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
+                                             http_manager(store->ctx(), completion_mgr) {
+    http_manager.start();
+  }
+
+  // POST the modified shard ids to /admin/log on every connection in
+  // conn_map, one coroutine stack per peer, and run them to completion.
+  int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
+    rgw_http_param_pair pairs[] = { { "type", "metadata" },
+                                    { "notify", NULL },
+                                    { NULL, NULL } };
+
+    list<RGWCoroutinesStack *> stacks;
+    for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
+      RGWRESTConn *conn = iter->second;
+      RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
+      stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
+
+      stacks.push_back(stack);
+    }
+    return run(dpp, stacks);
+  }
+};
+
+// Coroutine manager that pushes datalog change notifications to peer zones
+// that sync data from this zone.
+class RGWDataNotifierManager : public RGWCoroutinesManager {
+  RGWRados* store;
+  RGWHTTPManager http_manager;
+
+public:
+  RGWDataNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
+                                             http_manager(store->ctx(), completion_mgr) {
+    http_manager.start();
+  }
+
+  // Send the per-shard modified entries to every peer in conn_map, one
+  // RGWDataPostNotifyCR stack per peer, tagged with our zone id as source.
+  int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map,
+               bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& shards) {
+
+    list<RGWCoroutinesStack *> stacks;
+    const char *source_zone = store->svc.zone->get_zone_params().get_id().c_str();
+    for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
+      RGWRESTConn *conn = iter->second;
+      RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
+      stack->call(new RGWDataPostNotifyCR(store, http_manager, shards, source_zone, conn));
+      stacks.push_back(stack);
+    }
+
+    return run(dpp, stacks);
+  }
+};
+
+/* class RGWRadosThread */
+
+// Spawn the worker thread running this processor's periodic loop.
+void RGWRadosThread::start()
+{
+  worker = new Worker(cct, this);
+  worker->create(thread_name.c_str());
+}
+
+// Stop the worker: raise the down flag, let the subclass cancel in-flight
+// work via stop_process(), then wake and join the thread before freeing it.
+void RGWRadosThread::stop()
+{
+  down_flag = true;
+  stop_process();
+  if (worker) {
+    worker->signal();
+    worker->join();
+  }
+  delete worker;
+  worker = NULL;
+}
+
+// Worker thread body: invoke processor->process() either on a fixed
+// interval (interval_msec() > 0, re-read each round so config changes take
+// effect) or run-once-then-wait-for-signal (interval_msec() == 0), until
+// the processor reports it is going down.
+void *RGWRadosThread::Worker::entry() {
+  uint64_t msec = processor->interval_msec();
+  auto interval = std::chrono::milliseconds(msec);
+
+  do {
+    auto start = ceph::real_clock::now();
+    int r = processor->process(this);
+    if (r < 0) {
+      ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl;
+    }
+
+    if (processor->going_down())
+      break;
+
+    // elapsed time of this round
+    auto end = ceph::real_clock::now() - start;
+
+    uint64_t cur_msec = processor->interval_msec();
+    if (cur_msec != msec) { /* was it reconfigured? */
+      msec = cur_msec;
+      interval = std::chrono::milliseconds(msec);
+    }
+
+    if (cur_msec > 0) {
+      // periodic mode: sleep only for the remainder of the interval
+      if (interval <= end)
+        continue; // next round
+
+      auto wait_time = interval - end;
+      wait_interval(wait_time);
+    } else {
+      // on-demand mode: block until signal()ed
+      wait();
+    }
+  } while (!processor->going_down());
+
+  return NULL;
+}
+
+// Periodic thread that watches the metadata log for modified shards and
+// notifies peer zones (interval: rgw_md_notify_interval_msec).
+class RGWMetaNotifier : public RGWRadosThread {
+  RGWMetaNotifierManager notify_mgr;
+  RGWMetadataLog *const log;
+
+  uint64_t interval_msec() override {
+    return cct->_conf->rgw_md_notify_interval_msec;
+  }
+  void stop_process() override {
+    notify_mgr.stop();
+  }
+public:
+  RGWMetaNotifier(RGWRados *_driver, RGWMetadataLog* log)
+    : RGWRadosThread(_driver, "meta-notifier"), notify_mgr(_driver), log(log) {}
+
+  int process(const DoutPrefixProvider *dpp) override;
+};
+
+// One notifier round: drain the set of modified mdlog shards and broadcast
+// them to every zone in the zone connection map.
+int RGWMetaNotifier::process(const DoutPrefixProvider *dpp)
+{
+  set<int> shards;
+
+  // fetches and resets the log's modified-shard set
+  log->read_clear_modified(shards);
+
+  if (shards.empty()) {
+    return 0;
+  }
+
+  for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
+    ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
+  }
+
+  notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards);
+
+  return 0;
+}
+
+// Periodic thread that watches the data log for modified shards and
+// notifies peer zones (interval: rgw_data_notify_interval_msec).
+class RGWDataNotifier : public RGWRadosThread {
+  RGWDataNotifierManager notify_mgr;
+  // NOTE(review): 'entry' is not referenced in the code visible here —
+  // confirm it is used elsewhere or remove
+  bc::flat_set<rgw_data_notify_entry> entry;
+
+  uint64_t interval_msec() override {
+    return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
+  }
+  void stop_process() override {
+    notify_mgr.stop();
+  }
+public:
+  RGWDataNotifier(RGWRados *_driver) : RGWRadosThread(_driver, "data-notifier"), notify_mgr(_driver) {}
+
+  int process(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWDataNotifier::process(const DoutPrefixProvider *dpp)
+{
+  auto data_log = store->svc.datalog_rados;
+  if (!data_log) {
+    return 0;
+  }
+
+  auto shards = data_log->read_clear_modified();
+
+  if (shards.empty()) {
+    return 0;
+  }
+
+  for (const auto& [shard_id, entries] : shards) {
+    bc::flat_set<rgw_data_notify_entry>::iterator it;
+    for (const auto& entry : entries) {
+      ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id="
+        << shard_id << ":" << entry.gen << ":" << entry.key << dendl;
+    }
+  }
+
+  notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards);
+
+  return 0;
+}
+
+// Abstract base for the metadata/data sync worker threads: adds a required
+// init() step on top of RGWRadosThread's periodic process() loop.
+class RGWSyncProcessorThread : public RGWRadosThread {
+public:
+  RGWSyncProcessorThread(RGWRados *_driver, const string& thread_name = "radosgw") : RGWRadosThread(_driver, thread_name) {}
+  RGWSyncProcessorThread(RGWRados *_driver) : RGWRadosThread(_driver) {}
+  ~RGWSyncProcessorThread() override {}
+  int init(const DoutPrefixProvider *dpp) override = 0 ;
+  int process(const DoutPrefixProvider *dpp) override = 0;
+};
+
+class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
+{
+  RGWMetaSyncStatusManager sync;
+
+  uint64_t interval_msec() override {
+    return 0; /* no interval associated, it'll run once until stopped */
+  }
+  void stop_process() override {
+    sync.stop();
+  }
+public:
+  RGWMetaSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados)
+    : RGWSyncProcessorThread(_driver->getRados(), "meta-sync"), sync(_driver, async_rados) {}
+
+  void wakeup_sync_shards(set<int>& shard_ids) {
+    for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
+      sync.wakeup(*iter);
+    }
+  }
+  RGWMetaSyncStatusManager* get_manager() { return &sync; }
+
+  int init(const DoutPrefixProvider *dpp) override {
+    int ret = sync.init(dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl;
+      return ret;
+    }
+    return 0;
+  }
+
+  int process(const DoutPrefixProvider *dpp) override {
+    sync.run(dpp, null_yield);
+    return 0;
+  }
+};
+
// Worker thread that drives data sync from one source zone.
// Retries sync.init() every DATA_SYNC_INIT_WAIT_SEC until it succeeds,
// then runs the sync state machine until stopped.
class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
{
  PerfCountersRef counters;
  RGWDataSyncStatusManager sync;
  bool initialized;

  uint64_t interval_msec() override {
    if (initialized) {
      return 0; /* no interval associated, it'll run once until stopped */
    } else {
#define DATA_SYNC_INIT_WAIT_SEC 20
      return DATA_SYNC_INIT_WAIT_SEC * 1000;
    }
  }
  void stop_process() override {
    sync.stop();
  }
public:
  // NOTE: `store` used in the counters initializer is the base-class member,
  // already set by the RGWSyncProcessorThread constructor, which runs before
  // the member initializers here.
  RGWDataSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
                             const RGWZone* source_zone)
    : RGWSyncProcessorThread(_driver->getRados(), "data-sync"),
      counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
      sync(_driver, async_rados, source_zone->id, counters.get()),
      initialized(false) {}

  // Wake the sync machinery for each (shard, entry-set) pair a peer reported.
  void wakeup_sync_shards(bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries) {
    for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
      sync.wakeup(iter->first, iter->second);
    }
  }

  RGWDataSyncStatusManager* get_manager() { return &sync; }

  int init(const DoutPrefixProvider *dpp) override {
    // real initialization is deferred to process(), retried on an interval
    return 0;
  }

  int process(const DoutPrefixProvider *dpp) override {
    while (!initialized) {
      if (going_down()) {
        return 0;
      }
      int ret = sync.init(dpp);
      if (ret >= 0) {
        initialized = true;
        break;
      }
      /* we'll be back! */
      // returning lets RGWRadosThread re-invoke us after interval_msec()
      return 0;
    }
    sync.run(dpp);
    return 0;
  }
};
+
// Worker thread that periodically trims the metadata, data and bucket-index
// sync logs by running trim coroutines under a shared coroutine manager.
// Also acts as its own DoutPrefixProvider for log prefixing.
class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
{
  RGWCoroutinesManager crs;
  rgw::sal::RadosStore* store;
  rgw::BucketTrimManager *bucket_trim;  // not owned; owned by RGWRados
  RGWHTTPManager http;
  const utime_t trim_interval;          // seconds between trim passes

  uint64_t interval_msec() override { return 0; }
  void stop_process() override { crs.stop(); }
public:
  RGWSyncLogTrimThread(rgw::sal::RadosStore* store, rgw::BucketTrimManager *bucket_trim,
                       int interval)
    : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
      crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
      bucket_trim(bucket_trim),
      http(store->ctx(), crs.get_completion_mgr()),
      trim_interval(interval, 0)
  {}

  int init(const DoutPrefixProvider *dpp) override {
    return http.start();
  }
  int process(const DoutPrefixProvider *dpp) override {
    list<RGWCoroutinesStack*> stacks;
    // mdlog trim always runs
    auto metatrimcr = create_meta_log_trim_cr(this, static_cast<rgw::sal::RadosStore*>(store), &http,
                                             cct->_conf->rgw_md_log_max_shards,
                                             trim_interval);
    if (!metatrimcr) {
      ldpp_dout(dpp, -1) << "Bailing out of trim thread!" << dendl;
      return -EINVAL;
    }
    auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
    meta->call(metatrimcr);

    stacks.push_back(meta);

    // datalog and bucket-index trim only matter when other zones sync our data
    if (store->svc()->zone->sync_module_exports_data()) {
      auto data = new RGWCoroutinesStack(store->ctx(), &crs);
      data->call(create_data_log_trim_cr(dpp, static_cast<rgw::sal::RadosStore*>(store), &http,
                                         cct->_conf->rgw_data_log_num_shards,
                                         trim_interval));
      stacks.push_back(data);

      auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
      bucket->call(bucket_trim->create_bucket_trim_cr(&http));
      stacks.push_back(bucket);
    }

    // blocks until the coroutines complete or crs.stop() is called
    crs.run(dpp, stacks);
    return 0;
  }

  // implements DoutPrefixProvider
  CephContext *get_cct() const override { return store->ctx(); }
  unsigned get_subsys() const override
  {
    return dout_subsys;
  }

  std::ostream& gen_prefix(std::ostream& out) const override
  {
    return out << "sync log trim: ";
  }

};
+
+void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
+{
+  std::lock_guard l{meta_sync_thread_lock};
+  if (meta_sync_processor_thread) {
+    meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
+  }
+}
+
+void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries)
+{
+  ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", entries=" << entries << dendl;
+  for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+    ldpp_dout(dpp, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
+    bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
+    for (const auto& [key, gen] : entries) {
+      ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", key=" << key
+                        << ", gen=" << gen << dendl;
+    }
+  }
+
+  std::lock_guard l{data_sync_thread_lock};
+  auto iter = data_sync_processor_threads.find(source_zone);
+  if (iter == data_sync_processor_threads.end()) {
+    ldpp_dout(dpp, 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
+    return;
+  }
+
+  RGWDataSyncProcessorThread *thread = iter->second;
+  ceph_assert(thread);
+  thread->wakeup_sync_shards(entries);
+}
+
+RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
+{
+  std::lock_guard l{meta_sync_thread_lock};
+  if (meta_sync_processor_thread) {
+    return meta_sync_processor_thread->get_manager();
+  }
+  return nullptr;
+}
+
+RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
+{
+  std::lock_guard l{data_sync_thread_lock};
+  auto thread = data_sync_processor_threads.find(source_zone);
+  if (thread == data_sync_processor_threads.end()) {
+    return nullptr;
+  }
+  return thread->second->get_manager();
+}
+
+int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment)
+{
+  IoCtx ioctx;
+  int r = open_pool_ctx(dpp, pool, ioctx, false);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
+    return r;
+  }
+
+  bool req;
+  r = ioctx.pool_requires_alignment2(&req);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
+      << r << dendl;
+    return r;
+  }
+
+  if (!req) {
+    *alignment = 0;
+    return 0;
+  }
+
+  uint64_t align;
+  r = ioctx.pool_required_alignment2(&align);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
+      << r << dendl;
+    return r;
+  }
+  if (align != 0) {
+    ldpp_dout(dpp, 20) << "required alignment=" << align << dendl;
+  }
+  *alignment = align;
+  return 0;
+}
+
+void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
+{
+  if (alignment == 0) {
+    *max_size = size;
+    return;
+  }
+
+  if (size <= alignment) {
+    *max_size = alignment;
+    return;
+  }
+
+  *max_size = size - (size % alignment);
+}
+
+int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
+{
+  uint64_t alignment;
+  int r = get_required_alignment(dpp, pool, &alignment);
+  if (r < 0) {
+    return r;
+  }
+
+  if (palignment) {
+    *palignment = alignment;
+  }
+
+  uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
+
+  get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
+
+  ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
+
+  return 0;
+}
+
+int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
+                                 uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
+{
+  rgw_pool pool;
+  if (!get_obj_data_pool(placement_rule, obj, &pool)) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
+    return -EIO;
+  }
+  return get_max_chunk_size(pool, max_chunk_size, dpp, palignment);
+}
+
+void add_datalog_entry(const DoutPrefixProvider* dpp,
+                       RGWDataChangesLog* datalog,
+                       const RGWBucketInfo& bucket_info,
+                       uint32_t shard_id)
+{
+  const auto& logs = bucket_info.layout.logs;
+  if (logs.empty()) {
+    return;
+  }
+  int r = datalog->add_entry(dpp, bucket_info, logs.back(), shard_id);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
+  } // datalog error is not fatal
+}
+
+class RGWIndexCompletionManager;
+
+struct complete_op_data {
+  ceph::mutex lock = ceph::make_mutex("complete_op_data");
+  AioCompletion *rados_completion{nullptr};
+  int manager_shard_id{-1};
+  RGWIndexCompletionManager *manager{nullptr};
+  rgw_obj obj;
+  RGWModifyOp op;
+  string tag;
+  rgw_bucket_entry_ver ver;
+  cls_rgw_obj_key key;
+  rgw_bucket_dir_entry_meta dir_meta;
+  list<cls_rgw_obj_key> remove_objs;
+  bool log_op;
+  uint16_t bilog_op;
+  rgw_zone_set zones_trace;
+
+  bool stopped{false};
+
+  void stop() {
+    std::lock_guard l{lock};
+    stopped = true;
+  }
+};
+
+class RGWIndexCompletionManager {
+  RGWRados* const store;
+  const uint32_t num_shards;
+  ceph::containers::tiny_vector<ceph::mutex> locks;
+  std::vector<set<complete_op_data*>> completions;
+  std::vector<complete_op_data*> retry_completions;
+
+  std::condition_variable cond;
+  std::mutex retry_completions_lock;
+  bool _stop{false};
+  std::thread retry_thread;
+
+  // used to distribute the completions and the locks they use across
+  // their respective vectors; it will get incremented and can wrap
+  // around back to 0 without issue
+  std::atomic<uint32_t> cur_shard {0};
+
+  void process();
+  
+  void add_completion(complete_op_data *completion);
+  
+  void stop() {
+    if (retry_thread.joinable()) {
+      _stop = true;
+      cond.notify_all();
+      retry_thread.join();
+    }
+
+    for (uint32_t i = 0; i < num_shards; ++i) {
+      std::lock_guard l{locks[i]};
+      for (auto c : completions[i]) {
+        c->stop();
+      }
+    }
+    completions.clear();
+  }
+  
+  uint32_t next_shard() {
+    return cur_shard++ % num_shards;
+  }
+
+public:
+  RGWIndexCompletionManager(RGWRados *_driver) :
+    store(_driver),
+    num_shards(store->ctx()->_conf->rgw_thread_pool_size),
+    locks{ceph::make_lock_container<ceph::mutex>(
+      num_shards,
+      [](const size_t i) {
+        return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
+                               std::to_string(i));
+      })},
+    completions(num_shards),
+    retry_thread(&RGWIndexCompletionManager::process, this)
+    {}
+
+  ~RGWIndexCompletionManager() {
+    stop();
+  }
+
+  void create_completion(const rgw_obj& obj,
+                         RGWModifyOp op, string& tag,
+                         rgw_bucket_entry_ver& ver,
+                         const cls_rgw_obj_key& key,
+                         rgw_bucket_dir_entry_meta& dir_meta,
+                         list<cls_rgw_obj_key> *remove_objs, bool log_op,
+                         uint16_t bilog_op,
+                         rgw_zone_set *zones_trace,
+                         complete_op_data **result);
+
+  bool handle_completion(completion_t cb, complete_op_data *arg);
+
+  CephContext* ctx() {
+    return store->ctx();
+  }
+};
+
// librados AIO callback for bucket-index completions.
// Ownership note: `completion` may be deleted on this path, so the mutex is
// unlocked manually before any delete (the lock is a member of the object
// being destroyed and must not be held at destruction).
static void obj_complete_cb(completion_t cb, void *arg)
{
  complete_op_data *completion = reinterpret_cast<complete_op_data*>(arg);
  completion->lock.lock();
  if (completion->stopped) {
    // manager already shut down; nobody will retry or reference this entry
    completion->lock.unlock(); /* can drop lock, no one else is referencing us */
    delete completion;
    return;
  }
  // returns true when the completion is finished (ok or terminal error);
  // returns false when it was re-queued for retry and must stay alive
  bool need_delete = completion->manager->handle_completion(cb, completion);
  completion->lock.unlock();
  if (need_delete) {
    delete completion;
  }
}
+
// Retry-thread main loop: waits for completions that failed with
// -ERR_BUSY_RESHARDING, then re-drives each bucket-index operation under
// guard_reshard() and records a datalog entry on success.
void RGWIndexCompletionManager::process()
{
  DoutPrefix dpp(store->ctx(), dout_subsys, "rgw index completion thread: ");
  while(!_stop) {
    std::vector<complete_op_data*> comps;

    {
      // block until there is work or we are told to stop
      std::unique_lock l{retry_completions_lock};
      cond.wait(l, [this](){return _stop || !retry_completions.empty();});
      if (_stop) {
        return;
      }
      // take the whole batch; keeps the lock hold time short
      retry_completions.swap(comps);
    }

    for (auto c : comps) {
      // each entry is owned by this loop iteration from here on
      std::unique_ptr<complete_op_data> up{c};

      ldpp_dout(&dpp, 20) << __func__ << "(): handling completion for key=" << c->key << dendl;

      RGWRados::BucketShard bs(store);
      RGWBucketInfo bucket_info;

      int r = bs.init(c->obj.bucket, c->obj, &bucket_info, &dpp);
      if (r < 0) {
        ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
        /* not much to do */
        continue;
      }

      // guard_reshard() re-runs the lambda if the bucket resharded underneath us
      r = store->guard_reshard(&dpp, &bs, c->obj, bucket_info,
                            [&](RGWRados::BucketShard *bs) -> int {
                              const bool bitx = ctx()->_conf->rgw_bucket_index_transaction_instrumentation;
                              ldout_bitx(bitx, &dpp, 10) <<
                                "ENTERING " << __func__ << ": bucket-shard=" << bs <<
                                " obj=" << c->obj << " tag=" << c->tag <<
                                " op=" << c->op << ", remove_objs=" << c->remove_objs << dendl_bitx;
                              ldout_bitx(bitx, &dpp, 25) <<
                                "BACKTRACE: " << __func__ << ": " << ClibBackTrace(1) << dendl_bitx;

                              librados::ObjectWriteOperation o;
                              o.assert_exists();
                              // fail fast with -ERR_BUSY_RESHARDING if a reshard is in progress
                              cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
                              cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
                                                         c->log_op, c->bilog_op, &c->zones_trace);
                              int ret = bs->bucket_obj.operate(&dpp, &o, null_yield);
                              ldout_bitx(bitx, &dpp, 10) <<
                                "EXITING " << __func__ << ": ret=" << dendl_bitx;
                              return ret;
                             });
      if (r < 0) {
        ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
        /* ignoring error, can't do anything about it */
        continue;
      }

      // index updated; let peer zones know about the change
      add_datalog_entry(&dpp, store->svc.datalog_rados, bucket_info, bs.shard_id);
    }
  }
}
+
+void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
+                                                  RGWModifyOp op, string& tag,
+                                                  rgw_bucket_entry_ver& ver,
+                                                  const cls_rgw_obj_key& key,
+                                                  rgw_bucket_dir_entry_meta& dir_meta,
+                                                  list<cls_rgw_obj_key> *remove_objs, bool log_op,
+                                                  uint16_t bilog_op,
+                                                  rgw_zone_set *zones_trace,
+                                                  complete_op_data **result)
+{
+  complete_op_data *entry = new complete_op_data;
+
+  int shard_id = next_shard();
+
+  entry->manager_shard_id = shard_id;
+  entry->manager = this;
+  entry->obj = obj;
+  entry->op = op;
+  entry->tag = tag;
+  entry->ver = ver;
+  entry->key = key;
+  entry->dir_meta = dir_meta;
+  entry->log_op = log_op;
+  entry->bilog_op = bilog_op;
+
+  if (remove_objs) {
+    for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
+      entry->remove_objs.push_back(*iter);
+    }
+  }
+
+  if (zones_trace) {
+    entry->zones_trace = *zones_trace;
+  } else {
+    entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
+  }
+
+  *result = entry;
+
+  entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
+
+  std::lock_guard l{locks[shard_id]};
+  const auto ok = completions[shard_id].insert(entry).second;
+  ceph_assert(ok);
+}
+
+void RGWIndexCompletionManager::add_completion(complete_op_data *completion) {
+  {
+    std::lock_guard l{retry_completions_lock};
+    retry_completions.push_back(completion);
+  }
+  cond.notify_all();
+}
+
+bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
+{
+  int shard_id = arg->manager_shard_id;
+  {
+    std::lock_guard l{locks[shard_id]};
+
+    auto& comps = completions[shard_id];
+
+    auto iter = comps.find(arg);
+    if (iter == comps.end()) {
+      ldout(arg->manager->ctx(), 0) << __func__ << "(): cannot find completion for obj=" << arg->key << dendl;
+      return true;
+    }
+
+    comps.erase(iter);
+  }
+
+  int r = rados_aio_get_return_value(cb);
+  if (r != -ERR_BUSY_RESHARDING) {
+    ldout(arg->manager->ctx(), 20) << __func__ << "(): completion " << 
+      (r == 0 ? "ok" : "failed with " + to_string(r)) << 
+      " for obj=" << arg->key << dendl;
+    return true;
+  }
+  add_completion(arg);
+  ldout(arg->manager->ctx(), 20) << __func__ << "(): async completion added for obj=" << arg->key << dendl;
+  return false;
+}
+
// Orderly shutdown of everything init_complete() started. The teardown
// order matters: async requests are drained first, then worker threads are
// stopped, then joined/deleted, then services and caches are released.
void RGWRados::finalize()
{
  /* Before joining any sync threads, drain outstanding requests &
   * mark the async_processor as going_down() */
  if (svc.rados) {
    svc.rados->stop_processor();
  }

  if (run_sync_thread) {
    // phase 1: signal all sync threads to stop (no joins yet)
    std::lock_guard l{meta_sync_thread_lock};
    meta_sync_processor_thread->stop();

    std::lock_guard dl{data_sync_thread_lock};
    for (auto iter : data_sync_processor_threads) {
      RGWDataSyncProcessorThread *thread = iter.second;
      thread->stop();
    }
    if (sync_log_trimmer) {
      sync_log_trimmer->stop();
    }
  }
  if (run_sync_thread) {
    // phase 2: destroy the threads (destructors join)
    delete meta_sync_processor_thread;
    meta_sync_processor_thread = NULL;
    std::lock_guard dl{data_sync_thread_lock};
    for (auto iter : data_sync_processor_threads) {
      RGWDataSyncProcessorThread *thread = iter.second;
      delete thread;
    }
    data_sync_processor_threads.clear();
    delete sync_log_trimmer;
    sync_log_trimmer = nullptr;
    bucket_trim = boost::none;
  }
  if (meta_notifier) {
    meta_notifier->stop();
    delete meta_notifier;
  }
  if (data_notifier) {
    data_notifier->stop();
    delete data_notifier;
  }
  delete sync_tracer;

  delete lc;
  lc = NULL;

  delete gc;
  gc = NULL;

  delete obj_expirer;
  obj_expirer = NULL;

  RGWQuotaHandler::free_handler(quota_handler);
  if (cr_registry) {
    // reference-counted; put() releases our reference
    cr_registry->put();
  }

  svc.shutdown();

  delete binfo_cache;
  delete obj_tombstone_cache;
  if (d3n_data_cache)
    delete d3n_data_cache;

  if (reshard_wait.get()) {
    reshard_wait->stop();
    reshard_wait.reset();
  }

  if (run_reshard_thread) {
    reshard->stop_processor();
  }
  delete reshard;
  delete index_completion_manager;

  rgw::notify::shutdown();
}
+
+/** 
+ * Initialize the RADOS instance and prepare to do other ops
+ * Returns 0 on success, -ERR# on failure.
+ */
+int RGWRados::init_rados()
+{
+  int ret = 0;
+
+  ret = rados.init_with_context(cct);
+  if (ret < 0) {
+    return ret;
+  }
+  ret = rados.connect();
+  if (ret < 0) {
+    return ret;
+  }
+
+  auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
+    new RGWCoroutinesManagerRegistry(cct)};
+  ret = crs->hook_to_admin_command("cr dump");
+  if (ret < 0) {
+    return ret;
+  }
+
+  cr_registry = crs.release();
+
+  if (use_datacache) {
+    d3n_data_cache = new D3nDataCache();
+    d3n_data_cache->init(cct);
+  }
+
+  return ret;
+}
+
+int RGWRados::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, const map<string, string>& meta)
+{
+  string name = cct->_conf->name.get_id();
+  if (name.compare(0, 4, "rgw.") == 0) {
+    name = name.substr(4);
+  }
+  map<string,string> metadata = meta;
+  metadata["num_handles"] = "1"s;
+  metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
+  metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
+  metadata["zone_name"] = svc.zone->zone_name();
+  metadata["zone_id"] = svc.zone->zone_id().id;
+  metadata["realm_name"] = svc.zone->get_realm().get_name();
+  metadata["realm_id"] = svc.zone->get_realm().get_id();
+  metadata["id"] = name;
+  int ret = rados.service_daemon_register(
+    daemon_type,
+    stringify(rados.get_instance_id()),
+    metadata);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+int RGWRados::update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status)
+{
+  int ret = rados.service_daemon_update_status(move(status));
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+/** 
+ * Initialize the RADOS instance and prepare to do other ops
+ * Returns 0 on success, -ERR# on failure.
+ */
/** 
 * Initialize the RADOS instance and prepare to do other ops
 * Returns 0 on success, -ERR# on failure.
 */
// Second-phase initialization: opens the service pools, then starts the
// GC/LC/expirer/reshard machinery and (when applicable) the multisite
// notifier and sync threads. finalize() undoes all of this.
int RGWRados::init_complete(const DoutPrefixProvider *dpp)
{
  int ret;

  /* 
   * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
   */
  sync_module = svc.sync_modules->get_sync_module();

  // open (and create if missing) each service pool
  ret = open_root_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_gc_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_lc_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_objexp_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_reshard_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_notif_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  pools_initialized = true;

  if (use_gc) {
    gc = new RGWGC();
    gc->initialize(cct, this);
  } else {
    ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl;
  }

  obj_expirer = new RGWObjectExpirer(this->driver);

  if (use_gc_thread && use_gc) {
    gc->start_processor();
    obj_expirer->start_processor();
  }

  auto& current_period = svc.zone->get_current_period();
  auto& zonegroup = svc.zone->get_zonegroup();
  auto& zone_params = svc.zone->get_zone_params();
  auto& zone = svc.zone->get_zone();

  /* no point of running sync thread if we don't have a master zone configured
    or there is no rest_master_conn */
  if (!svc.zone->need_to_sync()) {
    run_sync_thread = false;
  }

  if (svc.zone->is_meta_master()) {
    // meta master notifies peers of mdlog changes
    auto md_log = svc.mdlog->get_log(current_period.get_id());
    meta_notifier = new RGWMetaNotifier(this, md_log);
    meta_notifier->start();
  }

  /* init it anyway, might run sync through radosgw-admin explicitly */
  sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
  sync_tracer->init(this);
  ret = sync_tracer->hook_to_admin_command();
  if (ret < 0) {
    return ret;
  }

  if (run_sync_thread) {
    // warn about placement targets the local zone cannot serve
    for (const auto &pt: zonegroup.placement_targets) {
      if (zone_params.placement_pools.find(pt.second.name)
          == zone_params.placement_pools.end()){
        ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target "
                      << pt.second.name << " present in zonegroup" << dendl;
      }
    }
    auto async_processor = svc.rados->get_async_processor();
    std::lock_guard l{meta_sync_thread_lock};
    meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->driver, async_processor);
    ret = meta_sync_processor_thread->init(dpp);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
      return ret;
    }
    meta_sync_processor_thread->start();

    // configure the bucket trim manager
    rgw::BucketTrimConfig config;
    rgw::configure_bucket_trim(cct, config);

    bucket_trim.emplace(this->driver, config);
    ret = bucket_trim->init();
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl;
      return ret;
    }
    svc.datalog_rados->set_observer(&*bucket_trim);

    // one data sync thread per source zone
    std::lock_guard dl{data_sync_thread_lock};
    for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
      ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
      auto *thread = new RGWDataSyncProcessorThread(this->driver, svc.rados->get_async_processor(), source_zone);
      ret = thread->init(dpp);
      if (ret < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl;
        return ret;
      }
      thread->start();
      data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
    }
    auto interval = cct->_conf->rgw_sync_log_trim_interval;
    if (interval > 0) {
      sync_log_trimmer = new RGWSyncLogTrimThread(this->driver, &*bucket_trim, interval);
      ret = sync_log_trimmer->init(dpp);
      if (ret < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
        return ret;
      }
      sync_log_trimmer->start();
    }
  }
  if (cct->_conf->rgw_data_notify_interval_msec) {
    data_notifier = new RGWDataNotifier(this);
    data_notifier->start();
  }

  binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
  binfo_cache->init(svc.cache);

  lc = new RGWLC();
  lc->initialize(cct, this->driver);

  if (use_lc_thread)
    lc->start_processor();

  quota_handler = RGWQuotaHandler::generate_handler(dpp, this->driver, quota_threads);

  // override takes precedence over the zone's configured shard count,
  // but is clamped to the supported maximum
  bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
                             zone.bucket_index_max_shards);
  if (bucket_index_max_shards > get_max_bucket_shards()) {
    bucket_index_max_shards = get_max_bucket_shards();
    ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: "
      << get_max_bucket_shards() << dendl;
  }
  ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;

  bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */

  if (need_tombstone_cache) {
    obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
  }

  reshard_wait = std::make_shared<RGWReshardWait>();

  reshard = new RGWReshard(this->driver);

  // disable reshard thread based on zone/zonegroup support
  run_reshard_thread = run_reshard_thread && svc.zone->can_reshard();

  if (run_reshard_thread)  {
    reshard->start_processor();
  }

  index_completion_manager = new RGWIndexCompletionManager(this);
  ret = rgw::notify::init(cct, driver, dpp);
  if (ret < 0 ) {
    // non-fatal: ret is still returned to the caller below
    ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl;
  }

  return ret;
}
+
+int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp)
+{
+  if (raw) {
+    return svc.init_raw(cct, use_cache, null_yield, dpp);
+  }
+
+  return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp);
+}
+
// Initialize the control layer on top of the already-initialized services.
int RGWRados::init_ctl(const DoutPrefixProvider *dpp)
{
  return ctl.init(&svc, driver, dpp);
}
+
+/** 
+ * Initialize the RADOS instance and prepare to do other ops
+ * Returns 0 on success, -ERR# on failure.
+ */
+int RGWRados::init_begin(const DoutPrefixProvider *dpp)
+{
+  int ret;
+
+  inject_notify_timeout_probability =
+    cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
+  max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
+
+  ret = init_svc(false, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
+    return ret;
+  }
+
+  ret = init_ctl(dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
+    return ret;
+  }
+
+  host_id = svc.zone_utils->gen_host_id();
+
+  return init_rados();
+}
+
+/**
+ * Open the pool used as root for this gateway
+ * Returns: 0 on success, -ERR# otherwise.
+ */
/**
 * Open the pool used as root for this gateway
 * Returns: 0 on success, -ERR# otherwise.
 */
// Creates the pool and flags it mostly-omap (both trailing `true` args).
int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
}
+
// Open (creating if needed) the garbage-collection pool.
int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
}
+
// Open (creating if needed) the lifecycle pool.
int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
}
+
// Open (creating if needed) the object-expirer pool; note it uses the
// zone's log pool.
int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
}
+
// Open (creating if needed) the reshard pool.
int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
}
+
// Open (creating if needed) the notification pool.
int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true);
}
+
// Open an IoCtx on an arbitrary pool, creating the pool if it does not
// exist; `mostly_omap` hints the cluster about expected usage.
int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
                           bool mostly_omap)
{
  constexpr bool create = true; // create the pool if it doesn't exist
  return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap);
}
+
+/**** logs ****/
+
/**** logs ****/

// Iteration state behind the opaque RGWAccessHandle used by
// log_list_init()/log_list_next(); freed by log_list_next() at end of listing.
struct log_list_state {
  string prefix;                    // only list objects starting with this
  librados::IoCtx io_ctx;           // open handle on the log pool
  librados::NObjectIterator obit;   // current position in the pool listing
};
+
+int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle)
+{
+  log_list_state *state = new log_list_state;
+  int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
+  if (r < 0) {
+    delete state;
+    return r;
+  }
+  state->prefix = prefix;
+  state->obit = state->io_ctx.nobjects_begin();
+  *handle = (RGWAccessHandle)state;
+  return 0;
+}
+
+int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
+{
+  log_list_state *state = static_cast<log_list_state *>(handle);
+  while (true) {
+    if (state->obit == state->io_ctx.nobjects_end()) {
+      delete state;
+      return -ENOENT;
+    }
+    if (state->prefix.length() &&
+       state->obit->get_oid().find(state->prefix) != 0) {
+      state->obit++;
+      continue;
+    }
+    *name = state->obit->get_oid();
+    state->obit++;
+    break;
+  }
+  return 0;
+}
+
+int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name)
+{
+  librados::IoCtx io_ctx;
+  int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
+  if (r < 0)
+    return r;
+  return io_ctx.remove(name);
+}
+
+struct log_show_state {
+  librados::IoCtx io_ctx;
+  bufferlist bl;
+  bufferlist::const_iterator p;
+  string name;
+  uint64_t pos;
+  bool eof;
+  log_show_state() : pos(0), eof(false) {}
+};
+
+int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle)
+{
+  log_show_state *state = new log_show_state;
+  int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
+  if (r < 0) {
+    delete state;
+    return r;
+  }
+  state->name = name;
+  *handle = (RGWAccessHandle)state;
+  return 0;
+}
+
// Decode the next rgw_log_entry from the log object opened by
// log_show_init(). Returns 1 when an entry was decoded into *entry, 0 at
// end-of-file, or a negative error. Data is read from rados in 1 MiB chunks
// and buffered in the handle's state between calls.
int RGWRados::log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry)
{
  log_show_state *state = static_cast<log_show_state *>(handle);
  off_t off = state->p.get_off();  // how far into the buffer we've decoded

  ldpp_dout(dpp, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
	   << " off " << off
	   << " eof " << (int)state->eof
	   << dendl;
  // read some?
  unsigned chunk = 1024*1024;
  // refill when less than half a chunk of undecoded data remains and the
  // object has not been fully read yet
  if ((state->bl.length() - off) < chunk/2 && !state->eof) {
    bufferlist more;
    int r = state->io_ctx.read(state->name, more, chunk, state->pos);
    if (r < 0)
      return r;
    state->pos += r;
    bufferlist old;
    try {
      // keep only the not-yet-decoded tail of the current buffer
      old.substr_of(state->bl, off, state->bl.length() - off);
    } catch (buffer::error& err) {
      return -EINVAL;
    }
    state->bl = std::move(old);
    state->bl.claim_append(more);
    // the decode cursor restarts at the beginning of the kept tail
    state->p = state->bl.cbegin();
    if ((unsigned)r < chunk)
      state->eof = true;  // short read: end of the object reached
    ldpp_dout(dpp, 10) << " read " << r << dendl;
  }

  if (state->p.end())
    return 0;  // end of file
  try {
    decode(*entry, state->p);
  }
  catch (const buffer::error &e) {
    return -EINVAL;  // truncated/corrupt entry
  }
  return 1;  // one entry decoded
}
+
+/**
+ * usage_log_hash: get usage log key hash, based on name and index
+ *
+ * Get the usage object name. Since a user may have more than 1
+ * object holding that info (multiple shards), we use index to
+ * specify that shard number. Once index exceeds max shards it
+ * wraps.
+ * If name is not being set, results for all users will be returned
+ * and index will wrap only after total shards number.
+ *
+ * @param cct [in] ceph context
+ * @param name [in] user name
+ * @param hash [out] hash value
+ * @param index [in] shard index number 
+ */
+static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
+{
+  uint32_t val = index;
+
+  if (!name.empty()) {
+    int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
+    val %= max_user_shards;
+    val += ceph_str_hash_linux(name.c_str(), name.size());
+  }
+  char buf[17];
+  int max_shards = cct->_conf->rgw_usage_max_shards;
+  snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
+  hash = buf;
+}
+
+int RGWRados::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
+{
+  uint32_t index = 0;
+
+  map<string, rgw_usage_log_info> log_objs;
+
+  string hash;
+  string last_user;
+
+  /* restructure usage map, zone by object hash */
+  map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
+  for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
+    const rgw_user_bucket& ub = iter->first;
+    RGWUsageBatch& info = iter->second;
+
+    if (ub.user.empty()) {
+      ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
+      continue;
+    }
+
+    if (ub.user != last_user) {
+      /* index *should* be random, but why waste extra cycles
+         in most cases max user shards is not going to exceed 1,
+         so just incrementing it */
+      usage_log_hash(cct, ub.user, hash, index++);
+    }
+    last_user = ub.user;
+    vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
+
+    for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
+      v.push_back(miter->second);
+    }
+  }
+
+  map<string, rgw_usage_log_info>::iterator liter;
+
+  for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
+    int r = cls_obj_usage_log_add(dpp, liter->first, liter->second);
+    if (r < 0)
+      return r;
+  }
+  return 0;
+}
+
// Read aggregated usage-log entries for a user (optionally restricted to one
// bucket) within [start_epoch, end_epoch). Visits each usage shard object in
// turn, resuming from usage_iter, until max_entries have been collected or
// every shard has been visited (i.e. the hash wraps back to the first one).
// *is_truncated reports whether more data remains in the current shard.
int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
                         uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
                        rgw_usage_log_entry>& usage)
{
  uint32_t num = max_entries;  // entries we may still return
  string hash, first_hash;
  string user_str = user.to_str();
  // shard 0 for this user; iteration stops once we wrap back around to it
  usage_log_hash(cct, user_str, first_hash, 0);

  if (usage_iter.index) {
    // resume from the shard recorded in the caller's iterator
    usage_log_hash(cct, user_str, hash, usage_iter.index);
  } else {
    hash = first_hash;
  }

  usage.clear();

  do {
    map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
    map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;

    int ret =  cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num,
                                    usage_iter.read_iter, ret_usage, is_truncated);
    if (ret == -ENOENT)
      goto next;  // shard object absent: nothing was ever logged there

    if (ret < 0)
      return ret;

    num -= ret_usage.size();

    // merge this shard's results into the caller's aggregate map
    for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
      usage[iter->first].aggregate(iter->second);
    }

next:
    if (!*is_truncated) {
      // shard exhausted: reset the intra-shard cursor, advance to next shard
      usage_iter.read_iter.clear();
      usage_log_hash(cct, user_str, hash, ++usage_iter.index);
    }
  } while (num && !*is_truncated && hash != first_hash);
  return 0;
}
+
+int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
+{
+  uint32_t index = 0;
+  string hash, first_hash;
+  string user_str = user.to_str();
+  usage_log_hash(cct, user_str, first_hash, index);
+
+  hash = first_hash;
+  do {
+    int ret =  cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch);
+
+    if (ret < 0 && ret != -ENOENT)
+      return ret;
+
+    usage_log_hash(cct, user_str, hash, ++index);
+  } while (hash != first_hash);
+
+  return 0;
+}
+
+
+int RGWRados::clear_usage(const DoutPrefixProvider *dpp)
+{
+  auto max_shards = cct->_conf->rgw_usage_max_shards;
+  int ret=0;
+  for (unsigned i=0; i < max_shards; i++){
+    string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
+    ret = cls_obj_usage_log_clear(dpp, oid);
+    if (ret < 0){
+      ldpp_dout(dpp,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
+      return ret;
+    }
+  }
+  return ret;
+}
+
+int RGWRados::decode_policy(const DoutPrefixProvider *dpp,
+                           ceph::buffer::list& bl,
+                           ACLOwner *owner)
+{
+  auto i = bl.cbegin();
+  RGWAccessControlPolicy policy(cct);
+  try {
+    policy.decode_owner(i);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+    return -EIO;
+  }
+  *owner = policy.get_owner();
+  return 0;
+}
+
+int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp)
+{
+  rgw_bucket bucket = bucket_info.bucket;
+  bucket.update_bucket_id(new_bucket_id);
+
+  bucket_info.objv_tracker.clear();
+  int ret = store->get_bucket_instance_info(bucket, bucket_info, nullptr, nullptr, null_yield, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return 0;
+}
+
+
+/**
+ * Get ordered listing of the objects in a bucket.
+ *
+ * max_p: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
 * delim: results whose names contain this string after the prefix are not
 *     returned individually; instead the matching portion of their name
 *     (up to and including the delimiter) is recorded once in
 *     common_prefixes with a "true" mark.
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: if delim is filled in, any matching prefixes are
+ * placed here.
+ * is_truncated: if number of objects in the bucket is bigger than
+ * max, then truncated.
+ */
+int RGWRados::Bucket::List::list_objects_ordered(
+  const DoutPrefixProvider *dpp,
+  int64_t max_p,
+  std::vector<rgw_bucket_dir_entry> *result,
+  std::map<std::string, bool> *common_prefixes,
+  bool *is_truncated,
+  optional_yield y)
+{
+  RGWRados *store = target->get_store();
+  CephContext *cct = store->ctx();
+  int shard_id = target->get_shard_id();
+  const auto& current_index = target->get_bucket_info().layout.current_index;
+
+  int count = 0;
+  bool truncated = true;
+  bool cls_filtered = false;
+  const int64_t max = // protect against memory issues and negative vals
+    std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
+  int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
+
+  result->clear();
+
+  // use a local marker; either the marker will have a previous entry
+  // or it will be empty; either way it's OK to copy
+  rgw_obj_key marker_obj(params.marker.name,
+                        params.marker.instance,
+                        params.ns.empty() ? params.marker.ns : params.ns);
+  rgw_obj_index_key cur_marker;
+  marker_obj.get_index_key(&cur_marker);
+
+  rgw_obj_key end_marker_obj(params.end_marker.name,
+                            params.end_marker.instance,
+                            params.ns.empty() ? params.end_marker.ns : params.ns);
+  rgw_obj_index_key cur_end_marker;
+  end_marker_obj.get_index_key(&cur_end_marker);
+  const bool cur_end_marker_valid = !params.end_marker.empty();
+
+  rgw_obj_key prefix_obj(params.prefix);
+  prefix_obj.set_ns(params.ns);
+  std::string cur_prefix = prefix_obj.get_index_key_name();
+  std::string after_delim_s; /* needed in !params.delim.empty() AND later */
+
+  if (!params.delim.empty()) {
+    after_delim_s = cls_rgw_after_delim(params.delim);
+    /* if marker points at a common prefix, fast forward it into its
+     * upper bound string */
+    int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
+    if (delim_pos >= 0) {
+      string s = cur_marker.name.substr(0, delim_pos);
+      s.append(after_delim_s);
+      cur_marker = s;
+    }
+  }
+
+  // we'll stop after this many attempts as long we return at least
+  // one entry; but we will also go beyond this number of attempts
+  // until we return at least one entry
+  constexpr uint16_t SOFT_MAX_ATTEMPTS = 8;
+
+  rgw_obj_index_key prev_marker;
+  for (uint16_t attempt = 1; /* empty */; ++attempt) {
+    ldpp_dout(dpp, 20) << __func__ <<
+      ": starting attempt " << attempt << dendl;
+
+    if (attempt > 1 && !(prev_marker < cur_marker)) {
+      // we've failed to make forward progress
+      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+       " marker failed to make forward progress; attempt=" << attempt <<
+       ", prev_marker=" << prev_marker <<
+       ", cur_marker=" << cur_marker << dendl;
+      break;
+    }
+    prev_marker = cur_marker;
+
+    ent_map_t ent_map;
+    ent_map.reserve(read_ahead);
+    int r = store->cls_bucket_list_ordered(dpp,
+                                           target->get_bucket_info(),
+                                           current_index,
+                                           shard_id,
+                                          cur_marker,
+                                          cur_prefix,
+                                          params.delim,
+                                          read_ahead + 1 - count,
+                                          params.list_versions,
+                                          attempt,
+                                          ent_map,
+                                          &truncated,
+                                          &cls_filtered,
+                                          &cur_marker,
+                                           y,
+                                          params.force_check_filter);
+    if (r < 0) {
+      return r;
+    }
+
+    for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
+      rgw_bucket_dir_entry& entry = eiter->second;
+      rgw_obj_index_key index_key = entry.key;
+      rgw_obj_key obj(index_key);
+
+      ldpp_dout(dpp, 20) << __func__ <<
+       ": considering entry " << entry.key << dendl;
+
+      /* note that parse_raw_oid() here will not set the correct
+       * object's instance, as rgw_obj_index_key encodes that
+       * separately. We don't need to set the instance because it's
+       * not needed for the checks here and we end up using the raw
+       * entry for the return vector
+       */
+      bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
+      if (!valid) {
+        ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+         " could not parse object name: " << obj.name << dendl;
+        continue;
+      }
+
+      bool matched_ns = (obj.ns == params.ns);
+      if (!params.list_versions && !entry.is_visible()) {
+        ldpp_dout(dpp, 10) << __func__ <<
+         ": skipping not visible entry \"" << entry.key << "\"" << dendl;
+        continue;
+      }
+
+      if (params.enforce_ns && !matched_ns) {
+        if (!params.ns.empty()) {
+          /* we've iterated past the namespace we're searching -- done now */
+          truncated = false;
+         ldpp_dout(dpp, 10) << __func__ <<
+           ": finished due to getting past requested namespace \"" <<
+           params.ns << "\"" << dendl;
+          goto done;
+        }
+
+        /* we're skipping past namespaced objects */
+       ldpp_dout(dpp, 20) << __func__ <<
+         ": skipping past namespaced objects, including \"" << entry.key <<
+         "\"" << dendl;
+        continue;
+      }
+
+      if (cur_end_marker_valid && cur_end_marker <= index_key) {
+        truncated = false;
+       ldpp_dout(dpp, 10) << __func__ <<
+         ": finished due to gitting end marker of \"" << cur_end_marker <<
+         "\" with \"" << entry.key << "\"" << dendl;
+        goto done;
+      }
+
+      if (count < max) {
+       params.marker = index_key;
+       next_marker = index_key;
+      }
+
+      if (params.access_list_filter &&
+         ! params.access_list_filter->filter(obj.name, index_key.name)) {
+       ldpp_dout(dpp, 20) << __func__ <<
+         ": skipping past namespaced objects, including \"" << entry.key <<
+         "\"" << dendl;
+        continue;
+      }
+
+      if (params.prefix.size() &&
+         0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
+       ldpp_dout(dpp, 20) << __func__ <<
+         ": skipping object \"" << entry.key <<
+         "\" that doesn't match prefix \"" << params.prefix << "\"" << dendl;
+        continue;
+      }
+
+      if (!params.delim.empty()) {
+       const int delim_pos = obj.name.find(params.delim, params.prefix.size());
+       if (delim_pos >= 0) {
+         // run either the code where delimiter filtering is done a)
+         // in the OSD/CLS or b) here.
+         if (cls_filtered) {
+           // NOTE: this condition is for the newer versions of the
+           // OSD that does filtering on the CLS side should only
+           // find one delimiter at the end if it finds any after the
+           // prefix
+           if (delim_pos !=
+               int(obj.name.length() - params.delim.length())) {
+             ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
+               " found delimiter in place other than the end of "
+               "the prefix; obj.name=" << obj.name <<
+               ", prefix=" << params.prefix << dendl;
+           }
+           if (common_prefixes) {
+             if (count >= max) {
+               truncated = true;
+               ldpp_dout(dpp, 10) << __func__ <<
+                 ": stopping early with common prefix \"" << entry.key <<
+                 "\" because requested number (" << max <<
+                 ") reached (cls filtered)" << dendl;
+               goto done;
+             }
+
+             (*common_prefixes)[obj.name] = true;
+             count++;
+           }
+
+           ldpp_dout(dpp, 20) << __func__ <<
+             ": finished entry with common prefix \"" << entry.key <<
+             "\" so continuing loop (cls filtered)" << dendl;
+           continue;
+         } else {
+           // NOTE: this condition is for older versions of the OSD
+           // that do not filter on the CLS side, so the following code
+           // must do the filtering; once we reach version 16 of ceph,
+           // this code can be removed along with the conditional that
+           // can lead this way
+
+           /* extract key -with trailing delimiter- for CommonPrefix */
+           string prefix_key =
+             obj.name.substr(0, delim_pos + params.delim.length());
+
+           if (common_prefixes &&
+               common_prefixes->find(prefix_key) == common_prefixes->end()) {
+             if (count >= max) {
+               truncated = true;
+               ldpp_dout(dpp, 10) << __func__ <<
+                 ": stopping early with common prefix \"" << entry.key <<
+                 "\" because requested number (" << max <<
+                 ") reached (not cls filtered)" << dendl;
+               goto done;
+             }
+             next_marker = prefix_key;
+             (*common_prefixes)[prefix_key] = true;
+
+             count++;
+           }
+
+           ldpp_dout(dpp, 20) << __func__ <<
+             ": finished entry with common prefix \"" << entry.key <<
+             "\" so continuing loop (not cls filtered)" << dendl;
+           continue;
+         } // if we're running an older OSD version
+       } // if a delimiter was found after prefix
+      } // if a delimiter was passed in
+
+      if (count >= max) {
+        truncated = true;
+       ldpp_dout(dpp, 10) << __func__ <<
+         ": stopping early with entry \"" << entry.key <<
+         "\" because requested number (" << max <<
+         ") reached" << dendl;
+        goto done;
+      }
+
+      ldpp_dout(dpp, 20) << __func__ <<
+       ": adding entry " << entry.key << " to result" << dendl;
+
+      result->emplace_back(std::move(entry));
+      count++;
+    } // eiter for loop
+
+    // NOTE: the following conditional is needed by older versions of
+    // the OSD that don't do delimiter filtering on the CLS side; once
+    // we reach version 16 of ceph, the following conditional and the
+    // code within can be removed
+    if (!cls_filtered && !params.delim.empty()) {
+      int marker_delim_pos =
+       cur_marker.name.find(params.delim, cur_prefix.size());
+      if (marker_delim_pos >= 0) {
+       std::string skip_after_delim =
+         cur_marker.name.substr(0, marker_delim_pos);
+        skip_after_delim.append(after_delim_s);
+
+        ldpp_dout(dpp, 20) << __func__ <<
+         ": skip_after_delim=" << skip_after_delim << dendl;
+
+        if (skip_after_delim > cur_marker.name) {
+          cur_marker = skip_after_delim;
+          ldpp_dout(dpp, 20) << __func__ <<
+           ": setting cur_marker=" << cur_marker.name <<
+           "[" << cur_marker.instance << "]" << dendl;
+        }
+      }
+    } // if older osd didn't do delimiter filtering
+
+    ldpp_dout(dpp, 10) << __func__ <<
+      ": end of outer loop, truncated=" << truncated <<
+      ", count=" << count << ", attempt=" << attempt << dendl;
+
+    if (!truncated || count >= (max + 1) / 2) {
+      // if we finished listing, or if we're returning at least half the
+      // requested entries, that's enough; S3 and swift protocols allow
+      // returning fewer than max entries
+      ldpp_dout(dpp, 10) << __func__ <<
+       ": exiting attempt loop because we reached end (" << truncated <<
+       ") or we're returning half the requested entries (" << count <<
+       " of " << max << ")" << dendl;
+      break;
+    } else if (attempt > SOFT_MAX_ATTEMPTS && count >= 1) {
+      // if we've made at least 8 attempts and we have some, but very
+      // few, results, return with what we have
+      ldpp_dout(dpp, 10) << __func__ <<
+       ": exiting attempt loop because we made " << attempt <<
+       " attempts and we're returning " << count << " entries" << dendl;
+      break;
+    }
+  } // for (uint16_t attempt...
+
+done:
+
+  if (is_truncated) {
+    *is_truncated = truncated;
+  }
+
+  return 0;
+} // list_objects_ordered
+
+
+/**
+ * Get listing of the objects in a bucket and allow the results to be out
+ * of order.
+ *
+ * Even though there are key differences with the ordered counterpart,
 * the parameters are the same to maintain some compatibility.
+ *
+ * max: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: should not be set; if it is we should have indicated an error
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: this is never filled with an unordered list; the param
+ *                  is maintained for compatibility
+ * is_truncated: if number of objects in the bucket is bigger than max, then
+ *               truncated.
+ */
+int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp,
+                                                   int64_t max_p,
+                                                  std::vector<rgw_bucket_dir_entry>* result,
+                                                  std::map<std::string, bool>* common_prefixes,
+                                                  bool* is_truncated,
+                                                   optional_yield y)
+{
+  RGWRados *store = target->get_store();
+  int shard_id = target->get_shard_id();
+  const auto& current_index = target->get_bucket_info().layout.current_index;
+
+  int count = 0;
+  bool truncated = true;
+
+  const int64_t max = // protect against memory issues and negative vals
+    std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
+
+  // read a few extra in each call to cls_bucket_list_unordered in
+  // case some are filtered out due to namespace matching, versioning,
+  // filtering, etc.
+  const int64_t max_read_ahead = 100;
+  const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
+
+  result->clear();
+
+  // use a local marker; either the marker will have a previous entry
+  // or it will be empty; either way it's OK to copy
+  rgw_obj_key marker_obj(params.marker.name,
+                        params.marker.instance,
+                        params.ns.empty() ? params.marker.ns : params.ns);
+  rgw_obj_index_key cur_marker;
+  marker_obj.get_index_key(&cur_marker);
+
+  rgw_obj_key end_marker_obj(params.end_marker.name,
+                            params.end_marker.instance,
+                            params.ns.empty() ? params.end_marker.ns : params.ns);
+  rgw_obj_index_key cur_end_marker;
+  end_marker_obj.get_index_key(&cur_end_marker);
+  const bool cur_end_marker_valid = !params.end_marker.empty();
+
+  rgw_obj_key prefix_obj(params.prefix);
+  prefix_obj.set_ns(params.ns);
+  std::string cur_prefix = prefix_obj.get_index_key_name();
+
+  while (truncated && count <= max) {
+    std::vector<rgw_bucket_dir_entry> ent_list;
+    ent_list.reserve(read_ahead);
+
+    int r = store->cls_bucket_list_unordered(dpp,
+                                             target->get_bucket_info(),
+                                             current_index,
+                                             shard_id,
+                                            cur_marker,
+                                            cur_prefix,
+                                            read_ahead,
+                                            params.list_versions,
+                                            ent_list,
+                                            &truncated,
+                                            &cur_marker,
+                                             y);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+       " cls_bucket_list_unordered returned " << r << " for " <<
+       target->get_bucket_info().bucket << dendl;
+      return r;
+    }
+
+    // NB: while regions of ent_list will be sorted, we have no
+    // guarantee that all items will be sorted since they can cross
+    // shard boundaries
+
+    for (auto& entry : ent_list) {
+      rgw_obj_index_key index_key = entry.key;
+      rgw_obj_key obj(index_key);
+
+      if (count < max) {
+       params.marker.set(index_key);
+       next_marker.set(index_key);
+      }
+
+      /* note that parse_raw_oid() here will not set the correct
+       * object's instance, as rgw_obj_index_key encodes that
+       * separately. We don't need to set the instance because it's
+       * not needed for the checks here and we end up using the raw
+       * entry for the return vector
+       */
+      bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
+      if (!valid) {
+        ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+         " could not parse object name: " << obj.name << dendl;
+        continue;
+      }
+
+      if (!params.list_versions && !entry.is_visible()) {
+        ldpp_dout(dpp, 20) << __func__ <<
+         ": skippping \"" << index_key <<
+         "\" because not listing versions and entry not visibile" << dendl;
+        continue;
+      }
+
+      if (params.enforce_ns && obj.ns != params.ns) {
+        ldpp_dout(dpp, 20) << __func__ <<
+         ": skippping \"" << index_key <<
+         "\" because namespace does not match" << dendl;
+        continue;
+      }
+
+      if (cur_end_marker_valid && cur_end_marker <= index_key) {
+       // we're not guaranteed items will come in order, so we have
+       // to loop through all
+        ldpp_dout(dpp, 20) << __func__ <<
+         ": skippping \"" << index_key <<
+         "\" because after end_marker" << dendl;
+       continue;
+      }
+
+      if (params.access_list_filter &&
+         !params.access_list_filter->filter(obj.name, index_key.name)) {
+        ldpp_dout(dpp, 20) << __func__ <<
+         ": skippping \"" << index_key <<
+         "\" because doesn't match filter" << dendl;
+        continue;
+      }
+
+      if (params.prefix.size() &&
+         (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) {
+        ldpp_dout(dpp, 20) << __func__ <<
+         ": skippping \"" << index_key <<
+         "\" because doesn't match prefix" << dendl;
+       continue;
+      }
+
+      if (count >= max) {
+        truncated = true;
+        goto done;
+      }
+
+      result->emplace_back(std::move(entry));
+      count++;
+    } // for (auto& entry : ent_list)
+  } // while (truncated && count <= max)
+
+done:
+
+  if (is_truncated) {
+    *is_truncated = truncated;
+  }
+
+  return 0;
+} // list_objects_unordered
+
+
+/**
+ * create a rados pool, associated meta info
+ * returns 0 on success, -ERR# otherwise.
+ */
+int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool)
+{
+  librados::IoCtx io_ctx;
+  constexpr bool create = true;
+  return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create);
+}
+
+void RGWRados::create_bucket_id(string *bucket_id)
+{
+  uint64_t iid = instance_id();
+  uint64_t bid = next_bucket_id();
+  char buf[svc.zone->get_zone_params().get_id().size() + 48];
+  snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
+           svc.zone->get_zone_params().get_id().c_str(), iid, bid);
+  *bucket_id = buf;
+}
+
+int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
+                            const string& zonegroup_id,
+                            const rgw_placement_rule& placement_rule,
+                            const string& swift_ver_location,
+                            const RGWQuotaInfo * pquota_info,
+                           map<std::string, bufferlist>& attrs,
+                            RGWBucketInfo& info,
+                            obj_version *pobjv,
+                            obj_version *pep_objv,
+                            real_time creation_time,
+                            rgw_bucket *pmaster_bucket,
+                            uint32_t *pmaster_num_shards,
+                           optional_yield y,
+                            const DoutPrefixProvider *dpp,
+                           bool exclusive)
+{
+#define MAX_CREATE_RETRIES 20 /* need to bound retries */
+  rgw_placement_rule selected_placement_rule;
+  RGWZonePlacementInfo rule_info;
+
+  for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
+    int ret = 0;
+    ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule,
+                                            &selected_placement_rule, &rule_info, y);
+    if (ret < 0)
+      return ret;
+
+    if (!pmaster_bucket) {
+      create_bucket_id(&bucket.marker);
+      bucket.bucket_id = bucket.marker;
+    } else {
+      bucket.marker = pmaster_bucket->marker;
+      bucket.bucket_id = pmaster_bucket->bucket_id;
+    }
+
+    RGWObjVersionTracker& objv_tracker = info.objv_tracker;
+
+    objv_tracker.read_version.clear();
+
+    if (pobjv) {
+      objv_tracker.write_version = *pobjv;
+    } else {
+      objv_tracker.generate_new_write_ver(cct);
+    }
+
+    info.bucket = bucket;
+    info.owner = owner.user_id;
+    info.zonegroup = zonegroup_id;
+    info.placement_rule = selected_placement_rule;
+    info.swift_ver_location = swift_ver_location;
+    info.swift_versioning = (!swift_ver_location.empty());
+
+    init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(),
+                              pmaster_num_shards ?
+                              std::optional{*pmaster_num_shards} :
+                              std::nullopt,
+                              rule_info.index_type);
+
+    info.requester_pays = false;
+    if (real_clock::is_zero(creation_time)) {
+      info.creation_time = ceph::real_clock::now();
+    } else {
+      info.creation_time = creation_time;
+    }
+    if (pquota_info) {
+      info.quota = *pquota_info;
+    }
+
+    int r = svc.bi->init_index(dpp, info, info.layout.current_index);
+    if (r < 0) {
+      return r;
+    }
+
+    ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp);
+    if (ret == -ECANCELED) {
+      ret = -EEXIST;
+    }
+    if (ret == -EEXIST) {
+       /* we need to reread the info and return it, caller will have a use for it */
+      RGWBucketInfo orig_info;
+      r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
+      if (r < 0) {
+        if (r == -ENOENT) {
+          continue;
+        }
+        ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl;
+        return r;
+      }
+
+      /* only remove it if it's a different bucket instance */
+      if (orig_info.bucket.bucket_id != bucket.bucket_id) {
+       int r = svc.bi->clean_index(dpp, info, info.layout.current_index);
+       if (r < 0) {
+         ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
+       }
+        r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp);
+        if (r < 0) {
+          ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
+          /* continue anyway */
+        }
+      }
+
+      info = std::move(orig_info);
+      /* ret == -EEXIST here */
+    }
+    return ret;
+  }
+
+  /* this is highly unlikely */
+  ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
+  return -ENOENT;
+}
+
+bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
+{
+  get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
+
+  return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
+}
+
+std::string RGWRados::get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  return svc.rados->cluster_fsid();
+}
+
+int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp,
+                                const RGWBucketInfo& bucket_info,
+                                const rgw_obj& obj,
+                                librados::IoCtx *ioctx)
+{
+  std::string oid, key;
+  get_obj_bucket_and_oid_loc(obj, oid, key);
+
+  rgw_pool pool;
+  if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
+    ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj <<
+      ", probably misconfiguration" << dendl;
+    return -EIO;
+  }
+
+  int r = open_pool_ctx(dpp, pool, *ioctx, false);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: unable to open data-pool=" << pool.to_str() <<
+      " for obj=" << obj << " with error-code=" << r << dendl;
+    return r;
+  }
+
+  ioctx->locator_set_key(key);
+
+  return 0;
+}
+
+int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
+                               const rgw_placement_rule& target_placement_rule,
+                               const rgw_obj& obj,
+                               rgw_rados_ref *ref)
+{
+  get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
+
+  rgw_pool pool;
+  if (!get_obj_data_pool(target_placement_rule, obj, &pool)) {
+    ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
+    return -EIO;
+  }
+
+  ref->pool = svc.rados->pool(pool);
+
+  int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
+                         .set_mostly_omap(false));
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
+    return r;
+  }
+
+  ref->pool.ioctx().locator_set_key(ref->obj.loc);
+
+  return 0;
+}
+
// Convenience overload: resolve the head ref using the bucket's own
// placement rule. Delegates to the placement-rule overload above.
int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
                               const RGWBucketInfo& bucket_info,
                               const rgw_obj& obj,
                               rgw_rados_ref *ref)
{
  return get_obj_head_ref(dpp, bucket_info.placement_rule, obj, ref);
}
+
// Build an rgw_rados_ref for a raw object: copies the raw obj into the ref,
// opens its pool, and applies the locator key.
int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
{
  ref->obj = obj;

  // An empty oid means "reference the pool itself": use the pool name as the
  // oid and redirect the ref's pool to the zone's domain_root pool.
  if (ref->obj.oid.empty()) {
    ref->obj.oid = obj.pool.to_str();
    ref->obj.pool = svc.zone->get_zone_params().domain_root;
  }
  // NOTE(review): this opens the caller's obj.pool, not ref->obj.pool — so
  // in the empty-oid branch above the ref's recorded pool (domain_root) and
  // the opened pool differ. Confirm this asymmetry is intended.
  ref->pool = svc.rados->pool(obj.pool);
  int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
                         .set_mostly_omap(false));
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
    return r;
  }

  ref->pool.ioctx().locator_set_key(ref->obj.loc);

  return 0;
}
+
// System objects are plain raw objects; simply delegate to get_raw_obj_ref().
int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
{
  return get_raw_obj_ref(dpp, obj, ref);
}
+
+/*
+ * fixes an issue where head objects were supposed to have a locator created, but ended
+ * up without one
+ */
+int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
+{
+  const rgw_bucket& bucket = bucket_info.bucket;
+  string oid;
+  string locator;
+
+  rgw_obj obj(bucket, key);
+
+  get_obj_bucket_and_oid_loc(obj, oid, locator);
+
+  if (locator.empty()) {
+    ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl;
+    return 0;
+  }
+
+  librados::IoCtx ioctx;
+
+  int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx);
+  if (ret < 0) {
+    cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
+    return ret;
+  }
+  ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
+
+  uint64_t size;
+  bufferlist data;
+
+  struct timespec mtime_ts;
+  map<string, bufferlist> attrs;
+  librados::ObjectReadOperation op;
+  op.getxattrs(&attrs, NULL);
+  op.stat2(&size, &mtime_ts, NULL);
+#define HEAD_SIZE 512 * 1024
+  op.read(0, HEAD_SIZE, &data, NULL);
+
+  ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  if (size > HEAD_SIZE) {
+    ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
+    return -EIO;
+  }
+
+  if (size != data.length()) {
+    ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
+    return -EIO;
+  }
+
+  if (copy_obj) {
+    librados::ObjectWriteOperation wop;
+
+    wop.mtime2(&mtime_ts);
+
+    map<string, bufferlist>::iterator iter;
+    for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+      wop.setxattr(iter->first.c_str(), iter->second);
+    }
+
+    wop.write(0, data);
+
+    ioctx.locator_set_key(locator);
+    rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield);
+  }
+
+  if (remove_bad) {
+    ioctx.locator_set_key(string());
+
+    ret = ioctx.remove(oid);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl;
+      return ret;
+    }
+  }
+
+  return 0;
+}
+
// Copy a rados object chunk-by-chunk from (src_oid, src_locator) to
// (dst_oid, dst_locator), then remove the source. The destination is
// created exclusively (fails with -EEXIST if it already exists). No-op when
// source and destination coincide. On any error the destination may be left
// partially written (see TODO at done_err).
int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp,
                             librados::IoCtx& src_ioctx,
                            const string& src_oid, const string& src_locator,
                            librados::IoCtx& dst_ioctx,
                            const string& dst_oid, const string& dst_locator)
{

#define COPY_BUF_SIZE (4 * 1024 * 1024)
  bool done = false;
  uint64_t chunk_size = COPY_BUF_SIZE;
  uint64_t ofs = 0;
  int ret = 0;
  real_time mtime; // written below but otherwise unused in this function
  struct timespec mtime_ts;
  uint64_t size;

  // Same oid and locator: nothing to move.
  if (src_oid == dst_oid && src_locator == dst_locator) {
    return 0;
  }

  src_ioctx.locator_set_key(src_locator);
  dst_ioctx.locator_set_key(dst_locator);

  do {
    bufferlist data;
    ObjectReadOperation rop;
    ObjectWriteOperation wop;

    // On the first chunk, also stat the source so we can verify the total
    // byte count after the loop.
    if (ofs == 0) {
      rop.stat2(&size, &mtime_ts, NULL);
      mtime = real_clock::from_timespec(mtime_ts);
    }
    rop.read(ofs, chunk_size, &data, NULL);
    ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield);
    if (ret < 0) {
      goto done_err;
    }

    // Empty read: we've reached the end of the source object.
    if (data.length() == 0) {
      break;
    }

    if (ofs == 0) {
      wop.create(true); /* make it exclusive */
      wop.mtime2(&mtime_ts); // preserve the source's mtime on the new object
      mtime = real_clock::from_timespec(mtime_ts);
    }
    wop.write(ofs, data);
    ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield);
    if (ret < 0) {
      goto done_err;
    }
    ofs += data.length();
    // A short read means this was the final chunk.
    done = data.length() != chunk_size;
  } while (!done);

  // Guard against the source changing size mid-copy.
  if (ofs != size) {
    ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
               << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
    ret = -EIO;
    goto done_err;
  }

  // Best-effort removal of the source; result intentionally ignored.
  src_ioctx.remove(src_oid);

  return 0;

done_err:
  // TODO: clean up dst_oid if we created it
  ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
  return ret;
}
+
+/*
+ * fixes an issue where head objects were supposed to have a locator created, but ended
+ * up without one
+ */
+int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp,
+                                   RGWBucketInfo& bucket_info, rgw_obj_key& key,
+                                   bool fix, bool *need_fix, optional_yield y)
+{
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  driver->get_bucket(nullptr, bucket_info, &bucket);
+  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
+
+  if (need_fix) {
+    *need_fix = false;
+  }
+
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  RGWObjState *astate = nullptr;
+  RGWObjManifest* manifest = nullptr;
+  RGWObjectCtx rctx(this->driver);
+  r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y);
+  if (r < 0)
+    return r;
+
+  if (manifest) {
+    RGWObjManifest::obj_iterator miter;
+    for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
+      rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(driver);
+      rgw_obj loc;
+      string oid;
+      string locator;
+
+      RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_tail_placement().bucket, raw_loc, &loc);
+
+      if (loc.key.ns.empty()) {
+       /* continue, we're only interested in tail objects */
+       continue;
+      }
+
+      auto& ioctx = ref.pool.ioctx();
+
+      get_obj_bucket_and_oid_loc(loc, oid, locator);
+      ref.pool.ioctx().locator_set_key(locator);
+
+      ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
+
+      r = ioctx.stat(oid, NULL, NULL);
+      if (r != -ENOENT) {
+       continue;
+      }
+
+      string bad_loc;
+      prepend_bucket_marker(bucket->get_key(), loc.key.name, bad_loc);
+
+      /* create a new ioctx with the bad locator */
+      librados::IoCtx src_ioctx;
+      src_ioctx.dup(ioctx);
+      src_ioctx.locator_set_key(bad_loc);
+
+      r = src_ioctx.stat(oid, NULL, NULL);
+      if (r != 0) {
+       /* cannot find a broken part */
+       continue;
+      }
+      ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl;
+      if (need_fix) {
+        *need_fix = true;
+      }
+      if (fix) {
+        r = move_rados_obj(dpp, src_ioctx, oid, bad_loc, ioctx, oid, locator);
+        if (r < 0) {
+          ldpp_dout(dpp, -1) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
+int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
+                               const rgw_obj& obj,
+                               RGWBucketInfo* bucket_info_out,
+                                const DoutPrefixProvider *dpp)
+{
+  bucket = _bucket;
+
+  RGWBucketInfo bucket_info;
+  RGWBucketInfo* bucket_info_p =
+    bucket_info_out ? bucket_info_out : &bucket_info;
+
+  int ret = store->get_bucket_instance_info(bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  string oid;
+
+  ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
+    return ret;
+  }
+  ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
+
+  return 0;
+}
+
+int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+                                const rgw_obj& obj)
+{
+  bucket = bucket_info.bucket;
+
+  int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info,
+                                                        obj.get_hash_object(),
+                                                        &bucket_obj,
+                                                        &shard_id);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
+    return ret;
+  }
+  ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
+
+  return 0;
+}
+
+int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp,
+                               const RGWBucketInfo& bucket_info,
+                                const rgw::bucket_index_layout_generation& index,
+                                int sid)
+{
+  bucket = bucket_info.bucket;
+  shard_id = sid;
+
+  int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, shard_id,
+                                                         num_shards(index), index.gen,
+                                                         &bucket_obj);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
+    return ret;
+  }
+  ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
+
+  return 0;
+}
+
+
+/* Execute @handler on last item in bucket listing for bucket specified
+ * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
+ * to objects matching these criterias. */
+int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp,
+                                       RGWBucketInfo& bucket_info,
+                                       const std::string& obj_prefix,
+                                       const std::string& obj_delim,
+                                       std::function<int(const rgw_bucket_dir_entry&)> handler)
+{
+  RGWRados::Bucket target(this, bucket_info);
+  RGWRados::Bucket::List list_op(&target);
+
+  list_op.params.prefix = obj_prefix;
+  list_op.params.delim = obj_delim;
+
+  ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
+                 << ", obj_prefix=" << obj_prefix
+                 << ", obj_delim=" << obj_delim
+                 << dendl;
+
+  bool is_truncated = false;
+
+  boost::optional<rgw_bucket_dir_entry> last_entry;
+  /* We need to rewind to the last object in a listing. */
+  do {
+    /* List bucket entries in chunks. */
+    static constexpr int MAX_LIST_OBJS = 100;
+    std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
+
+    int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
+                                   &is_truncated, null_yield);
+    if (ret < 0) {
+      return ret;
+    } else if (!entries.empty()) {
+      last_entry = entries.back();
+    }
+  } while (is_truncated);
+
+  if (last_entry) {
+    return handler(*last_entry);
+  }
+
+  /* Empty listing - no items we can run handler on. */
+  return 0;
+}
+
+bool RGWRados::swift_versioning_enabled(rgw::sal::Bucket* bucket) const
+{
+  return bucket->get_info().has_swift_versioning() &&
+    bucket->get_info().swift_ver_location.size();
+}
+
// Swift object versioning, write path: before an object is overwritten,
// archive the current version into the bucket's swift_ver_location
// container under a "NNNname/timestamp" key. No-op when versioning is not
// enabled or the object does not exist. Returns 0 on success (including the
// benign lost-race cases), negative error otherwise.
int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
                                    const rgw_user& user,
                                    rgw::sal::Bucket* bucket,
                                    rgw::sal::Object* obj,
                                    const DoutPrefixProvider *dpp,
                                    optional_yield y)
{
  if (! swift_versioning_enabled(bucket)) {
    return 0;
  }

  obj->set_atomic();

  RGWObjState * state = nullptr;
  RGWObjManifest *manifest = nullptr;
  int r = get_obj_state(dpp, &obj_ctx, bucket->get_info(), obj, &state, &manifest, false, y);
  if (r < 0) {
    return r;
  }

  // Nothing to archive if there is no current version.
  if (!state->exists) {
    return 0;
  }

  // Archive key format: 3-hex-digit name length, the name, then
  // "/<secs>.<microsecs>" of the source object's mtime.
  const string& src_name = obj->get_oid();
  char buf[src_name.size() + 32];
  struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
  snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
           src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);

  RGWBucketInfo dest_bucket_info;

  r = get_bucket_info(&svc, bucket->get_tenant(), bucket->get_info().swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
  if (r < 0) {
    ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl;
    if (r == -ENOENT) {
      return -ERR_PRECONDITION_FAILED;
    }
    return r;
  }

  // The archive container must belong to the same owner as the source bucket.
  if (dest_bucket_info.owner != bucket->get_info().owner) {
    return -ERR_PRECONDITION_FAILED;
  }

  rgw::sal::RadosBucket dest_bucket(driver, dest_bucket_info);
  rgw::sal::RadosObject dest_obj(driver, rgw_obj_key(buf), &dest_bucket);

  if (dest_bucket_info.versioning_enabled()){
    dest_obj.gen_rand_obj_instance_name();
  }

  dest_obj.set_atomic();

  rgw_zone_id no_zone;

  // Server-side copy of the current version into the archive container;
  // attributes are carried over from the source object's state.
  r = copy_obj(obj_ctx,
               user,
               NULL, /* req_info *info */
               no_zone,
               &dest_obj,
               obj,
               &dest_bucket,
               bucket,
               bucket->get_placement_rule(),
               NULL, /* time_t *src_mtime */
               NULL, /* time_t *mtime */
               NULL, /* const time_t *mod_ptr */
               NULL, /* const time_t *unmod_ptr */
               false, /* bool high_precision_time */
               NULL, /* const char *if_match */
               NULL, /* const char *if_nomatch */
               RGWRados::ATTRSMOD_NONE,
               true, /* bool copy_if_newer */
               state->attrset,
               RGWObjCategory::Main,
               0, /* uint64_t olh_epoch */
               real_time(), /* time_t delete_at */
               NULL, /* string *version_id */
               NULL, /* string *ptag */
               NULL, /* string *petag */
               NULL, /* void (*progress_cb)(off_t, void *) */
               NULL, /* void *progress_data */
               dpp,
               null_yield);
  if (r == -ECANCELED || r == -ENOENT) {
    /* Has already been overwritten, meaning another rgw process already
     * copied it out */
    return 0;
  }

  return r;
}
+
// Swift object versioning, delete path: restore the most recent archived
// version of 'obj' from the bucket's swift_ver_location container, then
// remove the archived copy. Sets 'restored' to true only when a copy
// actually took place. No-op when versioning is not enabled; returns 0
// when the archive has no matching entries.
int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
                                       const rgw_user& user,
                                       rgw::sal::Bucket* bucket,
                                       rgw::sal::Object* obj,
                                       bool& restored,                  /* out */
                                       const DoutPrefixProvider *dpp)
{
  if (! swift_versioning_enabled(bucket)) {
    return 0;
  }

  /* Bucket info of the bucket that stores previous versions of our object. */
  RGWBucketInfo archive_binfo;

  int ret = get_bucket_info(&svc, bucket->get_tenant(),
                            bucket->get_info().swift_ver_location,
			    archive_binfo, nullptr, null_yield, nullptr);
  if (ret < 0) {
    return ret;
  }

  /* Abort the operation if the bucket storing our archive belongs to someone
   * else. This is a limitation in comparison to Swift as we aren't taking ACLs
   * into consideration. For we can live with that.
   *
   * TODO: delegate this check to un upper layer and compare with ACLs. */
  if (bucket->get_info().owner != archive_binfo.owner) {
    return -EPERM;
  }

  /* This code will be executed on latest version of the object. */
  const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
    rgw_zone_id no_zone;

    /* We don't support object versioning of Swift API on those buckets that
     * are already versioned using the S3 mechanism. This affects also bucket
     * storing archived objects. Otherwise the delete operation would create
     * a deletion marker. */
    if (archive_binfo.versioned()) {
      restored = false;
      return -ERR_PRECONDITION_FAILED;
    }

    /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
     * irrelevant and may be safely skipped. */
    std::map<std::string, ceph::bufferlist> no_attrs;

    rgw::sal::RadosBucket archive_bucket(driver, archive_binfo);
    rgw::sal::RadosObject archive_obj(driver, entry.key, &archive_bucket);

    if (bucket->versioning_enabled()){
      obj->gen_rand_obj_instance_name();
    }

    archive_obj.set_atomic();
    obj->set_atomic();

    // Copy the archived version back over the original object location.
    int ret = copy_obj(obj_ctx,
                       user,
                       nullptr,       /* req_info *info */
                       no_zone,
                       obj,           /* dest obj */
                       &archive_obj,   /* src obj */
                       bucket,   /* dest bucket info */
                       &archive_bucket, /* src bucket info */
                       bucket->get_placement_rule(),  /* placement_rule */
                       nullptr,       /* time_t *src_mtime */
                       nullptr,       /* time_t *mtime */
                       nullptr,       /* const time_t *mod_ptr */
                       nullptr,       /* const time_t *unmod_ptr */
                       false,         /* bool high_precision_time */
                       nullptr,       /* const char *if_match */
                       nullptr,       /* const char *if_nomatch */
                       RGWRados::ATTRSMOD_NONE,
                       true,          /* bool copy_if_newer */
                       no_attrs,
                       RGWObjCategory::Main,
                       0,             /* uint64_t olh_epoch */
                       real_time(),   /* time_t delete_at */
                       nullptr,       /* string *version_id */
                       nullptr,       /* string *ptag */
                       nullptr,       /* string *petag */
                       nullptr,       /* void (*progress_cb)(off_t, void *) */
                       nullptr,       /* void *progress_data */
                       dpp,
                       null_yield);
    if (ret == -ECANCELED || ret == -ENOENT) {
      /* Has already been overwritten, meaning another rgw process already
       * copied it out */
      return 0;
    } else if (ret < 0) {
      return ret;
    } else {
      restored = true;
    }

    /* Need to remove the archived copy. */
    ret = delete_obj(dpp, archive_binfo, &archive_obj,
                     archive_binfo.versioning_status());

    return ret;
  };

  // Archive entries for this object share the "NNNname" prefix (3-hex-digit
  // length + name), matching the key format written by
  // swift_versioning_copy(); the latest version is the last listed entry.
  const std::string& obj_name = obj->get_oid();
  const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
                                                         % obj_name);

  return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(),
                                  handler);
}
+
// Core head-object write: builds a single compound ObjectWriteOperation
// carrying data (when meta.data is set), xattrs, retention/OLH/manifest
// metadata, performs it under atomic-modification guards, and drives the
// two-phase bucket index update (prepare/complete, with cancel on failure).
// 'assume_noent' makes the write expect the object not to exist; on -EEXIST
// the caller (write_meta) retries without that assumption. Returns 0 on
// success; lost-race errors may be translated per if-match/if-nomatch
// semantics in the done_cancel path.
int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
					    uint64_t size, uint64_t accounted_size,
					    map<string, bufferlist>& attrs,
					    bool assume_noent, bool modify_tail,
					    void *_index_op, optional_yield y)
{
  RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
  RGWRados *store = target->get_store();

  ObjectWriteOperation op;
#ifdef WITH_LTTNG
  // req_id is only needed for tracepoints; synthesize one when there is no
  // request state available.
  const req_state* s =  get_req_state();
  string req_id;
  if (!s) {
    // fake req_id
    req_id = store->svc.zone_utils->unique_id(store->driver->get_new_req_id());
  } else {
    req_id = s->req_id;
  }
#endif

  RGWObjState *state;
  RGWObjManifest *manifest = nullptr;
  int r = target->get_state(dpp, &state, &manifest, false, y, assume_noent);
  if (r < 0)
    return r;

  rgw_obj obj = target->get_obj();

  if (obj.get_oid().empty()) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
    return -EIO;
  }

  rgw_rados_ref ref;
  r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref);
  if (r < 0)
    return r;

  bool is_olh = state->is_olh;

  // PUT_OBJ_CREATE means we fully replace the object (data + xattrs).
  bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;

  // Use the caller-provided ptag if any, otherwise the index op's optag.
  const string *ptag = meta.ptag;
  if (!ptag && !index_op->get_optag()->empty()) {
    ptag = index_op->get_optag();
  }
  r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
  if (r < 0)
    return r;

  if (real_clock::is_zero(meta.set_mtime)) {
    meta.set_mtime = real_clock::now();
  }

  // Apply the bucket's default object-lock retention on fresh creates that
  // don't carry their own retention attribute.
  if (target->get_bucket_info().obj_lock_enabled() && target->get_bucket_info().obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
    auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
    if (iter == attrs.end()) {
      real_time lock_until_date = target->get_bucket_info().obj_lock.get_lock_until_date(meta.set_mtime);
      string mode = target->get_bucket_info().obj_lock.get_mode();
      RGWObjectRetention obj_retention(mode, lock_until_date);
      bufferlist bl;
      obj_retention.encode(bl);
      op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
    }
  }

  // Preserve the OLH id tag when rewriting an OLH object.
  if (state->is_olh) {
    op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
  }

  struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
  op.mtime2(&mtime_ts);

  if (meta.data) {
    /* if we want to overwrite the data, we also want to overwrite the
       xattrs, so just remove the object */
    op.write_full(*meta.data);
    if (state->compressed) {
      // Hint to OSDs that already-compressed data won't compress further.
      uint32_t alloc_hint_flags = librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
      op.set_alloc_hint2(0, 0, alloc_hint_flags);
    }
  }

  string etag;
  string content_type;
  bufferlist acl_bl;
  string storage_class;

  map<string, bufferlist>::iterator iter;
  if (meta.rmattrs) {
    for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
      const string& name = iter->first;
      op.rmxattr(name.c_str());
    }
  }

  if (meta.manifest) {
    storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;

    /* remove existing manifest attr */
    iter = attrs.find(RGW_ATTR_MANIFEST);
    if (iter != attrs.end())
      attrs.erase(iter);

    bufferlist bl;
    encode(*meta.manifest, bl);
    op.setxattr(RGW_ATTR_MANIFEST, bl);
  }

  // Set all remaining xattrs, capturing etag/content-type/ACL along the way
  // for the bucket index entry written in index_op->complete() below.
  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
    const string& name = iter->first;
    bufferlist& bl = iter->second;

    if (!bl.length())
      continue;

    op.setxattr(name.c_str(), bl);

    if (name.compare(RGW_ATTR_ETAG) == 0) {
      etag = rgw_bl_str(bl);
    } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
      content_type = rgw_bl_str(bl);
    } else if (name.compare(RGW_ATTR_ACL) == 0) {
      acl_bl = bl;
    }
  }
  if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
    cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
  }

  if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
    bufferlist bl;
    encode(store->svc.zone->get_zone_short_id(), bl);
    op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
  }

  if (!storage_class.empty()) {
    bufferlist bl;
    bl.append(storage_class);
    op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
  }

  // Nothing queued on the compound op: nothing to do.
  if (!op.size())
    return 0;

  uint64_t epoch;
  int64_t poolid;
  bool orig_exists;
  uint64_t orig_size;

  if (!reset_obj) {    //Multipart upload, it has immutable head.
    orig_exists = false;
    orig_size = 0;
  } else {
    orig_exists = state->exists;
    orig_size = state->accounted_size;
  }

  bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
                          !obj.key.instance.empty();

  bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);

  if (versioned_op) {
    index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
  }

  // Phase 1 of the index update: mark the entry as pending.
  if (!index_op->is_prepared()) {
    tracepoint(rgw_rados, prepare_enter, req_id.c_str());
    r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
    tracepoint(rgw_rados, prepare_exit, req_id.c_str());
    if (r < 0)
      return r;
  }

  auto& ioctx = ref.pool.ioctx();

  tracepoint(rgw_rados, operate_enter, req_id.c_str());
  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
  tracepoint(rgw_rados, operate_exit, req_id.c_str());
  if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
                or -ENOENT if was removed, or -EEXIST if it did not exist
                before and now it does */
    if (r == -EEXIST && assume_noent) {
      // Let write_meta() retry without the noent assumption.
      target->invalidate_state();
      return r;
    }
    goto done_cancel;
  }

  // Version/pool of the head write, recorded in the index entry.
  epoch = ioctx.get_last_version();
  poolid = ioctx.get_id();

  r = target->complete_atomic_modification(dpp);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
  }

  // Phase 2 of the index update: commit the entry.
  tracepoint(rgw_rados, complete_enter, req_id.c_str());
  r = index_op->complete(dpp, poolid, epoch, size, accounted_size,
			 meta.set_mtime, etag, content_type,
			 storage_class, &acl_bl,
			 meta.category, meta.remove_objs, meta.user_data, meta.appendable);
  tracepoint(rgw_rados, complete_exit, req_id.c_str());
  if (r < 0)
    goto done_cancel;

  if (meta.mtime) {
    *meta.mtime = meta.set_mtime;
  }

  /* note that index_op was using state so we couldn't invalidate it earlier */
  target->invalidate_state();
  state = NULL;

  if (versioned_op && meta.olh_epoch) {
    r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), target->get_target(), false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
    if (r < 0) {
      return r;
    }
  }

  // Register an expiration hint when the object has a scheduled delete time.
  if (!real_clock::is_zero(meta.delete_at)) {
    rgw_obj_index_key obj_key;
    obj.key.get_index_key(&obj_key);

    r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name,
                                     obj.bucket.bucket_id, obj_key);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
      /* ignoring error, nothing we can do at this point */
    }
  }
  meta.canceled = false;

  /* update quota cache */
  if (meta.completeMultipart){
    // Multipart complete: parts were already accounted, only count the entry.
    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
                                     0, orig_size);
  }
  else {
    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
                                     accounted_size, orig_size);
  }
  return 0;

done_cancel:
  // Roll back the pending index entry from the prepare phase.
  int ret = index_op->cancel(dpp, meta.remove_objs);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
  }

  meta.canceled = true;

  /* we lost in a race. There are a few options:
   * - existing object was rewritten (ECANCELED)
   * - non existing object was created (EEXIST)
   * - object was removed (ENOENT)
   * should treat it as a success
   */
  if (meta.if_match == NULL && meta.if_nomatch == NULL) {
    if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
      r = 0;
    }
  } else {
    if (meta.if_match != NULL) {
      // only overwrite existing object
      if (strcmp(meta.if_match, "*") == 0) {
        if (r == -ENOENT) {
          r = -ERR_PRECONDITION_FAILED;
        } else if (r == -ECANCELED) {
          r = 0;
        }
      }
    }

    if (meta.if_nomatch != NULL) {
      // only create a new object
      if (strcmp(meta.if_nomatch, "*") == 0) {
        if (r == -EEXIST) {
          r = -ERR_PRECONDITION_FAILED;
        } else if (r == -ENOENT) {
          r = 0;
        }
      }
    }
  }

  return r;
}
+
+int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
+                                           map<string, bufferlist>& attrs, optional_yield y)
+{
+  RGWBucketInfo& bucket_info = target->get_bucket_info();
+
+  RGWRados::Bucket bop(target->get_store(), bucket_info);
+  RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
+  index_op.set_zones_trace(meta.zones_trace);
+  
+  bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
+  int r;
+  if (assume_noent) {
+    r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
+    if (r == -EEXIST) {
+      assume_noent = false;
+    }
+  }
+  if (!assume_noent) {
+    r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
+  }
+  return r;
+}
+
+// Receive-side callback for fetching an object over HTTP (multisite sync /
+// remote copy). The incoming stream starts with a JSON "extra data" prefix
+// carrying the source object's attributes; the remaining payload is pushed
+// through an optional compression filter and an optional ETag-verification
+// filter into the supplied ObjectProcessor.
+class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
+{
+  const DoutPrefixProvider *dpp;
+  CephContext* cct;
+  rgw_obj obj;  // NOTE(review): never referenced in this class — appears unused
+  rgw::sal::DataProcessor *filter;  // head of the processing chain (initially the processor itself)
+  boost::optional<RGWPutObj_Compress>& compressor;
+  bool try_etag_verify;  // seeded from rgw_sync_obj_etag_verify; may be disabled in process_attrs()
+  rgw::putobj::etag_verifier_ptr etag_verifier;
+  boost::optional<rgw::putobj::ChunkProcessor> buffering;  // batches small reads ahead of the compressor
+  CompressorRef& plugin;
+  rgw::sal::ObjectProcessor *processor;
+  void (*progress_cb)(off_t, void *);
+  void *progress_data;
+  bufferlist extra_data_bl, manifest_bl;
+  std::optional<RGWCompressionInfo> compression_info;
+  uint64_t extra_data_left{0};  // bytes of the attribute prefix still expected
+  bool need_to_process_attrs{true};
+  uint64_t data_len{0};  // payload bytes seen so far (excludes the attr prefix)
+  map<string, bufferlist> src_attrs;
+  uint64_t ofs{0};  // total bytes consumed, prefix included
+  uint64_t lofs{0}; /* logical ofs */  // NOTE(review): shadowed by a local in handle_data() — appears unused
+  std::function<int(map<string, bufferlist>&)> attrs_handler;
+
+public:
+  RGWRadosPutObj(const DoutPrefixProvider *dpp,
+                 CephContext* cct,
+                 CompressorRef& plugin,
+                 boost::optional<RGWPutObj_Compress>& compressor,
+                 rgw::sal::ObjectProcessor *p,
+                 void (*_progress_cb)(off_t, void *),
+                 void *_progress_data,
+                 std::function<int(map<string, bufferlist>&)> _attrs_handler) :
+                       dpp(dpp),
+                       cct(cct),
+                       filter(p),
+                       compressor(compressor),
+                       try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify),
+                       plugin(plugin),
+                       processor(p),
+                       progress_cb(_progress_cb),
+                       progress_data(_progress_data),
+                       attrs_handler(_attrs_handler) {}
+
+
+  // Decode the JSON attribute prefix (if any), hand the attrs to the caller's
+  // attrs_handler, then wire up the optional compression and
+  // etag-verification stages of the filter chain. Must run before any
+  // payload byte is processed.
+  int process_attrs(void) {
+    if (extra_data_bl.length()) {
+      JSONParser jp;
+      if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
+        ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
+        return -EIO;
+      }
+
+      JSONDecoder::decode_json("attrs", src_attrs, &jp);
+
+      auto iter = src_attrs.find(RGW_ATTR_COMPRESSION);
+      if (iter != src_attrs.end()) {
+        const bufferlist bl = std::move(iter->second);
+        src_attrs.erase(iter); // don't preserve source compression info
+
+        if (try_etag_verify) {
+          // if we're trying to verify etags, we need to convert compressed
+          // ranges in the manifest back into logical multipart part offsets
+          RGWCompressionInfo info;
+          bool compressed = false;
+          int r = rgw_compression_info_from_attr(bl, compressed, info);
+          if (r < 0) {
+            ldpp_dout(dpp, 4) << "failed to decode compression info, "
+                "disabling etag verification" << dendl;
+            try_etag_verify = false;
+          } else if (compressed) {
+            compression_info = std::move(info);
+          }
+        }
+      }
+      /* We need the manifest to recompute the ETag for verification */
+      iter = src_attrs.find(RGW_ATTR_MANIFEST);
+      if (iter != src_attrs.end()) {
+        manifest_bl = std::move(iter->second);
+        src_attrs.erase(iter);
+      }
+
+      // filter out olh attributes
+      iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
+      while (iter != src_attrs.end()) {
+        if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
+          break;
+        }
+        iter = src_attrs.erase(iter);
+      }
+    }
+
+    // give the caller a chance to inspect/modify the attrs (and e.g. pick
+    // placement) before the filter chain is assembled
+    int ret = attrs_handler(src_attrs);
+    if (ret < 0) {
+      return ret;
+    }
+
+    if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
+      //do not compress if object is encrypted
+      compressor = boost::in_place(cct, plugin, filter);
+      // add a filter that buffers data so we don't try to compress tiny blocks.
+      // libcurl reads in 16k at a time, and we need at least 64k to get a good
+      // compression ratio
+      constexpr unsigned buffer_size = 512 * 1024;
+      buffering = boost::in_place(&*compressor, buffer_size);
+      filter = &*buffering;
+    }
+
+    /*
+     * Presently we don't support ETag based verification if encryption is
+     * requested. We can enable simultaneous support once we have a mechanism
+     * to know the sequence in which the filters must be applied.
+     */
+    if (try_etag_verify && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
+      ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl,
+                                              compression_info,
+                                              etag_verifier);
+      if (ret < 0) {
+        // verification is best-effort: keep copying without it
+        ldpp_dout(dpp, 4) << "failed to initial etag verifier, "
+            "disabling etag verification" << dendl;
+      } else {
+        filter = etag_verifier.get();
+      }
+    }
+
+    need_to_process_attrs = false;
+
+    return 0;
+  }
+
+  // ReceiveCB hook: consume the attribute prefix first (extra_data_left
+  // bytes), then stream the remaining payload into the filter chain at
+  // logical offset data_len.
+  int handle_data(bufferlist& bl, bool *pause) override {
+    if (progress_cb) {
+      progress_cb(data_len, progress_data);
+    }
+    if (extra_data_left) {
+      uint64_t extra_len = bl.length();
+      if (extra_len > extra_data_left)
+        extra_len = extra_data_left;
+
+      // carve the prefix bytes off the front of this buffer
+      bufferlist extra;
+      bl.splice(0, extra_len, &extra);
+      extra_data_bl.append(extra);
+
+      extra_data_left -= extra_len;
+      if (extra_data_left == 0) {
+        int res = process_attrs();
+        if (res < 0)
+          return res;
+      }
+      ofs += extra_len;
+      if (bl.length() == 0) {
+        return 0;
+      }
+    }
+    if (need_to_process_attrs) {
+      /* need to call process_attrs() even if we don't get any attrs,
+       * need it to call attrs_handler().
+       */
+      int res = process_attrs();
+      if (res < 0) {
+        return res;
+      }
+    }
+
+    // extra_data_len comes from the ReceiveCB base class; the whole
+    // attribute prefix must have been consumed by now
+    ceph_assert(uint64_t(ofs) >= extra_data_len);
+
+    uint64_t size = bl.length();
+    ofs += size;
+
+    const uint64_t lofs = data_len;
+    data_len += size;
+
+    return filter->process(std::move(bl), lofs);
+  }
+
+  // Signal end-of-data to the filter chain (flushes buffered/compressed tail).
+  int flush() {
+    return filter->process({}, data_len);
+  }
+
+  bufferlist& get_extra_data() { return extra_data_bl; }
+
+  map<string, bufferlist>& get_attrs() { return src_attrs; }
+
+  void set_extra_data_len(uint64_t len) override {
+    extra_data_left = len;
+    RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
+  }
+
+  // Payload length actually streamed (attribute prefix excluded).
+  uint64_t get_data_len() {
+    return data_len;
+  }
+
+  // ETag computed over the received data, or "" if verification was disabled.
+  std::string get_verifier_etag() {
+    if (etag_verifier) {
+      etag_verifier->calculate_etag();
+      return etag_verifier->get_calculated_etag();
+    } else {
+      return "";
+    }
+  }
+};
+
+/*
+ * prepare attrset depending on attrs_mod.
+ */
+static void set_copy_attrs(map<string, bufferlist>& src_attrs,
+                           map<string, bufferlist>& attrs,
+                           RGWRados::AttrsMod attrs_mod)
+{
+  switch (attrs_mod) {
+  case RGWRados::ATTRSMOD_NONE:
+    attrs = src_attrs;
+    break;
+  case RGWRados::ATTRSMOD_REPLACE:
+    if (!attrs[RGW_ATTR_ETAG].length()) {
+      attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
+    }
+    if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
+      auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
+      if (ttiter != src_attrs.end()) {
+        attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
+      }
+    }
+    break;
+  case RGWRados::ATTRSMOD_MERGE:
+    for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
+      if (attrs.find(it->first) == attrs.end()) {
+       attrs[it->first] = it->second;
+      }
+    }
+    break;
+  }
+}
+
+// Rewrite an object in place (same bucket/key): read its current attrs,
+// size and mtime, drop the per-write tags that must be regenerated, and
+// feed the data back through copy_obj_data().
+int RGWRados::rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y)
+{
+  RGWObjectCtx rctx(this->driver);
+  rgw::sal::Attrs attrset;
+  uint64_t obj_size;
+  ceph::real_time mtime;
+  RGWRados::Object op_target(this, obj->get_bucket(), rctx, obj);
+  RGWRados::Object::Read read_op(&op_target);
+
+  // collect attrs/size/mtime of the current copy as part of prepare()
+  read_op.params.attrs = &attrset;
+  read_op.params.obj_size = &obj_size;
+  read_op.params.lastmod = &mtime;
+
+  int ret = read_op.prepare(y, dpp);
+  if (ret < 0)
+    return ret;
+
+  // these identify the specific write/tail/storage class of the old copy;
+  // the rewrite must generate fresh ones
+  attrset.erase(RGW_ATTR_ID_TAG);
+  attrset.erase(RGW_ATTR_TAIL_TAG);
+  attrset.erase(RGW_ATTR_STORAGE_CLASS);
+
+  // NOTE(review): the end offset (obj_size - 1) looks inclusive; for a
+  // zero-size object it wraps around — presumably copy_obj_data() handles
+  // that case. Confirm against its implementation.
+  return this->copy_obj_data(rctx, obj->get_bucket(),
+                            obj->get_bucket()->get_info().placement_rule,
+                            read_op, obj_size - 1, obj, NULL, mtime,
+                            attrset, 0, real_time(), NULL, dpp, y);
+}
+
+// Ordering key used to decide which replica of an object is "newer" during
+// multisite copy/sync: compares mtime first (optionally truncated to whole
+// seconds), then breaks ties with the writing zone's short id and the
+// object's pg version.
+struct obj_time_weight {
+  real_time mtime;
+  uint32_t zone_short_id;
+  uint64_t pg_ver;
+  bool high_precision;  // when false, mtimes compare at second granularity
+
+  obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
+
+  // operator< semantics with both mtimes truncated to whole seconds; used
+  // whenever either side lacks a high-precision timestamp.
+  bool compare_low_precision(const obj_time_weight& rhs) {
+    struct timespec l = ceph::real_clock::to_timespec(mtime);
+    struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
+    l.tv_nsec = 0;
+    r.tv_nsec = 0;
+    if (l > r) {
+      return false;
+    }
+    if (l < r) {
+      return true;
+    }
+    if (!zone_short_id || !rhs.zone_short_id) {
+      /* don't compare zone ids, if one wasn't provided */
+      return false;
+    }
+    if (zone_short_id != rhs.zone_short_id) {
+      return (zone_short_id < rhs.zone_short_id);
+    }
+    return (pg_ver < rhs.pg_ver);
+
+  }
+
+  // Strict weak ordering: mtime, then zone short id, then pg version.
+  bool operator<(const obj_time_weight& rhs) {
+    // full precision only when both sides have it
+    if (!high_precision || !rhs.high_precision) {
+      return compare_low_precision(rhs);
+    }
+    if (mtime > rhs.mtime) {
+      return false;
+    }
+    if (mtime < rhs.mtime) {
+      return true;
+    }
+    if (!zone_short_id || !rhs.zone_short_id) {
+      /* don't compare zone ids, if one wasn't provided */
+      return false;
+    }
+    if (zone_short_id != rhs.zone_short_id) {
+      return (zone_short_id < rhs.zone_short_id);
+    }
+    return (pg_ver < rhs.pg_ver);
+  }
+
+  void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
+    mtime = _mtime;
+    zone_short_id = _short_id;
+    pg_ver = _pg_ver;
+  }
+
+  // Seed the weight from an object's cached state.
+  void init(RGWObjState *state) {
+    mtime = state->mtime;
+    zone_short_id = state->zone_short_id;
+    pg_ver = state->pg_ver;
+  }
+};
+
+// Render as "<mtime>" or "<mtime>[zid=..., pgv=...]" when either id is set.
+inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
+  out << o.mtime;
+  if (o.zone_short_id == 0 && o.pg_ver == 0) {
+    return out;
+  }
+  out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
+  return out;
+}
+
+class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
+  bufferlist extra_data;
+public:
+  RGWGetExtraDataCB() {}
+  int handle_data(bufferlist& bl, bool *pause) override {
+    int bl_len = (int)bl.length();
+    if (extra_data.length() < extra_data_len) {
+      off_t max = extra_data_len - extra_data.length();
+      if (max > bl_len) {
+        max = bl_len;
+      }
+      bl.splice(0, max, &extra_data);
+    }
+    return bl_len;
+  }
+
+  bufferlist& get_extra_data() {
+    return extra_data;
+  }
+};
+
+int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp,
+               RGWObjectCtx& obj_ctx,
+               const rgw_user& user_id,
+               req_info *info,
+               const rgw_zone_id& source_zone,
+               rgw::sal::Object* src_obj,
+               const RGWBucketInfo *src_bucket_info,
+               real_time *src_mtime,
+               uint64_t *psize,
+               const real_time *mod_ptr,
+               const real_time *unmod_ptr,
+               bool high_precision_time,
+               const char *if_match,
+               const char *if_nomatch,
+               map<string, bufferlist> *pattrs,
+               map<string, string> *pheaders,
+               string *version_id,
+               string *ptag,
+               string *petag)
+{
+  /* source is in a different zonegroup, copy from there */
+
+  RGWRESTStreamRWRequest *in_stream_req;
+  string tag;
+  map<string, bufferlist> src_attrs;
+  append_rand_alpha(cct, tag, tag, 32);
+  obj_time_weight set_mtime_weight;
+  set_mtime_weight.high_precision = high_precision_time;
+
+  RGWRESTConn *conn;
+  if (source_zone.empty()) {
+    if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
+      /* source is in the master zonegroup */
+      conn = svc.zone->get_master_conn();
+    } else {
+      auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
+      map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
+      if (iter == zonegroup_conn_map.end()) {
+        ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
+        return -ENOENT;
+      }
+      conn = iter->second;
+    }
+  } else {
+    auto& zone_conn_map = svc.zone->get_zone_conn_map();
+    auto iter = zone_conn_map.find(source_zone);
+    if (iter == zone_conn_map.end()) {
+      ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
+      return -ENOENT;
+    }
+    conn = iter->second;
+  }
+
+  RGWGetExtraDataCB cb;
+  map<string, string> req_headers;
+  real_time set_mtime;
+
+  const real_time *pmod = mod_ptr;
+
+  obj_time_weight dest_mtime_weight;
+
+  constexpr bool prepend_meta = true;
+  constexpr bool get_op = true;
+  constexpr bool rgwx_stat = true;
+  constexpr bool sync_manifest = true;
+  constexpr bool skip_decrypt = true;
+  int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
+                      dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
+                      prepend_meta, get_op, rgwx_stat,
+                      sync_manifest, skip_decrypt,
+                      true, &cb, &in_stream_req);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize,
+                               nullptr, pheaders, null_yield);
+  if (ret < 0) {
+    return ret;
+  }
+
+  bufferlist& extra_data_bl = cb.get_extra_data();
+  if (extra_data_bl.length()) {
+    JSONParser jp;
+    if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
+      ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
+      return -EIO;
+    }
+
+    JSONDecoder::decode_json("attrs", src_attrs, &jp);
+
+    src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
+  }
+
+  if (src_mtime) {
+    *src_mtime = set_mtime;
+  }
+
+  if (petag) {
+    map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
+    if (iter != src_attrs.end()) {
+      bufferlist& etagbl = iter->second;
+      *petag = etagbl.to_str();
+      while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
+        *petag = petag->substr(0, petag->size() - 1);
+      }
+    }
+  }
+
+  if (pattrs) {
+    *pattrs = std::move(src_attrs);
+  }
+
+  return 0;
+}
+
+int RGWFetchObjFilter_Default::filter(CephContext *cct,
+                                      const rgw_obj_key& source_key,
+                                      const RGWBucketInfo& dest_bucket_info,
+                                      std::optional<rgw_placement_rule> dest_placement_rule,
+                                      const map<string, bufferlist>& obj_attrs,
+                                     std::optional<rgw_user> *poverride_owner,
+                                      const rgw_placement_rule **prule)
+{
+  const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
+  if (!ptail_rule) {
+    auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
+    if (iter != obj_attrs.end()) {
+      dest_rule.storage_class = iter->second.to_str();
+      dest_rule.inherit_from(dest_bucket_info.placement_rule);
+      ptail_rule = &dest_rule;
+    } else {
+      ptail_rule = &dest_bucket_info.placement_rule;
+    }
+  }
+  *prule = ptail_rule;
+  return 0;
+}
+
+// Fetch an object from a remote zone/zonegroup and write it locally through
+// an AtomicObjectProcessor. Handles attribute rewriting (owner override,
+// ACL owner, delete-at, replication status), optional recompression, ETag
+// verification, and — with copy_if_newer — a conditional fetch plus a retry
+// loop that races against concurrent local writes.
+int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
+               const rgw_user& user_id,
+               req_info *info,
+               const rgw_zone_id& source_zone,
+               rgw::sal::Object* dest_obj,
+               rgw::sal::Object* src_obj,
+               rgw::sal::Bucket* dest_bucket,
+               rgw::sal::Bucket* src_bucket,
+               std::optional<rgw_placement_rule> dest_placement_rule,
+               real_time *src_mtime,
+               real_time *mtime,
+               const real_time *mod_ptr,
+               const real_time *unmod_ptr,
+               bool high_precision_time,
+               const char *if_match,
+               const char *if_nomatch,
+               AttrsMod attrs_mod,
+               bool copy_if_newer,
+               rgw::sal::Attrs& attrs,
+               RGWObjCategory category,
+               std::optional<uint64_t> olh_epoch,
+              real_time delete_at,
+               string *ptag,
+               string *petag,
+               void (*progress_cb)(off_t, void *),
+               void *progress_data,
+               const DoutPrefixProvider *dpp,
+               RGWFetchObjFilter *filter,
+               rgw_zone_set *zones_trace,
+               std::optional<uint64_t>* bytes_transferred)
+{
+  /* source is in a different zonegroup, copy from there */
+
+  RGWRESTStreamRWRequest *in_stream_req;
+  string tag;
+  int i;
+  append_rand_alpha(cct, tag, tag, 32);
+  obj_time_weight set_mtime_weight;
+  set_mtime_weight.high_precision = high_precision_time;
+  int ret;
+
+  // throttled async IO for the local writes, tagged atomic write processor
+  rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
+  using namespace rgw::putobj;
+  AtomicObjectProcessor processor(&aio, this->driver, nullptr, user_id,
+                                  obj_ctx, dest_obj->clone(), olh_epoch,
+                                 tag, dpp, null_yield);
+  // pick the connection: explicit source zone wins; otherwise the source
+  // bucket's zonegroup, defaulting to the master zonegroup
+  RGWRESTConn *conn;
+  auto& zone_conn_map = svc.zone->get_zone_conn_map();
+  auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
+  if (source_zone.empty()) {
+    if (!src_bucket || src_bucket->get_info().zonegroup.empty()) {
+      /* source is in the master zonegroup */
+      conn = svc.zone->get_master_conn();
+    } else {
+      map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket->get_info().zonegroup);
+      if (iter == zonegroup_conn_map.end()) {
+        // NOTE(review): source_zone is empty in this branch, so this message
+        // prints an empty id; logging src_bucket->get_info().zonegroup (the
+        // key actually looked up) would be more useful
+        ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
+        return -ENOENT;
+      }
+      conn = iter->second;
+    }
+  } else {
+    auto iter = zone_conn_map.find(source_zone);
+    if (iter == zone_conn_map.end()) {
+      ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
+      return -ENOENT;
+    }
+    conn = iter->second;
+  }
+
+  boost::optional<RGWPutObj_Compress> compressor;
+  CompressorRef plugin;
+
+  RGWFetchObjFilter_Default source_filter;
+  if (!filter) {
+    filter = &source_filter;
+  }
+
+  std::optional<rgw_user> override_owner;
+
+  // the attrs_handler lambda runs once the source attrs arrive: apply the
+  // fetch filter (placement/owner), pick a compression plugin for the
+  // chosen placement, and prepare the processor before any data is written
+  RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
+                    [&](map<string, bufferlist>& obj_attrs) {
+                      const rgw_placement_rule *ptail_rule;
+
+                      int ret = filter->filter(cct,
+                                               src_obj->get_key(),
+                                               dest_bucket->get_info(),
+                                               dest_placement_rule,
+                                               obj_attrs,
+                                              &override_owner,
+                                               &ptail_rule);
+                      if (ret < 0) {
+                        ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
+                        return ret;
+                      }
+
+                      processor.set_tail_placement(*ptail_rule);
+
+                      const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
+                      if (compression_type != "none") {
+                        plugin = Compressor::create(cct, compression_type);
+                        if (!plugin) {
+                          // non-fatal: continue uncompressed
+                          ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
+                                        << compression_type << dendl;
+                        }
+                      }
+
+                      ret = processor.prepare(null_yield);
+                      if (ret < 0) {
+                        return ret;
+                      }
+                      return 0;
+                    });
+
+  string etag;
+  real_time set_mtime;
+  uint64_t expected_size = 0;
+
+  RGWObjState *dest_state = NULL;
+  RGWObjManifest *manifest = nullptr;
+
+  const real_time *pmod = mod_ptr;
+
+  obj_time_weight dest_mtime_weight;
+
+  if (copy_if_newer) {
+    /* need to get mtime for destination */
+    // seed the conditional GET with the destination's current mtime so an
+    // unmodified source comes back as -ERR_NOT_MODIFIED
+    ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield);
+    if (ret < 0)
+      goto set_err_state;
+
+    if (!real_clock::is_zero(dest_state->mtime)) {
+      dest_mtime_weight.init(dest_state);
+      pmod = &dest_mtime_weight.mtime;
+    }
+  }
+
+  // full GET with metadata prefix; payload streams through cb's filter chain
+  static constexpr bool prepend_meta = true;
+  static constexpr bool get_op = true;
+  static constexpr bool rgwx_stat = false;
+  static constexpr bool sync_manifest = true;
+  static constexpr bool skip_decrypt = true;
+  ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
+                      dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
+                      prepend_meta, get_op, rgwx_stat,
+                      sync_manifest, skip_decrypt,
+                      true,
+                      &cb, &in_stream_req);
+  if (ret < 0) {
+    goto set_err_state;
+  }
+
+  ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
+                               &expected_size, nullptr, nullptr, null_yield);
+  if (ret < 0) {
+    goto set_err_state;
+  }
+  ret = cb.flush();
+  if (ret < 0) {
+    goto set_err_state;
+  }
+  // guard against truncated transfers
+  if (cb.get_data_len() != expected_size) {
+    ret = -EIO;
+    ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected "
+        << expected_size << " bytes but received " << cb.get_data_len() << dendl;
+    goto set_err_state;
+  }
+  // record local (re)compression metadata on the destination copy
+  if (compressor && compressor->is_compressed()) {
+    bufferlist tmp;
+    RGWCompressionInfo cs_info;
+    cs_info.compression_type = plugin->get_type_name();
+    cs_info.orig_size = cb.get_data_len();
+    cs_info.compressor_message = compressor->get_compressor_message();
+    cs_info.blocks = move(compressor->get_compression_blocks());
+    encode(cs_info, tmp);
+    cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
+  }
+
+  // when the fetch filter overrode the owner, rewrite both the processor's
+  // owner and the stored ACL's owner to match
+  if (override_owner) {
+    processor.set_owner(*override_owner);
+
+    auto& obj_attrs = cb.get_attrs();
+
+    RGWUserInfo owner_info;
+    if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) {
+      ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
+      return -EINVAL;
+    }
+
+    RGWAccessControlPolicy acl;
+
+    auto aiter = obj_attrs.find(RGW_ATTR_ACL);
+    if (aiter == obj_attrs.end()) {
+      ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
+      acl.create_default(owner_info.user_id, owner_info.display_name);
+    } else {
+      auto iter = aiter->second.cbegin();
+      try {
+       acl.decode(iter);
+      } catch (buffer::error& err) {
+       ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
+       return -EIO;
+      }
+    }
+
+    ACLOwner new_owner;
+    new_owner.set_id(*override_owner);
+    new_owner.set_name(owner_info.display_name);
+
+    acl.set_owner(new_owner);
+
+    bufferlist bl;
+    acl.encode(bl);
+    obj_attrs[RGW_ATTR_ACL] = std::move(bl);
+  }
+
+  if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
+    cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
+  } else {
+    // cross-zone: honor the source's delete-at, if it decodes
+    map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
+    if (iter != cb.get_attrs().end()) {
+      try {
+        decode(delete_at, iter->second);
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
+      }
+    }
+  }
+
+  if (src_mtime) {
+    *src_mtime = set_mtime;
+  }
+
+  if (petag) {
+    const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
+    if (iter != cb.get_attrs().end()) {
+      *petag = iter->second.to_str();
+    }
+  }
+
+  //erase the append attr
+  cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
+
+  { // add x-amz-replication-status=REPLICA
+    auto& bl = cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_STATUS];
+    bl.clear(); // overwrite source's status
+    bl.append("REPLICA");
+  }
+
+  // intra-zonegroup copies merge attrs per attrs_mod; cross-zone sync takes
+  // the source attrs wholesale
+  if (source_zone.empty()) {
+    set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
+  } else {
+    attrs = cb.get_attrs();
+  }
+
+  if (copy_if_newer) {
+    // weight of the fetched copy, used below to decide whether to retry
+    // after losing a write race
+    uint64_t pg_ver = 0;
+    auto i = attrs.find(RGW_ATTR_PG_VER);
+    if (i != attrs.end() && i->second.length() > 0) {
+      auto iter = i->second.cbegin();
+      try {
+        decode(pg_ver, iter);
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
+        /* non critical error */
+      }
+    }
+    set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
+  }
+
+  /* Perform ETag verification is we have computed the object's MD5 sum at our end */
+  if (const auto& verifier_etag = cb.get_verifier_etag();
+      !verifier_etag.empty()) {
+    string trimmed_etag = etag;
+
+    /* Remove the leading and trailing double quotes from etag */
+    trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'),
+      trimmed_etag.end());
+
+    if (verifier_etag != trimmed_etag) {
+      ret = -EIO;
+      ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. Expected etag:"
+        << trimmed_etag << " Computed etag:" << verifier_etag << dendl;
+      goto set_err_state;
+    }
+  }
+
+  // complete the atomic write; on a copy_if_newer race, re-read the
+  // destination and retry only while our fetched copy is still newer
+#define MAX_COMPLETE_RETRY 100
+  for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
+    bool canceled = false;
+    ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
+                             attrs, delete_at, nullptr, nullptr, nullptr,
+                             zones_trace, &canceled, null_yield);
+    if (ret < 0) {
+      goto set_err_state;
+    }
+
+    if (copy_if_newer && canceled) {
+      ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl;
+      obj_ctx.invalidate(dest_obj->get_obj()); /* object was overwritten */
+      ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
+        goto set_err_state;
+      }
+      dest_mtime_weight.init(dest_state);
+      dest_mtime_weight.high_precision = high_precision_time;
+      if (!dest_state->exists ||
+        dest_mtime_weight < set_mtime_weight) {
+        ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
+        continue;
+      } else {
+        ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
+      }
+    }
+    break;
+  }
+
+  if (i == MAX_COMPLETE_RETRY) {
+    ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
+    ret = -EIO;
+    goto set_err_state;
+  }
+
+  if (bytes_transferred) {
+    *bytes_transferred = cb.get_data_len();
+  }
+  return 0;
+set_err_state:
+  if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
+    // we may have already fetched during sync of OP_ADD, but were waiting
+    // for OP_LINK_OLH to call set_olh() with a real olh_epoch
+    if (olh_epoch && *olh_epoch > 0) {
+      constexpr bool log_data_change = true;
+      ret = set_olh(dpp, obj_ctx, dest_bucket->get_info(), dest_obj, false, nullptr,
+                    *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
+    } else {
+      // we already have the latest copy
+      ret = 0;
+    }
+  }
+  return ret;
+}
+
+
+/*
+ * Stream a locally stored object to the master zone via an async REST PUT.
+ * astate supplies the object size, src_attrs become the request's object
+ * attributes, and read_op feeds the body. On success the remote-reported
+ * mtime is returned through *mtime. Returns 0 on success, negative error
+ * code otherwise.
+ */
+int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
+                                      RGWObjState *astate,
+                                      map<string, bufferlist>& src_attrs,
+                                      RGWRados::Object::Read& read_op,
+                                      const rgw_user& user_id,
+                                      rgw::sal::Object* dest_obj,
+                                      real_time *mtime)
+{
+  string etag;
+
+  RGWRESTStreamS3PutObj *out_stream_req;
+
+  auto rest_master_conn = svc.zone->get_master_conn();
+
+  int ret = rest_master_conn->put_obj_async_init(dpp, user_id, dest_obj, src_attrs, &out_stream_req);
+  if (ret < 0) {
+    return ret;
+  }
+
+  out_stream_req->set_send_length(astate->size);
+
+  ret = RGWHTTP::send(out_stream_req);
+  if (ret < 0) {
+    delete out_stream_req;
+    return ret;
+  }
+
+  // NOTE(review): the range end (astate->size - 1) looks inclusive; a
+  // zero-size object would wrap around here — confirm read_op.iterate()
+  // tolerates that case.
+  ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
+  if (ret < 0) {
+    delete out_stream_req;
+    return ret;
+  }
+
+  // complete_request is presumed to take ownership of out_stream_req from
+  // here on (no delete on this path) — consistent with the other callers.
+  ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield);
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+/**
+ * Copy an object.
+ * dest_obj: the object to copy into
+ * src_obj: the object to copy from
+ * attrs: usage depends on attrs_mod parameter
+ * attrs_mod: the modification mode of the attrs, may have the following values:
+ *            ATTRSMOD_NONE - the attributes of the source object will be
+ *                            copied without modifications, attrs parameter is ignored;
+ *            ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
+ *                               parameter, source object attributes are not copied;
+ *            ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
+ *                             are overwritten by values contained in attrs parameter.
+ * Errors encountered while reading the source object are propagated through the return value.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
+               const rgw_user& user_id,
+               req_info *info,
+               const rgw_zone_id& source_zone,
+               rgw::sal::Object* dest_obj,
+               rgw::sal::Object* src_obj,
+               rgw::sal::Bucket* dest_bucket,
+               rgw::sal::Bucket* src_bucket,
+               const rgw_placement_rule& dest_placement,
+               real_time *src_mtime,
+               real_time *mtime,
+               const real_time *mod_ptr,
+               const real_time *unmod_ptr,
+               bool high_precision_time,
+               const char *if_match,
+               const char *if_nomatch,
+               AttrsMod attrs_mod,
+               bool copy_if_newer,
+               rgw::sal::Attrs& attrs,
+               RGWObjCategory category,
+               uint64_t olh_epoch,
+              real_time delete_at,
+               string *version_id,
+               string *ptag,
+               string *petag,
+               void (*progress_cb)(off_t, void *),
+               void *progress_data,
+               const DoutPrefixProvider *dpp,
+               optional_yield y)
+{
+  int ret;
+  uint64_t obj_size;
+  rgw_obj shadow_obj = dest_obj->get_obj();
+  string shadow_oid;
+
+  bool remote_src;
+  bool remote_dest;
+
+  // build a randomized shadow oid in the shadow namespace
+  // NOTE(review): shadow_obj is never referenced again below; this looks
+  // like a legacy leftover — confirm before removing
+  append_rand_alpha(cct, dest_obj->get_oid(), shadow_oid, 32);
+  shadow_obj.init_ns(dest_obj->get_bucket()->get_key(), shadow_oid, shadow_ns);
+
+  auto& zonegroup = svc.zone->get_zonegroup();
+
+  // a bucket is "remote" when it belongs to a different zonegroup than ours
+  remote_dest = !zonegroup.equals(dest_bucket->get_info().zonegroup);
+  remote_src = !zonegroup.equals(src_bucket->get_info().zonegroup);
+
+  if (remote_src && remote_dest) {
+    ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
+    return -EINVAL;
+  }
+
+  ldpp_dout(dpp, 5) << "Copy object " << src_obj->get_bucket() << ":" << src_obj->get_oid() << " => " << dest_obj->get_bucket() << ":" << dest_obj->get_oid() << dendl;
+
+  // source lives in another zonegroup (or an explicit source zone was
+  // requested): pull the object over the wire instead of copying locally
+  if (remote_src || !source_zone.empty()) {
+    return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
+               dest_obj, src_obj, dest_bucket, src_bucket,
+               dest_placement, src_mtime, mtime, mod_ptr,
+               unmod_ptr, high_precision_time,
+               if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
+               olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
+               nullptr /* filter */);
+  }
+
+  // local copy: set up a conditional read of the source object
+  map<string, bufferlist> src_attrs;
+  RGWRados::Object src_op_target(this, src_bucket, obj_ctx, src_obj);
+  RGWRados::Object::Read read_op(&src_op_target);
+
+  // conditions (If-(Un)Modified-Since / If-(None-)Match) are evaluated by
+  // read_op.prepare()
+  read_op.conds.mod_ptr = mod_ptr;
+  read_op.conds.unmod_ptr = unmod_ptr;
+  read_op.conds.high_precision_time = high_precision_time;
+  read_op.conds.if_match = if_match;
+  read_op.conds.if_nomatch = if_nomatch;
+  read_op.params.attrs = &src_attrs;
+  read_op.params.lastmod = src_mtime;
+  read_op.params.obj_size = &obj_size;
+
+  ret = read_op.prepare(y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+  if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
+    // Current implementation does not follow S3 spec and even
+    // may result in data corruption silently when copying
+    // multipart objects across pools. So reject COPY operations
+    // on encrypted objects before it is fully functional.
+    ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
+                  << " has not been implemented." << dendl;
+    return -ERR_NOT_IMPLEMENTED;
+  }
+
+  // the destination gets the caller-supplied ACL, and never inherits the
+  // source's expiration
+  src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
+  src_attrs.erase(RGW_ATTR_DELETE_AT);
+
+  // object-lock attrs are only carried over when explicitly supplied by
+  // the caller
+  src_attrs.erase(RGW_ATTR_OBJECT_RETENTION);
+  src_attrs.erase(RGW_ATTR_OBJECT_LEGAL_HOLD);
+  map<string, bufferlist>::iterator rt = attrs.find(RGW_ATTR_OBJECT_RETENTION);
+  if (rt != attrs.end())
+    src_attrs[RGW_ATTR_OBJECT_RETENTION] = rt->second;
+  map<string, bufferlist>::iterator lh = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD);
+  if (lh != attrs.end())
+    src_attrs[RGW_ATTR_OBJECT_LEGAL_HOLD] = lh->second;
+
+  // merge src_attrs into attrs according to attrs_mod (see function
+  // comment); then strip attrs that must not follow the object
+  set_copy_attrs(src_attrs, attrs, attrs_mod);
+  attrs.erase(RGW_ATTR_ID_TAG);
+  attrs.erase(RGW_ATTR_PG_VER);
+  attrs.erase(RGW_ATTR_SOURCE_ZONE);
+  map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
+  if (cmp != src_attrs.end())
+    attrs[RGW_ATTR_COMPRESSION] = cmp->second;
+
+  RGWObjManifest manifest;
+  RGWObjState *astate = NULL;
+  RGWObjManifest *amanifest = nullptr;
+
+  ret = get_obj_state(dpp, &obj_ctx, src_bucket->get_info(), src_obj, &astate, &amanifest, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  vector<rgw_raw_obj> ref_objs;
+
+  if (remote_dest) {
+    /* dest is in a different zonegroup, copy it there */
+    return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime);
+  }
+  uint64_t max_chunk_size;
+
+  ret = get_max_chunk_size(dest_bucket->get_placement_rule(), dest_obj->get_obj(), &max_chunk_size, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj->get_bucket() << dendl;
+    return ret;
+  }
+
+  rgw_pool src_pool;
+  rgw_pool dest_pool;
+
+  // resolve the source placement: prefer the manifest's tail placement,
+  // fall back to the source bucket's rule
+  const rgw_placement_rule *src_rule{nullptr};
+
+  if (amanifest) {
+    src_rule = &amanifest->get_tail_placement().placement_rule;
+    ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
+  }
+
+  if (!src_rule || src_rule->empty()) {
+    src_rule = &src_bucket->get_placement_rule();
+  }
+
+  if (!get_obj_data_pool(*src_rule, src_obj->get_obj(), &src_pool)) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
+    return -EIO;
+  }
+
+  if (!get_obj_data_pool(dest_placement, dest_obj->get_obj(), &dest_pool)) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
+    return -EIO;
+  }
+
+  ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
+                             << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
+
+  // a full data copy is required when the tail can't simply be shared by
+  // refcounting: no manifest, or different placement/pool
+  bool copy_data = (!amanifest) ||
+    (*src_rule != dest_placement) ||
+    (src_pool != dest_pool);
+
+  // copy_first: the head holds inline data, so the first chunk must be
+  // physically copied even when the tail is shared
+  bool copy_first = false;
+  if (amanifest) {
+    if (!amanifest->has_tail()) {
+      copy_data = true;
+    } else {
+      uint64_t head_size = amanifest->get_head_size();
+
+      if (head_size > 0) {
+        if (head_size > max_chunk_size) {
+          copy_data = true;
+        } else {
+          copy_first = true;
+        }
+      }
+    }
+  }
+
+  // report the source etag to the caller if requested
+  if (petag) {
+    const auto iter = attrs.find(RGW_ATTR_ETAG);
+    if (iter != attrs.end()) {
+      *petag = iter->second.to_str();
+    }
+  }
+
+  if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
+    attrs.erase(RGW_ATTR_TAIL_TAG);
+    return copy_obj_data(obj_ctx, dest_bucket, dest_placement, read_op, obj_size - 1, dest_obj,
+                         mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
+  }
+
+  /* This has been in for 2 years, so we can safely assume amanifest is not NULL */
+  RGWObjManifest::obj_iterator miter = amanifest->obj_begin(dpp);
+
+  if (copy_first) { // we need to copy first chunk, not increase refcount
+    ++miter;
+  }
+
+  bufferlist first_chunk;
+
+  const bool copy_itself = (dest_obj->get_obj() == src_obj->get_obj());
+  RGWObjManifest *pmanifest; 
+  ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
+
+  RGWRados::Object dest_op_target(this, dest_bucket, obj_ctx, dest_obj);
+  RGWRados::Object::Write write_op(&dest_op_target);
+
+  // the prefix tag identifies this operation's references on the shared
+  // tail objects (also used for rollback below)
+  string tag;
+
+  if (ptag) {
+    tag = *ptag;
+  }
+
+  if (tag.empty()) {
+    append_rand_alpha(cct, tag, tag, 32);
+  }
+
+  std::unique_ptr<rgw::Aio> aio;
+  rgw::AioResultList all_results;
+  if (!copy_itself) {
+    // share the tail: take a refcount on every tail rados object,
+    // throttled via aio; all_results records them for possible rollback
+    aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, y);
+    attrs.erase(RGW_ATTR_TAIL_TAG);
+    manifest = *amanifest;
+    const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
+    if (tail_placement.bucket.name.empty()) {
+      manifest.set_tail_placement(tail_placement.placement_rule, src_obj->get_bucket()->get_key());
+    }
+    string ref_tag;
+    for (; miter != amanifest->obj_end(dpp); ++miter) {
+      ObjectWriteOperation op;
+      // refcount tags carry a trailing NUL (cls_refcount convention)
+      ref_tag = tag + '\0';
+      cls_refcount_get(op, ref_tag, true);
+
+      auto obj = svc.rados->obj(miter.get_location().get_raw_obj(driver));
+      ret = obj.open(dpp);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "failed to open rados context for " << obj << dendl;
+        goto done_ret;
+      }
+
+      static constexpr uint64_t cost = 1; // 1 throttle unit per request
+      static constexpr uint64_t id = 0; // ids unused
+      rgw::AioResultList completed = aio->get(obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
+      ret = rgw::check_for_errors(completed);
+      all_results.splice(all_results.end(), completed);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to copy obj=" << obj << ", the error code = " << ret << dendl;
+        goto done_ret;
+      }
+    }
+
+    rgw::AioResultList completed = aio->drain();
+    ret = rgw::check_for_errors(completed);
+    all_results.splice(all_results.end(), completed);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to drain ios, the error code = " << ret <<dendl;
+      goto done_ret;
+    }
+
+    pmanifest = &manifest;
+  } else {
+    // copying onto itself: reuse the manifest as-is
+    pmanifest = amanifest;
+    /* don't send the object's tail for garbage collection */
+    astate->keep_tail = true;
+  }
+
+  if (copy_first) {
+    // physically copy the inline head chunk
+    ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
+    if (ret < 0) {
+      goto done_ret;
+    }
+
+    pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), first_chunk.length());
+  } else {
+    pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), 0);
+  }
+
+  // write the destination head + metadata referencing the (shared) tail
+  write_op.meta.data = &first_chunk;
+  write_op.meta.manifest = pmanifest;
+  write_op.meta.ptag = &tag;
+  write_op.meta.owner = dest_bucket->get_info().owner;
+  write_op.meta.mtime = mtime;
+  write_op.meta.flags = PUT_OBJ_CREATE;
+  write_op.meta.category = category;
+  write_op.meta.olh_epoch = olh_epoch;
+  write_op.meta.delete_at = delete_at;
+  write_op.meta.modify_tail = !copy_itself;
+
+  ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
+  if (ret < 0) {
+    goto done_ret;
+  }
+
+  return 0;
+
+done_ret:
+  // error path: roll back every tail reference successfully taken above
+  if (!copy_itself) {
+
+    /* wait all pending op done */
+    rgw::AioResultList completed = aio->drain();
+    all_results.splice(all_results.end(), completed);
+
+    /* rollback reference */
+    string ref_tag = tag + '\0';
+    int ret2 = 0;
+    for (auto& r : all_results) {
+      if (r.result < 0) {
+        continue; // skip errors
+      }
+      ObjectWriteOperation op;
+      cls_refcount_put(op, ref_tag, true);
+
+      static constexpr uint64_t cost = 1; // 1 throttle unit per request
+      static constexpr uint64_t id = 0; // ids unused
+      rgw::AioResultList completed = aio->get(r.obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
+      ret2 = rgw::check_for_errors(completed);
+      if (ret2 < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << r.obj << dendl;
+      }
+    }
+    completed = aio->drain();
+    ret2 = rgw::check_for_errors(completed);
+    if (ret2 < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to drain rollback ios, the error code = " << ret2 <<dendl;
+    }
+  }
+  return ret;
+}
+
+
+int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
+               rgw::sal::Bucket* bucket,
+               const rgw_placement_rule& dest_placement,
+              RGWRados::Object::Read& read_op, off_t end,
+               rgw::sal::Object* dest_obj,
+              real_time *mtime,
+              real_time set_mtime,
+               rgw::sal::Attrs& attrs,
+               uint64_t olh_epoch,
+              real_time delete_at,
+               string *petag,
+               const DoutPrefixProvider *dpp,
+               optional_yield y)
+{
+  // Stream the source object's bytes [0, end] (end is the inclusive last
+  // byte offset, as passed by callers as obj_size - 1) through an atomic
+  // put processor into dest_obj, then complete with the given attrs.
+  string tag;
+  append_rand_alpha(cct, tag, tag, 32);
+
+  rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
+  using namespace rgw::putobj;
+  // do not change the null_yield in the initialization of this AtomicObjectProcessor
+  // it causes crashes in the ragweed tests
+  AtomicObjectProcessor processor(&aio, this->driver, &dest_placement,
+                                  bucket->get_info().owner, obj_ctx,
+                                  dest_obj->clone(), olh_epoch, tag,
+                                 dpp, null_yield);
+  int ret = processor.prepare(y);
+  if (ret < 0)
+    return ret;
+
+  off_t ofs = 0;
+
+  do {
+    // read() returns the number of bytes read (>= 0) or a negative error
+    bufferlist bl;
+    ret = read_op.read(ofs, end, bl, y, dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
+      return ret;
+    }
+
+    uint64_t read_len = ret;
+    ret = processor.process(std::move(bl), ofs);
+    if (ret < 0) {
+      return ret;
+    }
+
+    ofs += read_len;
+  } while (ofs <= end);
+
+  // flush: an empty buffer signals end-of-data to the processor
+  ret = processor.process({}, ofs);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // propagate the source etag into the completion (and to the caller)
+  string etag;
+  auto iter = attrs.find(RGW_ATTR_ETAG);
+  if (iter != attrs.end()) {
+    bufferlist& bl = iter->second;
+    etag = bl.to_str();
+    if (petag) {
+      *petag = etag;
+    }
+  }
+
+  uint64_t accounted_size;
+  {
+    bool compressed{false};
+    RGWCompressionInfo cs_info;
+    ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
+      return ret;
+    }
+    // pass original size if compressed
+    accounted_size = compressed ? cs_info.orig_size : ofs;
+  }
+
+  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+                            nullptr, nullptr, nullptr, nullptr, nullptr, y);
+}
+
+int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
+                            rgw::sal::Bucket* bucket,
+                             rgw::sal::Object& obj,
+                             const rgw_placement_rule& placement_rule,
+                             const real_time& mtime,
+                             uint64_t olh_epoch,
+                             const DoutPrefixProvider *dpp,
+                             optional_yield y)
+{
+  rgw::sal::Attrs attrs;
+  real_time read_mtime;
+  uint64_t obj_size;
+
+  obj.set_atomic();
+  RGWRados::Object op_target(this, bucket, obj_ctx, &obj);
+  RGWRados::Object::Read read_op(&op_target);
+
+  read_op.params.attrs = &attrs;
+  read_op.params.lastmod = &read_mtime;
+  read_op.params.obj_size = &obj_size;
+
+  int ret = read_op.prepare(y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (read_mtime != mtime) {
+    /* raced */
+    return -ECANCELED;
+  }
+
+  attrs.erase(RGW_ATTR_ID_TAG);
+  attrs.erase(RGW_ATTR_TAIL_TAG);
+
+  ret = copy_obj_data(obj_ctx,
+                      bucket,
+                      placement_rule,
+                      read_op,
+                      obj_size - 1,
+                      &obj,
+                      nullptr /* pmtime */,
+                      mtime,
+                      attrs,
+                      olh_epoch,
+                      real_time(),
+                      nullptr /* petag */,
+                      dpp,
+                      y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return 0;
+}
+
+int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
+{
+  constexpr uint NUM_ENTRIES = 1000u;
+
+  rgw_obj_index_key marker;
+  string prefix;
+  bool is_truncated;
+
+  do {
+    std::vector<rgw_bucket_dir_entry> ent_list;
+    ent_list.reserve(NUM_ENTRIES);
+
+    int r = cls_bucket_list_unordered(dpp,
+                                      bucket_info,
+                                      bucket_info.layout.current_index,
+                                      RGW_NO_SHARD,
+                                     marker,
+                                     prefix,
+                                     NUM_ENTRIES,
+                                     true,
+                                     ent_list,
+                                     &is_truncated,
+                                     &marker,
+                                      y);
+    if (r < 0) {
+      return r;
+    }
+
+    string ns;
+    for (auto const& dirent : ent_list) {
+      rgw_obj_key obj;
+
+      if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
+        return -ENOTEMPTY;
+      }
+    }
+  } while (is_truncated);
+
+  return 0;
+}
+  
+/**
+ * Delete a bucket.
+ * bucket: the name of the bucket to delete
+ * Returns 0 on success, -ERR# otherwise.
+ */
+int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty)
+{
+  const rgw_bucket& bucket = bucket_info.bucket;
+  RGWSI_RADOS::Pool index_pool;
+  map<int, string> bucket_objs;
+  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+  if (r < 0)
+    return r;
+  
+  if (check_empty) {
+    r = check_bucket_empty(dpp, bucket_info, y);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  bool remove_ep = true;
+
+  if (objv_tracker.read_version.empty()) {
+    RGWBucketEntryPoint ep;
+    r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
+                                                &ep,
+                                               null_yield,
+                                                dpp,
+                                                RGWBucketCtl::Bucket::GetParams()
+                                                .set_objv_tracker(&objv_tracker));
+    if (r < 0 ||
+        (!bucket_info.bucket.bucket_id.empty() &&
+         ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
+      if (r != -ENOENT) {
+        ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
+        /* we have no idea what caused the error, will not try to remove it */
+      }
+      /* 
+       * either failed to read bucket entrypoint, or it points to a different bucket instance than
+       * requested
+       */
+      remove_ep = false;
+    }
+  }
+  if (remove_ep) {
+    r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp,
+                                                  RGWBucketCtl::Bucket::RemoveParams()
+                                                  .set_objv_tracker(&objv_tracker));
+    if (r < 0)
+      return r;
+  }
+
+  /* if the bucket is not synced we can remove the meta file */
+  if (!svc.zone->is_syncing_bucket_meta(bucket)) {
+    RGWObjVersionTracker objv_tracker;
+    r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp);
+    if (r < 0) {
+      return r;
+    }
+
+   /* remove bucket index objects asynchronously by best effort */
+    (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
+                                      bucket_objs,
+                                      cct->_conf->rgw_bucket_index_max_aio)();
+  }
+
+  return 0;
+}
+
+int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp)
+{
+  RGWBucketInfo info;
+  map<string, bufferlist> attrs;
+  int r;
+
+  if (bucket.bucket_id.empty()) {
+    r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
+  } else {
+    r = get_bucket_instance_info(bucket, info, nullptr, &attrs, null_yield, dpp);
+  }
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
+    return r;
+  }
+
+  info.owner = owner.get_id();
+
+  r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+
+int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp)
+{
+  int ret = 0;
+
+  vector<rgw_bucket>::iterator iter;
+
+  for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
+    rgw_bucket& bucket = *iter;
+    if (enabled) {
+      ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl;
+    } else {
+      ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl;
+    }
+
+    RGWBucketInfo info;
+    map<string, bufferlist> attrs;
+    int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
+      ret = r;
+      continue;
+    }
+    if (enabled) {
+      info.flags &= ~BUCKET_SUSPENDED;
+    } else {
+      info.flags |= BUCKET_SUSPENDED;
+    }
+
+    r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
+      ret = r;
+      continue;
+    }
+  }
+  return ret;
+}
+
+int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended)
+{
+  // Report whether the bucket's instance info carries the SUSPENDED flag.
+  RGWBucketInfo info;
+  int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp);
+  if (r < 0) {
+    return r;
+  }
+
+  *suspended = (info.flags & BUCKET_SUSPENDED) != 0;
+  return 0;
+}
+
+int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp)
+{
+  // After an atomic overwrite/removal, hand the object's old tail
+  // rados objects to garbage collection (or delete them inline when GC
+  // is unavailable). No-op when there is no manifest, or when the tail
+  // must be kept (keep_tail is set e.g. by copy_obj when an object is
+  // copied onto itself and shares its tail).
+  if ((!manifest)|| state->keep_tail)
+    return 0;
+
+  cls_rgw_obj_chain chain;
+  store->update_gc_chain(dpp, obj->get_obj(), *manifest, &chain);
+
+  if (chain.empty()) {
+    return 0;
+  }
+
+  // the tail tag names the refcount held on the tail objects; fall back
+  // to the object tag for objects written before tail tags existed
+  string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
+  if (store->gc == nullptr) {
+    ldpp_dout(dpp, 0) << "deleting objects inline since gc isn't initialized" << dendl;
+    //Delete objects inline just in case gc hasn't been initialised, prevents crashes
+    store->delete_objs_inline(dpp, chain, tag);
+  } else {
+    auto [ret, leftover_chain] = store->gc->send_split_chain(chain, tag); // do it synchronously
+    if (ret < 0 && leftover_chain) {
+      //Delete objects inline if send chain to gc fails
+      store->delete_objs_inline(dpp, *leftover_chain, tag);
+    }
+  }
+  return 0;
+}
+
+void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
+{
+  RGWObjManifest::obj_iterator iter;
+  rgw_raw_obj raw_head;
+  obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
+  for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) {
+    const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(driver);
+    if (mobj == raw_head)
+      continue;
+    cls_rgw_obj_key key(mobj.oid);
+    chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
+  }
+}
+
+std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
+{
+  // Hand a non-empty chain to the garbage collector; the GC may return
+  // the portion it failed to queue so the caller can handle it inline.
+  if (!chain.empty()) {
+    return gc->send_split_chain(chain, tag);
+  }
+  return {0, std::nullopt};
+}
+
+void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
+{
+  string last_pool;
+  std::unique_ptr<IoCtx> ctx(new IoCtx);
+  int ret = 0;
+  for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
+    cls_rgw_obj& obj = *liter;
+    if (obj.pool != last_pool) {
+      ctx.reset(new IoCtx);
+      ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx);
+      if (ret < 0) {
+        last_pool = "";
+        ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" <<
+        obj.pool << dendl;
+        continue;
+      }
+      last_pool = obj.pool;
+    }
+    ctx->locator_set_key(obj.loc);
+    const string& oid = obj.key.name; /* just stored raw oid there */
+    ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool <<
+    ":" << obj.key.name << dendl;
+    ObjectWriteOperation op;
+    cls_refcount_put(op, tag, true);
+    ret = ctx->operate(oid, &op);
+    if (ret < 0) {
+      ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
+    }
+  }
+}
+
+static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
+                                 map<RGWObjCategory, RGWStorageStats>& stats)
+{
+  for (const auto& pair : header.stats) {
+    const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
+    const rgw_bucket_category_stats& header_stats = pair.second;
+
+    RGWStorageStats& s = stats[category];
+
+    s.category = category;
+    s.size += header_stats.total_size;
+    s.size_rounded += header_stats.total_size_rounded;
+    s.size_utilized += header_stats.actual_size;
+    s.num_objects += header_stats.num_entries;
+  }
+}
+
+int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+                                map<RGWObjCategory, RGWStorageStats> *existing_stats,
+                                map<RGWObjCategory, RGWStorageStats> *calculated_stats)
+{
+  RGWSI_RADOS::Pool index_pool;
+
+  // key - bucket index object id
+  // value - bucket index check OP returned result with the given bucket index object (shard)
+  map<int, string> oids;
+
+  int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &oids, nullptr);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // declare and pre-populate
+  map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
+  for (auto& iter : oids) {
+    bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret());
+  }
+
+  ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
+  if (ret < 0) {
+    return ret;
+  }
+
+  // aggregate results (from different shards if there are any)
+  for (const auto& iter : bucket_objs_ret) {
+    accumulate_raw_stats(iter.second.existing_header, *existing_stats);
+    accumulate_raw_stats(iter.second.calculated_header, *calculated_stats);
+  }
+
+  return 0;
+}
+
+int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
+{
+  // Issue the index-rebuild class op across all shards of the bucket's
+  // current index.
+  RGWSI_RADOS::Pool index_pool;
+  map<int, string> shard_oids;
+
+  int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &shard_oids, nullptr);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return CLSRGWIssueBucketRebuild(index_pool.ioctx(), shard_oids, cct->_conf->rgw_bucket_index_max_aio)();
+}
+
+int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
+{
+  RGWSI_RADOS::Pool index_pool;
+  map<int, string> bucket_objs;
+
+  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+      ": unable to open bucket index, r=" << r << " (" <<
+      cpp_strerror(-r) << ")" << dendl;
+    return r;
+  }
+
+  r = CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+      ": unable to issue set bucket resharding, r=" << r << " (" <<
+      cpp_strerror(-r) << ")" << dendl;
+  }
+  return r;
+}
+
+int RGWRados::defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, optional_yield y)
+{
+  std::string oid, key;
+  get_obj_bucket_and_oid_loc(obj->get_obj(), oid, key);
+  if (!rctx)
+    return 0;
+
+  RGWObjState *state = NULL;
+  RGWObjManifest *manifest = nullptr;
+
+  int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, &manifest, false, y);
+  if (r < 0)
+    return r;
+
+  if (!state->is_atomic) {
+    ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
+    return -EINVAL;
+  }
+
+  string tag;
+
+  if (state->tail_tag.length() > 0) {
+    tag = state->tail_tag.c_str();
+  } else if (state->obj_tag.length() > 0) {
+    tag = state->obj_tag.c_str();
+  } else {
+    ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
+    return -EINVAL;
+  }
+
+  ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl;
+
+  cls_rgw_obj_chain chain;
+  update_gc_chain(dpp, state->obj, *manifest, &chain);
+  return gc->async_defer_chain(tag, chain);
+}
+
+void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
+{
+  // Append a head-object removal to the rados op, clearing any
+  // OLH-prefixed xattrs as part of the same operation.
+  list<string> prefixes{RGW_ATTR_OLH_PREFIX};
+  cls_rgw_remove_obj(op, prefixes);
+}
+
+// Thin wrapper: append a cls_rgw xattr-prefix existence check to the
+// rados op. The precise match/fail semantics (including the meaning of
+// fail_if_exist) live in cls_rgw_obj_check_attrs_prefix.
+void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
+{
+  cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
+}
+
+// Thin wrapper: append a cls_rgw mtime comparison (of kind `type`, e.g.
+// CLS_RGW_CHECK_TIME_MTIME_LE as used by Delete::delete_obj) to the
+// rados op, so the op fails unless the object's mtime satisfies it.
+void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
+{
+  cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
+}
+
+// Snapshot of an object's identifying state (mtime, zone short id, pg
+// version) captured from an RGWObjState — presumably cached as a
+// "tombstone" after deletion; its consumers are elsewhere in this file.
+struct tombstone_entry {
+  ceph::real_time mtime;    // object mtime at capture time
+  uint32_t zone_short_id;   // short id of the object's zone
+  uint64_t pg_ver;          // pg version of the object's head
+
+  tombstone_entry() = default;
+  // capture directly from an object's in-memory state
+  explicit tombstone_entry(const RGWObjState& state)
+    : mtime(state.mtime), zone_short_id(state.zone_short_id),
+      pg_ver(state.pg_ver) {}
+};
+
+/**
+ * Delete an object.
+ * The target object and its bucket are taken from the enclosing
+ * RGWRados::Object::Delete state (not passed as parameters).
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp)
+{
+  RGWRados *store = target->get_store();
+  const string& instance = target->get_instance();
+  rgw_obj obj = target->get_obj();
+
+  if (instance == "null") {
+    obj.key.instance.clear();
+  }
+
+  bool explicit_marker_version = (!params.marker_version_id.empty());
+
+  if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
+    if (instance.empty() || explicit_marker_version) {
+      std::unique_ptr<rgw::sal::Object> marker = target->get_target()->clone();
+      marker->clear_instance();
+
+      if (!params.marker_version_id.empty()) {
+        if (params.marker_version_id != "null") {
+          marker->set_instance(params.marker_version_id);
+        }
+      } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
+       marker->gen_rand_obj_instance_name();
+      }
+
+      result.version_id = marker->get_instance();
+      if (result.version_id.empty())
+        result.version_id = "null";
+      result.delete_marker = true;
+
+      struct rgw_bucket_dir_entry_meta meta;
+
+      meta.owner = params.obj_owner.get_id().to_str();
+      meta.owner_display_name = params.obj_owner.get_display_name();
+
+      if (real_clock::is_zero(params.mtime)) {
+        meta.mtime = real_clock::now();
+      } else {
+        meta.mtime = params.mtime;
+      }
+
+      int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker.get(), true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
+      if (r < 0) {
+        return r;
+      }
+    } else {
+      rgw_bucket_dir_entry dirent;
+
+      int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent);
+      if (r < 0) {
+        return r;
+      }
+      result.delete_marker = dirent.is_delete_marker();
+      r = store->unlink_obj_instance(dpp, target->get_bucket_info(), target->get_target(), params.olh_epoch, y, params.zones_trace);
+      if (r < 0) {
+        return r;
+      }
+      result.version_id = instance;
+    }
+
+    BucketShard *bs = nullptr;
+    int r = target->get_bucket_shard(&bs, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl;
+      return r;
+    }
+
+    add_datalog_entry(dpp, store->svc.datalog_rados,
+                      target->get_bucket_info(), bs->shard_id);
+
+    return 0;
+  }
+
+  rgw_rados_ref ref;
+  int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  RGWObjState *state;
+  RGWObjManifest *manifest = nullptr;
+  r = target->get_state(dpp, &state, &manifest, false, y);
+  if (r < 0)
+    return r;
+
+  ObjectWriteOperation op;
+
+  if (!real_clock::is_zero(params.unmod_since)) {
+    struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
+    struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
+    if (!params.high_precision_time) {
+      ctime.tv_nsec = 0;
+      unmod.tv_nsec = 0;
+    }
+
+    ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
+    if (ctime > unmod) {
+      return -ERR_PRECONDITION_FAILED;
+    }
+
+    /* only delete object if mtime is less than or equal to params.unmod_since */
+    store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
+  }
+  uint64_t obj_accounted_size = state->accounted_size;
+
+  if(params.abortmp) {
+    obj_accounted_size = params.parts_accounted_size;
+  }
+
+  if (!real_clock::is_zero(params.expiration_time)) {
+    bufferlist bl;
+    real_time delete_at;
+
+    if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
+      try {
+        auto iter = bl.cbegin();
+        decode(delete_at, iter);
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
+       return -EIO;
+      }
+
+      if (params.expiration_time != delete_at) {
+        return -ERR_PRECONDITION_FAILED;
+      }
+    } else {
+      return -ERR_PRECONDITION_FAILED;
+    }
+  }
+
+  if (!state->exists) {
+    target->invalidate_state();
+    return -ENOENT;
+  }
+
+  r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y);
+  if (r < 0)
+    return r;
+
+  RGWBucketInfo& bucket_info = target->get_bucket_info();
+
+  RGWRados::Bucket bop(store, bucket_info);
+  RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
+  
+  index_op.set_zones_trace(params.zones_trace);
+  index_op.set_bilog_flags(params.bilog_flags);
+
+  r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y);
+  if (r < 0)
+    return r;
+
+  store->remove_rgw_head_obj(op);
+
+  auto& ioctx = ref.pool.ioctx();
+  r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y);
+
+  /* raced with another operation, object state is indeterminate */
+  const bool need_invalidate = (r == -ECANCELED);
+
+  int64_t poolid = ioctx.get_id();
+  if (r >= 0) {
+    tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
+    if (obj_tombstone_cache) {
+      tombstone_entry entry{*state};
+      obj_tombstone_cache->add(obj, entry);
+    }
+    r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs);
+    
+    int ret = target->complete_atomic_modification(dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
+    }
+    /* other than that, no need to propagate error */
+  } else {
+    int ret = index_op.cancel(dpp, params.remove_objs);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
+    }
+  }
+
+  if (need_invalidate) {
+    target->invalidate_state();
+  }
+
+  if (r < 0)
+    return r;
+
+  /* update quota cache */
+  store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
+
+  return 0;
+}
+
+int RGWRados::delete_obj(rgw::sal::Driver* store,
+                        const DoutPrefixProvider *dpp,
+                         const RGWBucketInfo& bucket_info,
+                         const rgw_obj& obj,
+                         int versioning_status, // versioning flags defined in enum RGWBucketFlags
+                         uint16_t bilog_flags,
+                         const real_time& expiration_time,
+                         rgw_zone_set *zones_trace)
+{
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  store->get_bucket(nullptr, bucket_info, &bucket);
+  std::unique_ptr<rgw::sal::Object> object = bucket->get_object(obj.key);
+
+  return delete_obj(dpp, bucket_info, object.get(), versioning_status,
+                   bilog_flags, expiration_time, zones_trace);
+}
+
+int RGWRados::delete_obj(const DoutPrefixProvider *dpp,
+                         const RGWBucketInfo& bucket_info,
+                         rgw::sal::Object* obj,
+                         int versioning_status, // versioning flags defined in enum RGWBucketFlags
+                         uint16_t bilog_flags,
+                         const real_time& expiration_time,
+                         rgw_zone_set *zones_trace)
+{
+  std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = obj->get_delete_op();
+
+  del_op->params.bucket_owner = bucket_info.owner;
+  del_op->params.versioning_status = versioning_status;
+  del_op->params.bilog_flags = bilog_flags;
+  del_op->params.expiration_time = expiration_time;
+  del_op->params.zones_trace = zones_trace;
+
+  return del_op->delete_obj(dpp, null_yield);
+}
+
+// Remove a raw rados object directly; no bucket index is involved.
+int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
+{
+  rgw_rados_ref ref;
+  int r = get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  ObjectWriteOperation op;
+  op.remove();
+
+  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+  return (r < 0) ? r : 0;
+}
+
+int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp)
+{
+  std::string oid, key;
+  get_obj_bucket_and_oid_loc(obj, oid, key);
+
+  RGWBucketInfo bucket_info;
+  int ret = get_bucket_instance_info(obj.bucket, bucket_info, NULL, NULL, null_yield, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  RGWRados::Bucket bop(this, bucket_info);
+  RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
+
+  return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, NULL);
+}
+
+// Synthesize a stable object tag for an object that (due to old bugs)
+// has a manifest but no RGW_ATTR_ID_TAG: "<tail-oid>_<md5(manifest,etag)>".
+// The result is appended to tag_bl including a trailing NUL, matching how
+// real tags are stored.
+// NOTE(review): the dynamic_cast result `rstore` is not null-checked
+// before use; presumably `store` is always a RadosStore here — confirm.
+static void generate_fake_tag(const DoutPrefixProvider *dpp, rgw::sal::Driver* store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
+{
+  string tag;
+
+  RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp);
+  if (mi != manifest.obj_end(dpp)) {
+    if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
+      ++mi;
+    rgw::sal::RadosStore* rstore = dynamic_cast<rgw::sal::RadosStore*>(store);
+    tag = mi.get_location().get_raw_obj(rstore).oid;
+    tag.append("_");
+  }
+
+  unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+  MD5 hash;
+  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+  hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
+
+  // fold the etag into the digest as well, if present
+  map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
+  if (iter != attrset.end()) {
+    bufferlist& bl = iter->second;
+    hash.Update((const unsigned char *)bl.c_str(), bl.length());
+  }
+
+  hash.Final(md5);
+  buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
+  tag.append(md5_str);
+
+  ldpp_dout(dpp, 10) << "generate_fake_tag new tag=" << tag << dendl;
+
+  tag_bl.append(tag.c_str(), tag.size() + 1);
+}
+
+// An object is an OLH (object logical head) iff it carries the OLH info xattr.
+static bool is_olh(map<string, bufferlist>& attrs)
+{
+  return attrs.find(RGW_ATTR_OLH_INFO) != attrs.end();
+}
+
+// True iff the attr set contains an OLH id tag.
+static bool has_olh_tag(map<string, bufferlist>& attrs)
+{
+  return attrs.find(RGW_ATTR_OLH_ID_TAG) != attrs.end();
+}
+
+int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx&
+                                  obj_ctx, RGWBucketInfo& bucket_info,
+                                  rgw::sal::Object* obj, RGWObjState *olh_state,
+                                  RGWObjState **target_state,
+                                  RGWObjManifest **target_manifest, optional_yield y)
+{
+  ceph_assert(olh_state->is_olh);
+
+  rgw_obj target;
+  int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
+  if (r < 0) {
+    return r;
+  }
+
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  driver->get_bucket(nullptr, bucket_info, &bucket);
+  std::unique_ptr<rgw::sal::Object> target_obj = bucket->get_object(target.key);
+
+  r = get_obj_state(dpp, &obj_ctx, bucket_info, target_obj.get(), target_state,
+                   target_manifest, false, y);
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+// Load (or reuse) the cached RGWObjState/RGWObjManifest for `obj`.
+//
+// The per-request cache in `rctx` is consulted first; on a miss the head
+// object is stat'ed and its xattrs decoded: etag (legacy trailing NUL
+// stripped), compression info (for accounted_size), shadow obj, id/tail
+// tags, manifest (with a fake tag generated if the id tag is missing),
+// pg version, source zone and OLH attributes. With `follow_olh` set and
+// no instance on the key, an OLH head is chased to its current target —
+// that path may return -EAGAIN, which get_obj_state() retries. With
+// `assume_noent` the stat is skipped and the object treated as absent
+// (the tombstone cache supplies mtime/zone/pg info for recent deletes).
+int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx,
+                                RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+                                 RGWObjState **state, RGWObjManifest** manifest,
+                                bool follow_olh, optional_yield y, bool assume_noent)
+{
+  if (obj->empty()) {
+    return -EINVAL;
+  }
+
+  bool need_follow_olh = follow_olh && obj->get_obj().key.instance.empty();
+  *manifest = nullptr;
+
+  RGWObjStateManifest *sm = rctx->get_state(obj->get_obj());
+  RGWObjState *s = &(sm->state);
+  ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
+  *state = s;
+  if (sm->manifest) {
+    *manifest = &(*sm->manifest);
+  }
+  if (s->has_attrs) {
+    // cache hit: attrs already decoded; only OLH chasing may remain
+    if (s->is_olh && need_follow_olh) {
+      return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
+    }
+    return 0;
+  }
+
+  s->obj = obj->get_obj();
+
+  rgw_raw_obj raw_obj;
+  obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &raw_obj);
+
+  int r = -ENOENT;
+
+  if (!assume_noent) {
+    r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
+  }
+
+  if (r == -ENOENT) {
+    s->exists = false;
+    s->has_attrs = true;
+    // recently deleted? the tombstone cache preserves mtime/zone/pg info
+    tombstone_entry entry;
+    if (obj_tombstone_cache && obj_tombstone_cache->find(obj->get_obj(), entry)) {
+      s->mtime = entry.mtime;
+      s->zone_short_id = entry.zone_short_id;
+      s->pg_ver = entry.pg_ver;
+      ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
+          << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
+    } else {
+      s->mtime = real_time();
+    }
+    return 0;
+  }
+  if (r < 0)
+    return r;
+
+  s->exists = true;
+  s->has_attrs = true;
+  s->accounted_size = s->size;
+
+  auto iter = s->attrset.find(RGW_ATTR_ETAG);
+  if (iter != s->attrset.end()) {
+    /* get rid of extra null character at the end of the etag, as we used to store it like that */
+    bufferlist& bletag = iter->second;
+    if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
+      bufferlist newbl;
+      bletag.splice(0, bletag.length() - 1, &newbl);
+      bletag = std::move(newbl);
+    }
+  }
+
+  iter = s->attrset.find(RGW_ATTR_COMPRESSION);
+  const bool compressed = (iter != s->attrset.end());
+  if (compressed) {
+    // use uncompressed size for accounted_size
+    try {
+      RGWCompressionInfo info;
+      auto p = iter->second.cbegin();
+      decode(info, p);
+      s->accounted_size = info.orig_size; 
+    } catch (buffer::error&) {
+      ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl;
+      return -EIO;
+    }
+  }
+
+  iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
+  if (iter != s->attrset.end()) {
+    bufferlist bl = iter->second;
+    bufferlist::iterator it = bl.begin();
+    it.copy(bl.length(), s->shadow_obj);
+    s->shadow_obj[bl.length()] = '\0';
+  }
+  s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
+  auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
+  if (ttiter != s->attrset.end()) {
+    s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
+  }
+
+  bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
+  if (manifest_bl.length()) {
+    auto miter = manifest_bl.cbegin();
+    try {
+      sm->manifest.emplace();
+      decode(*sm->manifest, miter);
+      sm->manifest->set_head(bucket_info.placement_rule, obj->get_obj(), s->size); /* patch manifest to reflect the head we just read, some manifests might be
+                                             broken due to old bugs */
+      s->size = sm->manifest->get_obj_size();
+      if (!compressed)
+        s->accounted_size = s->size;
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
+      return -EIO;
+    }
+    *manifest = &(*sm->manifest);
+    ldpp_dout(dpp, 10) << "manifest: total_size = " << sm->manifest->get_obj_size() << dendl;
+    if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
+       sm->manifest->has_explicit_objs()) {
+      RGWObjManifest::obj_iterator mi;
+      for (mi = sm->manifest->obj_begin(dpp); mi != sm->manifest->obj_end(dpp); ++mi) {
+        ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(driver) << dendl;
+      }
+    }
+
+    if (!s->obj_tag.length()) {
+      /*
+       * Uh oh, something's wrong, object with manifest should have tag. Let's
+       * create one out of the manifest, would be unique
+       */
+      generate_fake_tag(dpp, driver, s->attrset, *sm->manifest, manifest_bl, s->obj_tag);
+      s->fake_tag = true;
+    }
+  }
+  map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
+  if (aiter != s->attrset.end()) {
+    bufferlist& pg_ver_bl = aiter->second;
+    if (pg_ver_bl.length()) {
+      auto pgbl = pg_ver_bl.cbegin();
+      try {
+        decode(s->pg_ver, pgbl);
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
+      }
+    }
+  }
+  aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
+  if (aiter != s->attrset.end()) {
+    bufferlist& zone_short_id_bl = aiter->second;
+    if (zone_short_id_bl.length()) {
+      auto zbl = zone_short_id_bl.cbegin();
+      try {
+        decode(s->zone_short_id, zbl);
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
+      }
+    }
+  }
+  if (s->obj_tag.length()) {
+    ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
+  } else {
+    ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
+  }
+
+  /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
+   * it exist, and not only if is_olh() returns true
+   */
+  iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
+  if (iter != s->attrset.end()) {
+    s->olh_tag = iter->second;
+  }
+
+  if (is_olh(s->attrset)) {
+    s->is_olh = true;
+
+    ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
+
+    if (need_follow_olh) {
+      return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
+    } else if (obj->get_obj().key.have_null_instance() && !sm->manifest) {
+      // read null version, and the head object only have olh info
+      s->exists = false;
+      return -ENOENT;
+    }
+  }
+
+  return 0;
+}
+
+int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
+                            bool follow_olh, optional_yield y, bool assume_noent)
+{
+  int ret;
+
+  do {
+    ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, manifest, follow_olh, y, assume_noent);
+  } while (ret == -EAGAIN);
+
+  return ret;
+}
+
+// Fetch the object's manifest (following OLH indirection); the state
+// pointer obtained along the way is discarded.
+int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y)
+{
+  RGWObjState *astate;
+  int r = get_state(dpp, &astate, pmanifest, true, y);
+  return (r < 0) ? r : 0;
+}
+
+// Read a single xattr out of the object's cached state.
+// Returns -ENOENT if the object does not exist, -ENODATA if the attr is
+// absent, 0 on success with the value copied into `dest`.
+int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y)
+{
+  RGWObjState *state;
+  RGWObjManifest *manifest = nullptr;
+
+  int r = source->get_state(dpp, &state, &manifest, true, y);
+  if (r < 0) {
+    return r;
+  }
+  if (!state->exists) {
+    return -ENOENT;
+  }
+  return state->get_attr(name, dest) ? 0 : -ENODATA;
+}
+
+// Issue an async stat2+getxattrs against the object's head. If the sal
+// object already carries attrs, answer synchronously from that cache:
+// state.ret is set to 0 and no completion is created, which wait()
+// recognizes.
+int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp)
+{
+  rgw::sal::Object* target = source->get_target(); 
+  rgw_obj obj = target->get_obj();
+  RGWRados *store = source->get_store();
+
+  result.obj = obj;
+  if (target->has_attrs()) {
+    state.ret = 0;
+    result.size = target->get_obj_size();
+    result.mtime = ceph::real_clock::to_timespec(target->get_mtime());
+    result.attrs = target->get_attrs();
+    //result.manifest = sm->manifest;
+    return 0;
+  }
+
+  string oid;
+  string loc;
+  get_obj_bucket_and_oid_loc(obj, oid, loc);
+
+  int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::ObjectReadOperation op;
+  op.stat2(&result.size, &result.mtime, NULL);
+  op.getxattrs(&result.attrs, NULL);
+  // no callback attached; wait() blocks on and releases the completion
+  state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
+  state.io_ctx.locator_set_key(loc);
+  r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
+  // NOTE(review): on aio_operate() failure the completion is neither
+  // released nor cleared here — confirm callers don't call wait() after
+  // a failed stat_async(), or it may block on a never-issued op.
+  if (r < 0) {
+    ldpp_dout(dpp, 5) << __func__
+                                                  << ": ERROR: aio_operate() returned ret=" << r
+                                                  << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+
+// Block until the async stat completes, then decode its result via
+// finish(). If stat_async() answered from cache (no completion was
+// created), just return the cached result code.
+int RGWRados::Object::Stat::wait(const DoutPrefixProvider *dpp)
+{
+  if (!state.completion) {
+    return state.ret;
+  }
+
+  state.completion->wait_for_complete();
+  state.ret = state.completion->get_return_value();
+  state.completion->release();
+
+  return (state.ret == 0) ? finish(dpp) : state.ret;
+}
+
+int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp)
+{
+  map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
+  if (iter != result.attrs.end()) {
+    bufferlist& bl = iter->second;
+    auto biter = bl.cbegin();
+    try {
+      result.manifest.emplace();
+      decode(*result.manifest, biter);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": failed to decode manifest"  << dendl;
+      return -EIO;
+    }
+  }
+
+  return 0;
+}
+
+// Load the object's state through the sal layer, then append an atomic
+// guard for it to `op` (see the RGWObjState overload below).
+// NOTE(review): `bucket_info` and `pmanifest` are unused here and the
+// manifest is never fetched — confirm callers do not rely on *pmanifest
+// being populated by this overload.
+int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
+                                 RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+                                 ObjectOperation& op, RGWObjState **pstate,
+                                RGWObjManifest** pmanifest, optional_yield y)
+{
+  int r = obj->get_obj_state(dpp, pstate, y, false);
+  if (r < 0)
+    return r;
+
+  return append_atomic_test(dpp, *pstate, op);
+}
+
+int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
+                                 const RGWObjState* state,
+                                 librados::ObjectOperation& op)
+{
+  if (!state->is_atomic) {
+    ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
+    return 0;
+  }
+
+  if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
+    op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
+  } else {
+    ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
+  }
+  return 0;
+}
+
+// Load the object's state via the sal layer and expose the rados-side
+// manifest pointer alongside it. assume_noent is accepted for interface
+// compatibility but not forwarded.
+int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent)
+{
+  int r = obj->get_obj_state(dpp, pstate, y, follow_olh);
+  if (r >= 0) {
+    *pmanifest = static_cast<rgw::sal::RadosObject*>(obj)->get_manifest();
+  }
+  return r;
+}
+
+// Drop the cached object state so the next get_state() re-reads from rados.
+void RGWRados::Object::invalidate_state()
+{
+  obj->invalidate();
+}
+
+// Arm `op` with the guards needed for an atomic modification of this
+// object:
+//  - a cmpxattr on RGW_ATTR_ID_TAG (unless the current tag is fake) so a
+//    racing overwrite makes the op fail with -ECANCELED,
+//  - HTTP-style if-match / if-nomatch etag preconditions,
+//  - create/remove semantics when reset_obj is set,
+//  - a fresh write tag in RGW_ATTR_ID_TAG (and RGW_ATTR_TAIL_TAG when
+//    modify_tail), skipped entirely for removal_op.
+// Also records the chosen write tag in state->write_tag for the index op.
+int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp,
+                                                  ObjectWriteOperation& op, bool reset_obj, const string *ptag,
+                                                  const char *if_match, const char *if_nomatch, bool removal_op,
+                                                  bool modify_tail, optional_yield y)
+{
+  int r = get_state(dpp, &state, &manifest, false, y);
+  if (r < 0)
+    return r;
+
+  bool need_guard = ((manifest) || (state->obj_tag.length() != 0) ||
+                     if_match != NULL || if_nomatch != NULL) &&
+                     (!state->fake_tag);
+
+  if (!state->is_atomic) {
+    ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
+
+    if (reset_obj) {
+      op.create(false);
+      store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
+    }
+
+    return 0;
+  }
+
+  if (need_guard) {
+    /* first verify that the object wasn't replaced under */
+    if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
+      op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag); 
+      // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
+    }
+
+    if (if_match) {
+      if (strcmp(if_match, "*") == 0) {
+        // test the object is existing
+        if (!state->exists) {
+          return -ERR_PRECONDITION_FAILED;
+        }
+      } else {
+        bufferlist bl;
+        if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
+            strncmp(if_match, bl.c_str(), bl.length()) != 0) {
+          return -ERR_PRECONDITION_FAILED;
+        }
+      }
+    }
+
+    if (if_nomatch) {
+      if (strcmp(if_nomatch, "*") == 0) {
+        // test the object is NOT existing
+        if (state->exists) {
+          return -ERR_PRECONDITION_FAILED;
+        }
+      } else {
+        bufferlist bl;
+        if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
+            strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
+          return -ERR_PRECONDITION_FAILED;
+        }
+      }
+    }
+  }
+
+  if (reset_obj) {
+    if (state->exists) {
+      op.create(false);
+      store->remove_rgw_head_obj(op);
+    } else {
+      op.create(true);
+    }
+  }
+
+  if (removal_op) {
+    /* the object is being removed, no need to update its tag */
+    return 0;
+  }
+
+  // pick the new write tag: caller-supplied, or a random 32-char one
+  if (ptag) {
+    state->write_tag = *ptag;
+  } else {
+    append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
+  }
+  bufferlist bl;
+  bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
+
+  ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl;
+
+  op.setxattr(RGW_ATTR_ID_TAG, bl);
+  if (modify_tail) {
+    op.setxattr(RGW_ATTR_TAIL_TAG, bl);
+  }
+
+  return 0;
+}
+
+/**
+ * Set a single attr on an object.
+ *
+ * bucket: name of the bucket holding the object
+ * obj: name of the object to set the attr on
+ * name: the attr to set
+ * bl: the contents of the attr
+ * Returns: 0 on success, -ERR# otherwise.
+ *
+ * Implemented as a one-entry set_attrs() call.
+ */
+int RGWRados::set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl)
+{
+  map<string, bufferlist> attrs = { { name, bl } };
+  return set_attrs(dpp, rctx, bucket_info, obj, attrs, NULL, null_yield);
+}
+
+// Apply (and optionally remove) a set of xattrs on the head object,
+// keeping the bucket index in sync when the object has cached state:
+//  1. clone the sal object and strip a "null" instance (null-version
+//     objects live under the plain key),
+//  2. guard the write with an atomic id-tag test,
+//  3. queue rmxattr/setxattr ops; a RGW_ATTR_DELETE_AT value also feeds
+//     the object expirer hint list,
+//  4. prepare a CLS_RGW_OP_ADD index op with a fresh write tag, execute,
+//     then complete (success) or cancel (failure) the index op,
+//  5. on success, fold the new attrs and tag into the cached RGWObjState.
+int RGWRados::set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* src_obj,
+                        map<string, bufferlist>& attrs,
+                        map<string, bufferlist>* rmattrs,
+                        optional_yield y)
+{
+  std::unique_ptr<rgw::sal::Object> obj = src_obj->clone();
+  if (obj->get_instance() == "null") {
+    obj->clear_instance();
+  }
+
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  ObjectWriteOperation op;
+  RGWObjState *state = NULL;
+  RGWObjManifest *manifest = nullptr;
+
+  r = append_atomic_test(dpp, bucket_info, obj.get(), op, &state, &manifest, y);
+  if (r < 0)
+    return r;
+
+  // ensure null version object exist
+  if (src_obj->get_instance() == "null" && !manifest) {
+    return -ENOENT;
+  }
+
+  map<string, bufferlist>::iterator iter;
+  if (rmattrs) {
+    for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
+      const string& name = iter->first;
+      op.rmxattr(name.c_str());
+    }
+  }
+
+  const rgw_bucket& bucket = obj->get_bucket()->get_key();
+
+  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+    const string& name = iter->first;
+    bufferlist& bl = iter->second;
+
+    // empty values are skipped entirely (not written, not removed)
+    if (!bl.length())
+      continue;
+
+    op.setxattr(name.c_str(), bl);
+
+    if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
+      real_time ts;
+      try {
+        decode(ts, bl);
+
+        rgw_obj_index_key obj_key;
+        obj->get_key().get_index_key(&obj_key);
+
+        // register the object with the expirer so it gets removed at `ts`
+        obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
+      } catch (buffer::error& err) {
+       ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
+      }
+    }
+  }
+
+  if (!op.size())
+    return 0;
+
+  bufferlist bl;
+  RGWRados::Bucket bop(this, bucket_info);
+  RGWRados::Bucket::UpdateIndex index_op(&bop, obj->get_obj());
+
+  if (state) {
+    string tag;
+    append_rand_alpha(cct, tag, tag, 32);
+    state->write_tag = tag;
+    r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
+
+    if (r < 0)
+      return r;
+
+    bl.append(tag.c_str(), tag.size() + 1);
+    op.setxattr(RGW_ATTR_ID_TAG,  bl);
+  }
+
+
+  real_time mtime = real_clock::now();
+  struct timespec mtime_ts = real_clock::to_timespec(mtime);
+  op.mtime2(&mtime_ts);
+  auto& ioctx = ref.pool.ioctx();
+  r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
+  if (state) {
+    if (r >= 0) {
+      // re-publish the index entry with the (possibly updated) metadata
+      bufferlist acl_bl = attrs[RGW_ATTR_ACL];
+      bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
+      bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
+      string etag = rgw_bl_str(etag_bl);
+      string content_type = rgw_bl_str(content_type_bl);
+      string storage_class;
+      auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
+      if (iter != attrs.end()) {
+        storage_class = rgw_bl_str(iter->second);
+      }
+      uint64_t epoch = ioctx.get_last_version();
+      int64_t poolid = ioctx.get_id();
+      r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size,
+                            mtime, etag, content_type, storage_class, &acl_bl,
+                            RGWObjCategory::Main, NULL);
+    } else {
+      int ret = index_op.cancel(dpp, nullptr);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
+      }
+    }
+  }
+  if (r < 0)
+    return r;
+
+  if (state) {
+    // mirror the successful write into the cached state
+    state->obj_tag.swap(bl);
+    if (rmattrs) {
+      for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
+        state->attrset.erase(iter->first);
+      }
+    }
+
+    for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+      state->attrset[iter->first] = iter->second;
+    }
+
+    auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
+    if (iter != state->attrset.end()) {
+      iter->second = state->obj_tag;
+    }
+  }
+
+  return 0;
+}
+
+// Prepare a read: load the object's state, resolve the head rados object
+// and its IoCtx, export attrs/size/mtime to the caller's out-params, and
+// evaluate the HTTP conditional-request parameters:
+//  - If-Modified-Since / If-Unmodified-Since (etag conditions take
+//    precedence per RFC 7232, hence the !if_nomatch / !if_match guards),
+//  - If-Match / If-None-Match against the stored etag.
+// Returns -ENOENT, -ERR_NOT_MODIFIED or -ERR_PRECONDITION_FAILED as
+// appropriate.
+int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp)
+{
+  RGWRados *store = source->get_store();
+  CephContext *cct = store->ctx();
+
+  bufferlist etag;
+
+  map<string, bufferlist>::iterator iter;
+
+  RGWObjState *astate;
+  RGWObjManifest *manifest = nullptr;
+  int r = source->get_state(dpp, &astate, &manifest, true, y);
+  if (r < 0)
+    return r;
+
+  if (!astate->exists) {
+    return -ENOENT;
+  }
+
+  const RGWBucketInfo& bucket_info = source->get_bucket_info();
+
+  state.obj = astate->obj;
+  store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
+
+  state.cur_pool = state.head_obj.pool;
+  state.cur_ioctx = &state.io_ctxs[state.cur_pool];
+
+  r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx);
+  if (r < 0) {
+    return r;
+  }
+  if (params.target_obj) {
+    *params.target_obj = state.obj;
+  }
+  if (params.attrs) {
+    *params.attrs = astate->attrset;
+    if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+      for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
+        ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
+      }
+    }
+  }
+
+  /* Convert all times to GMT to make them compatible */
+  if (conds.mod_ptr || conds.unmod_ptr) {
+    obj_time_weight src_weight;
+    src_weight.init(astate);
+    src_weight.high_precision = conds.high_precision_time;
+
+    obj_time_weight dest_weight;
+    dest_weight.high_precision = conds.high_precision_time;
+
+    if (conds.mod_ptr && !conds.if_nomatch) {
+      dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
+      ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
+      if (!(dest_weight < src_weight)) {
+        return -ERR_NOT_MODIFIED;
+      }
+    }
+
+    if (conds.unmod_ptr && !conds.if_match) {
+      dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
+      ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
+      if (dest_weight < src_weight) {
+        return -ERR_PRECONDITION_FAILED;
+      }
+    }
+  }
+  if (conds.if_match || conds.if_nomatch) {
+    r = get_attr(dpp, RGW_ATTR_ETAG, etag, y);
+    if (r < 0)
+      return r;
+
+    if (conds.if_match) {
+      string if_match_str = rgw_string_unquote(conds.if_match);
+      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
+      if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
+        return -ERR_PRECONDITION_FAILED;
+      }
+    }
+
+    if (conds.if_nomatch) {
+      string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
+      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
+      if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
+        return -ERR_NOT_MODIFIED;
+      }
+    }
+  }
+
+  if (params.obj_size)
+    *params.obj_size = astate->size;
+  if (params.lastmod)
+    *params.lastmod = astate->mtime;
+
+  return 0;
+}
+
+int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
+{
+  if (ofs < 0) {
+    ofs += obj_size;
+    if (ofs < 0)
+      ofs = 0;
+    end = obj_size - 1;
+  } else if (end < 0) {
+    end = obj_size - 1;
+  }
+
+  if (obj_size > 0) {
+    if (ofs >= (off_t)obj_size) {
+      return -ERANGE;
+    }
+    if (end >= (off_t)obj_size) {
+      end = obj_size - 1;
+    }
+  }
+  return 0;
+}
+
+// Run a bucket-index callback against the object's bucket shard, retrying
+// around concurrent resharding: when the cls op reports
+// -ERR_BUSY_RESHARDING we block until resharding finishes, drop the
+// cached shard handle (the shard layout changed) and retry with the
+// attempt counter reset. Any other callback error breaks out of the loop
+// and is returned. On success the shard is optionally handed back via
+// *pbs.
+int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call)
+{
+  RGWRados *store = target->get_store();
+  BucketShard *bs = nullptr;
+  int r;
+
+#define NUM_RESHARD_RETRIES 10
+  for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
+    int ret = get_bucket_shard(&bs, dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to get BucketShard object. obj=" << 
+        obj_instance.key << ". ret=" << ret << dendl;
+      return ret;
+    }
+
+    r = call(bs);
+    if (r != -ERR_BUSY_RESHARDING) {
+      break;
+    }
+
+    ldpp_dout(dpp, 10) <<
+      "NOTICE: resharding operation on bucket index detected, blocking. obj=" << 
+      obj_instance.key << dendl;
+
+    r = store->block_while_resharding(bs, obj_instance, target->bucket_info, null_yield, dpp);
+    if (r == -ERR_BUSY_RESHARDING) {
+      ldpp_dout(dpp, 10) << __func__ <<
+       " NOTICE: block_while_resharding() still busy. obj=" <<
+        obj_instance.key << dendl;
+      continue;
+    } else if (r < 0) {
+      ldpp_dout(dpp, 0) << __func__ <<
+       " ERROR: block_while_resharding() failed. obj=" <<
+        obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    ldpp_dout(dpp, 20) << "reshard completion identified. obj=" << obj_instance.key << dendl;
+    i = 0; /* resharding is finished, make sure we can retry */
+    invalidate_bs();
+  } // for loop
+
+  // r is either the callback's terminal error, or -ERR_BUSY_RESHARDING if
+  // all retries were exhausted
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" << 
+      obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  if (pbs) {
+    *pbs = bs;
+  }
+
+  return 0;
+}
+
+// Record a "prepare" entry in the bucket index for an upcoming object
+// write/delete.  A unique operation tag is generated (unless the caller
+// supplies one via write_tag) so the later complete()/complete_del()/cancel()
+// can be matched with this prepare.
+int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y)
+{
+  if (blind) {
+    return 0; // "blind" mode: bucket index updates are skipped entirely
+  }
+  RGWRados *store = target->get_store();
+
+  if (write_tag && write_tag->length()) {
+    optag = string(write_tag->c_str(), write_tag->length());
+  } else {
+    if (optag.empty()) {
+      // no caller-provided tag; generate a random one (reused on retry)
+      append_rand_alpha(store->ctx(), optag, optag, 32);
+    }
+  }
+
+  // retried around concurrent resharding by guard_reshard()
+  int r = guard_reshard(dpp, obj, nullptr, [&](BucketShard *bs) -> int {
+                                  return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace);
+                                });
+
+  if (r < 0) {
+    return r;
+  }
+  prepared = true;
+
+  return 0;
+}
+
+// Commit a previously prepare()d index operation after a successful object
+// write: build the bucket dir entry metadata, complete the op on the bucket
+// index shard, and record the change in the data log (used by multisite
+// sync to follow this shard).
+int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch,
+                                            uint64_t size, uint64_t accounted_size,
+                                            ceph::real_time& ut, const string& etag,
+                                            const string& content_type, const string& storage_class,
+                                            bufferlist *acl_bl,
+                                            RGWObjCategory category,
+                                            list<rgw_obj_index_key> *remove_objs, const string *user_data,
+                                            bool appendable)
+{
+  if (blind) {
+    return 0; // "blind" mode: bucket index updates are skipped entirely
+  }
+  RGWRados *store = target->get_store();
+  BucketShard *bs = nullptr;
+
+  int ret = get_bucket_shard(&bs, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
+    return ret;
+  }
+
+  rgw_bucket_dir_entry ent;
+  obj.key.get_index_key(&ent.key);
+  ent.meta.size = size;
+  ent.meta.accounted_size = accounted_size;
+  ent.meta.mtime = ut;
+  ent.meta.etag = etag;
+  ent.meta.storage_class = storage_class;
+  if (user_data)
+    ent.meta.user_data = *user_data;
+
+  ACLOwner owner;
+  if (acl_bl && acl_bl->length()) {
+    // NOTE(review): this inner `ret` shadows the outer one; a decode failure
+    // is only logged and `owner` keeps its default-constructed value.
+    int ret = store->decode_policy(dpp, *acl_bl, &owner);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl;
+    }
+  }
+  ent.meta.owner = owner.get_id().to_str();
+  ent.meta.owner_display_name = owner.get_display_name();
+  ent.meta.content_type = content_type;
+  ent.meta.appendable = appendable;
+
+  ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
+
+  // datalog entry is written even if the index completion failed, so peers
+  // following this shard's log keep their markers moving
+  add_datalog_entry(dpp, store->svc.datalog_rados,
+                    target->bucket_info, bs->shard_id);
+
+  return ret;
+}
+
+// Commit a previously prepare()d delete: mark the op complete on the bucket
+// index shard and record the change in the data log for multisite sync.
+int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp,
+                                                int64_t poolid, uint64_t epoch,
+                                                real_time& removed_mtime,
+                                                list<rgw_obj_index_key> *remove_objs)
+{
+  if (blind) {
+    return 0; // "blind" mode: bucket index updates are skipped entirely
+  }
+  RGWRados *store = target->get_store();
+  BucketShard *bs = nullptr;
+
+  int ret = get_bucket_shard(&bs, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
+    return ret;
+  }
+
+  ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
+
+  // datalog entry is written even if the index completion failed, so peers
+  // following this shard's log keep their markers moving
+  add_datalog_entry(dpp, store->svc.datalog_rados,
+                    target->bucket_info, bs->shard_id);
+
+  return ret;
+}
+
+
+int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp,
+                                          list<rgw_obj_index_key> *remove_objs)
+{
+  if (blind) {
+    return 0;
+  }
+  RGWRados *store = target->get_store();
+  BucketShard *bs;
+
+  int ret = guard_reshard(dpp, obj, &bs, [&](BucketShard *bs) -> int {
+                                return store->cls_obj_complete_cancel(*bs, optag, obj, remove_objs, bilog_flags, zones_trace);
+                              });
+
+  /*
+   * need to update data log anyhow, so that whoever follows needs to update its internal markers
+   * for following the specific bucket shard log. Otherwise they end up staying behind, and users
+   * have no way to tell that they're all caught up
+   */
+  add_datalog_entry(dpp, store->svc.datalog_rados,
+                    target->bucket_info, bs->shard_id);
+
+  return ret;
+}
+
+/*
+ * Read up through index `end` inclusive. Number of bytes read is up
+ * to `end - ofs + 1`.
+ */
+int RGWRados::Object::Read::read(int64_t ofs, int64_t end,
+                                bufferlist& bl, optional_yield y,
+                                const DoutPrefixProvider *dpp)
+{
+  RGWRados *store = source->get_store();
+
+  rgw_raw_obj read_obj;
+  uint64_t read_ofs = ofs;
+  uint64_t len, read_len;
+  bool reading_from_head = true;
+  ObjectReadOperation op;
+
+  bool merge_bl = false;
+  bufferlist *pbl = &bl;
+  bufferlist read_bl;
+  uint64_t max_chunk_size;
+
+  RGWObjState *astate;
+  RGWObjManifest *manifest = nullptr;
+  int r = source->get_state(dpp, &astate, &manifest, true, y);
+  if (r < 0)
+    return r;
+
+  // clamp the requested range to the object's actual size
+  if (astate->size == 0) {
+    end = 0;
+  } else if (end >= (int64_t)astate->size) {
+    end = astate->size - 1;
+  }
+
+  if (end < 0)
+    len = 0;
+  else
+    len = end - ofs + 1;
+
+  if (manifest && manifest->has_tail()) {
+    /* now get the relevant object part */
+    RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
+
+    uint64_t stripe_ofs = iter.get_stripe_ofs();
+    read_obj = iter.get_location().get_raw_obj(store->driver);
+    // don't read past the end of the current stripe in this single rados op
+    len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
+    read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
+    reading_from_head = (read_obj == state.head_obj);
+  } else {
+    read_obj = state.head_obj;
+  }
+
+  r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
+    return r;
+  }
+
+  if (len > max_chunk_size)
+    len = max_chunk_size;
+
+
+  read_len = len;
+
+  if (reading_from_head) {
+    /* only when reading from the head object do we need to do the atomic test */
+    std::unique_ptr<rgw::sal::Object> obj = source->bucket->get_object(state.obj.key);
+    r = store->append_atomic_test(dpp, source->get_bucket_info(), obj.get(), op, &astate, &manifest, y);
+    if (r < 0)
+      return r;
+
+    if (astate && astate->prefetch_data) {
+      // the whole request may already be satisfied by prefetched head data
+      if (!ofs && astate->data.length() >= len) {
+        bl = astate->data;
+        return bl.length();
+      }
+
+      if (ofs < astate->data.length()) {
+        // partial overlap with prefetched data: copy what we already have
+        // and read the remainder into read_bl, appended below (merge_bl)
+        unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
+        astate->data.begin(ofs).copy(copy_len, bl);
+        read_len -= copy_len;
+        read_ofs += copy_len;
+        if (!read_len)
+         return bl.length();
+
+        merge_bl = true;
+        pbl = &read_bl;
+      }
+    }
+  }
+
+  ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
+  op.read(read_ofs, read_len, pbl, NULL);
+
+  // (re)use an IoCtx for the target pool, caching it in state.io_ctxs
+  if (state.cur_pool != read_obj.pool) {
+    auto iter = state.io_ctxs.find(read_obj.pool);
+    if (iter == state.io_ctxs.end()) {
+      state.cur_ioctx = &state.io_ctxs[read_obj.pool];
+      r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false);
+      if (r < 0) {
+        ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
+        return r;
+      }
+    } else {
+      state.cur_ioctx = &iter->second;
+    }
+    state.cur_pool = read_obj.pool;
+  }
+
+  state.cur_ioctx->locator_set_key(read_obj.loc);
+
+  r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
+  ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
+
+  if (r < 0) {
+    return r;
+  }
+
+  if (merge_bl) {
+    // append the freshly-read tail after the prefetched prefix copied above
+    bl.append(read_bl);
+  }
+
+  return bl.length();
+}
+
+int get_obj_data::flush(rgw::AioResultList&& results) {
+  int r = rgw::check_for_errors(results);
+  if (r < 0) {
+    return r;
+  }
+  std::list<bufferlist> bl_list;
+
+  auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
+  results.sort(cmp); // merge() requires results to be sorted first
+  completed.merge(results, cmp); // merge results in sorted order
+
+  while (!completed.empty() && completed.front().id == offset) {
+    auto bl = std::move(completed.front().data);
+
+    bl_list.push_back(bl);
+    offset += bl.length();
+    int r = client_cb->handle_data(bl, 0, bl.length());
+    if (r < 0) {
+      return r;
+    }
+
+    if (rgwrados->get_use_datacache()) {
+      const std::lock_guard l(d3n_get_data.d3n_lock);
+      auto oid = completed.front().obj.get_ref().obj.oid;
+      if (bl.length() <= g_conf()->rgw_get_obj_max_req_size && !d3n_bypass_cache_write) {
+        lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl.length() << dendl;
+        rgwrados->d3n_data_cache->put(bl, bl.length(), oid);
+      } else {
+        lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write << dendl;
+      }
+    }
+    completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
+  }
+  return 0;
+}
+
+static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+                               const rgw_raw_obj& read_obj, off_t obj_ofs,
+                               off_t read_ofs, off_t len, bool is_head_obj,
+                               RGWObjState *astate, void *arg)
+{
+  struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
+  return d->rgwrados->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len,
+                                      is_head_obj, astate, arg);
+}
+
+// Per-chunk callback used by iterate_obj(): serves any prefix available from
+// prefetched head data synchronously, then issues a throttled async rados
+// read for the remainder and flushes whatever reads completed meanwhile.
+int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+                                 const rgw_raw_obj& read_obj, off_t obj_ofs,
+                                 off_t read_ofs, off_t len, bool is_head_obj,
+                                 RGWObjState *astate, void *arg)
+{
+  ObjectReadOperation op;
+  struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
+  string oid, key;
+
+  if (is_head_obj) {
+    /* only when reading from the head object do we need to do the atomic test */
+    int r = append_atomic_test(dpp, astate, op);
+    if (r < 0)
+      return r;
+
+    if (astate &&
+        obj_ofs < astate->data.length()) {
+      // hand the overlap with already-prefetched head data to the client now
+      unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
+
+      r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
+      if (r < 0)
+        return r;
+
+      len -= chunk_len;
+      d->offset += chunk_len;
+      read_ofs += chunk_len;
+      obj_ofs += chunk_len;
+      if (!len)
+         return 0;
+    }
+  }
+
+  auto obj = d->rgwrados->svc.rados->obj(read_obj);
+  int r = obj.open(dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl;
+    return r;
+  }
+
+  ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
+  op.read(read_ofs, len, nullptr, nullptr);
+
+  const uint64_t cost = len;
+  const uint64_t id = obj_ofs; // use logical object offset for sorting replies
+
+  // throttled async submit; returns any reads that completed in the meantime
+  auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+
+  // deliver whatever is now contiguous from the current offset
+  return d->flush(std::move(completed));
+}
+
+// Stream the byte range [ofs, end] of the source object to cb, using a
+// window of throttled async rados reads (window and per-request chunk sizes
+// come from configuration).
+int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb,
+                                    optional_yield y)
+{
+  RGWRados *store = source->get_store();
+  CephContext *cct = store->ctx();
+  const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
+  const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
+
+  auto aio = rgw::make_throttle(window_size, y);
+  get_obj_data data(store, cb, &*aio, ofs, y);
+
+  int r = store->iterate_obj(dpp, source->get_ctx(), source->get_bucket_info(),
+                            source->get_target(),
+                             ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
+    data.cancel(); // drain completions without writing back to client
+    return r;
+  }
+
+  // wait for the remaining in-flight reads and deliver them in order
+  return data.drain();
+}
+
+// Walk an object's stripes (via its manifest) or just its head object over
+// the range [ofs, end], invoking cb for each chunk of at most max_chunk_size
+// bytes.  Stops early and propagates the first error cb returns.
+int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
+                          RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+                          off_t ofs, off_t end, uint64_t max_chunk_size,
+                          iterate_obj_cb cb, void *arg, optional_yield y)
+{
+  rgw_raw_obj head_obj;
+  rgw_raw_obj read_obj;
+  uint64_t read_ofs = ofs;
+  uint64_t len;
+  bool reading_from_head = true;
+  RGWObjState *astate = NULL;
+  RGWObjManifest *manifest = nullptr;
+
+  obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &head_obj);
+
+  int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, &manifest, false, y);
+  if (r < 0) {
+    return r;
+  }
+
+  if (end < 0)
+    len = 0;
+  else
+    len = end - ofs + 1;
+
+  if (manifest) {
+    /* now get the relevant object stripe */
+    RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
+
+    RGWObjManifest::obj_iterator obj_end = manifest->obj_end(dpp);
+
+    // outer loop advances stripe by stripe; the inner loop splits a stripe
+    // into chunks of at most max_chunk_size
+    for (; iter != obj_end && ofs <= end; ++iter) {
+      off_t stripe_ofs = iter.get_stripe_ofs();
+      off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
+
+      while (ofs < next_stripe_ofs && ofs <= end) {
+        read_obj = iter.get_location().get_raw_obj(driver);
+        uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
+        read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
+
+        if (read_len > max_chunk_size) {
+          read_len = max_chunk_size;
+        }
+
+        reading_from_head = (read_obj == head_obj);
+        r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
+       if (r < 0) {
+         return r;
+        }
+
+       len -= read_len;
+        ofs += read_len;
+      }
+    }
+  } else {
+    // no manifest: all data lives in the head object
+    while (ofs <= end) {
+      read_obj = head_obj;
+      uint64_t read_len = std::min(len, max_chunk_size);
+
+      r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
+      if (r < 0) {
+       return r;
+      }
+
+      len -= read_len;
+      ofs += read_len;
+    }
+  }
+
+  return 0;
+}
+
+// Resolve the object's head rados location and synchronously apply the
+// given write operation to it.
+int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
+{
+  rgw_rados_ref ref;
+  const int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+  return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield);
+}
+
+// Resolve the object's head rados location and synchronously apply the
+// given read operation to it (output buffer is discarded).
+int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
+{
+  rgw_rados_ref ref;
+  const int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  bufferlist outbl;
+  return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
+}
+
+// Stage an OLH (object logical head) modification: ensure the OLH object
+// exists with obj/olh id tags, and record a pending-operation xattr whose
+// name is prefixed with the current time epoch so pending entries sort
+// chronologically.  The generated pending tag is returned via op_tag.
+int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
+{
+  ObjectWriteOperation op;
+
+  // an OLH object is never a specific instance
+  ceph_assert(olh_obj.key.instance.empty());
+
+  bool has_tag = (state.exists && has_olh_tag(state.attrset));
+
+  if (!state.exists) {
+    op.create(true); // exclusive create: a racing creator gets -EEXIST
+  } else {
+    op.assert_exists();
+    struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
+    op.mtime2(&mtime_ts);
+  }
+
+  /*
+   * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
+   * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
+   * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
+   * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
+   * log will reflect that.
+   *
+   * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
+   * is used for object data instance, olh_tag for olh instance.
+   */
+  if (has_tag) {
+    /* guard against racing writes */
+    bucket_index_guard_olh_op(dpp, state, op);
+  }
+
+  if (!has_tag) {
+    /* obj tag */
+    string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
+
+    bufferlist bl;
+    bl.append(obj_tag.c_str(), obj_tag.size());
+    op.setxattr(RGW_ATTR_ID_TAG, bl);
+
+    state.attrset[RGW_ATTR_ID_TAG] = bl;
+    state.obj_tag = bl;
+
+    /* olh tag */
+    string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
+
+    bufferlist olh_bl;
+    olh_bl.append(olh_tag.c_str(), olh_tag.size());
+    op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
+
+    state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
+    state.olh_tag = olh_bl;
+    state.is_olh = true;
+
+    // version attr starts out empty
+    bufferlist verbl;
+    op.setxattr(RGW_ATTR_OLH_VER, verbl);
+  }
+
+  bufferlist bl;
+  RGWOLHPendingInfo pending_info;
+  pending_info.time = real_clock::now();
+  encode(pending_info, bl);
+
+#define OLH_PENDING_TAG_LEN 32
+  /* tag will start with current time epoch, this so that entries are sorted by time */
+  char buf[32];
+  utime_t ut(pending_info.time);
+  snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
+  *op_tag = buf;
+
+  // pad the tag out to OLH_PENDING_TAG_LEN with random characters
+  string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
+
+  op_tag->append(s);
+
+  string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
+  attr_name.append(*op_tag);
+
+  op.setxattr(attr_name.c_str(), bl);
+
+  int ret = obj_operate(dpp, bucket_info, olh_obj, &op);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // reflect the write in the cached state
+  state.exists = true;
+  state.attrset[attr_name] = bl;
+
+  return 0;
+}
+
+// Thin wrapper around olh_init_modification_impl().  An -EEXIST from the
+// exclusive create means a racing writer created the OLH object first; map
+// it to -ECANCELED so the caller knows to retry on top of the new state.
+int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
+{
+  const int ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag);
+  return (ret == -EEXIST) ? -ECANCELED : ret;
+}
+
+// Variant of guard_reshard for callers that manage their own BucketShard:
+// (re)initialize bs for the object, run `call`, and retry across reshards;
+// block_while_resharding() refreshes bucket_info (and bs) before each retry.
+int RGWRados::guard_reshard(const DoutPrefixProvider *dpp,
+                            BucketShard *bs,
+                           const rgw_obj& obj_instance,
+                           RGWBucketInfo& bucket_info,
+                           std::function<int(BucketShard *)> call)
+{
+  // NOTE(review): local `obj` appears unused in this function
+  rgw_obj obj;
+  const rgw_obj *pobj = &obj_instance;
+  int r;
+
+  for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
+    // re-init each iteration so we pick up a post-reshard layout
+    r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl;
+      return r;
+    }
+
+    r = call(bs);
+    if (r != -ERR_BUSY_RESHARDING) {
+      break;
+    }
+
+    ldpp_dout(dpp, 10) <<
+      "NOTICE: resharding operation on bucket index detected, blocking. obj=" <<
+      obj_instance.key << dendl;
+
+    r = block_while_resharding(bs, obj_instance, bucket_info, null_yield, dpp);
+    if (r == -ERR_BUSY_RESHARDING) {
+      ldpp_dout(dpp, 10) << __func__ <<
+       " NOTICE: block_while_resharding() still busy. obj=" <<
+        obj_instance.key << dendl;
+      continue;
+    } else if (r < 0) {
+      ldpp_dout(dpp, 0) << __func__ <<
+       " ERROR: block_while_resharding() failed. obj=" <<
+        obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    ldpp_dout(dpp, 20) << "reshard completion identified" << dendl;
+    i = 0; /* resharding is finished, make sure we can retry */
+  } // for loop
+
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" << 
+      obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+
+int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
+                                     const rgw_obj& obj_instance,
+                                     RGWBucketInfo& bucket_info,
+                                     optional_yield y,
+                                     const DoutPrefixProvider *dpp)
+{
+  int ret = 0;
+  cls_rgw_bucket_instance_entry entry;
+
+  // gets loaded by fetch_new_bucket_info; can be used by
+  // clear_resharding
+  std::map<std::string, bufferlist> bucket_attrs;
+
+  // since we want to run this recovery code from two distinct places,
+  // let's just put it in a lambda so we can easily re-use; if the
+  // lambda successfully fetches a new bucket id, it sets
+  // new_bucket_id and returns 0, otherwise it returns a negative
+  // error code
+  auto fetch_new_bucket_info =
+    [this, bs, &obj_instance, &bucket_info, &bucket_attrs, &y, dpp](const std::string& log_tag) -> int {
+    int ret = get_bucket_info(&svc, bs->bucket.tenant, bs->bucket.name,
+                             bucket_info, nullptr, y, dpp, &bucket_attrs);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << __func__ <<
+       " ERROR: failed to refresh bucket info after reshard at " <<
+       log_tag << ": " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    ret = bs->init(dpp, bucket_info, obj_instance);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << __func__ <<
+       " ERROR: failed to refresh bucket shard generation after reshard at " <<
+       log_tag << ": " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    const auto gen = bucket_info.layout.logs.empty() ? -1 : bucket_info.layout.logs.back().gen;
+    ldpp_dout(dpp, 20) << __func__ <<
+      " INFO: refreshed bucket info after reshard at " <<
+      log_tag << ". new shard_id=" << bs->shard_id << ". gen=" << gen << dendl;
+
+    return 0;
+  }; // lambda fetch_new_bucket_info
+
+  constexpr int num_retries = 10;
+  for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
+    auto& ref = bs->bucket_obj.get_ref();
+    ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
+    if (ret == -ENOENT) {
+      ret = fetch_new_bucket_info("get_bucket_resharding_failed");
+      if (ret < 0) {
+       ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+         " failed to refresh bucket info after reshard when get bucket "
+         "resharding failed, error: " << cpp_strerror(-ret) << dendl;
+       return ret;
+      }
+    } else if (ret < 0) {
+      ldpp_dout(dpp, 0) << __func__ <<
+       " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
+       dendl;
+      return ret;
+    }
+
+    if (!entry.resharding_in_progress()) {
+      ret = fetch_new_bucket_info("get_bucket_resharding_succeeded");
+      if (ret < 0) {
+       ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+         " failed to refresh bucket info after reshard when get bucket "
+         "resharding succeeded, error: " << cpp_strerror(-ret) << dendl;
+       return ret;
+      }
+    }
+
+    ldpp_dout(dpp, 20) << __func__ << " NOTICE: reshard still in progress; " <<
+      (i < num_retries ? "retrying" : "too many retries") << dendl;
+
+    if (i == num_retries) {
+      break;
+    }
+
+    // If bucket is erroneously marked as resharding (e.g., crash or
+    // other error) then fix it. If we can take the bucket reshard
+    // lock then it means no other resharding should be taking place,
+    // and we're free to clear the flags.
+    {
+      // since we expect to do this rarely, we'll do our work in a
+      // block and erase our work after each try
+
+      RGWObjectCtx obj_ctx(this->driver);
+      const rgw_bucket& b = bs->bucket;
+      std::string bucket_id = b.get_key();
+      RGWBucketReshardLock reshard_lock(this->driver, bucket_info, true);
+      ret = reshard_lock.lock(dpp);
+      if (ret == -ENOENT) {
+       continue;
+      } else if (ret < 0) {
+       ldpp_dout(dpp, 20) << __func__ <<
+         " ERROR: failed to take reshard lock for bucket " <<
+         bucket_id << "; expected if resharding underway" << dendl;
+      } else {
+       ldpp_dout(dpp, 10) << __func__ <<
+         " INFO: was able to take reshard lock for bucket " <<
+         bucket_id << dendl;
+        // the reshard may have finished, so call clear_resharding()
+        // with its current bucket info; ALSO this will load
+        // bucket_attrs for call to clear_resharding below
+        ret = fetch_new_bucket_info("trying_to_clear_resharding");
+        if (ret < 0) {
+         reshard_lock.unlock();
+         ldpp_dout(dpp, 0) << __func__ <<
+           " ERROR: failed to update bucket info before clear resharding for bucket " <<
+           bucket_id << dendl;
+          continue; // try again
+        }
+
+       ret = RGWBucketReshard::clear_resharding(this->driver, bucket_info, bucket_attrs, dpp);
+       reshard_lock.unlock();
+       if (ret == -ENOENT) {
+         ldpp_dout(dpp, 5) << __func__ <<
+           " INFO: no need to reset reshard flags; old shards apparently"
+           " removed after successful resharding of bucket " <<
+           bucket_id << dendl;
+         continue; // immediately test again
+       } else if (ret < 0) {
+         ldpp_dout(dpp, 0) << __func__ <<
+           " ERROR: failed to clear resharding flags for bucket " <<
+           bucket_id << ", " << cpp_strerror(-ret) << dendl;
+         // wait and then test again
+       } else {
+         ldpp_dout(dpp, 5) << __func__ <<
+           " INFO: apparently successfully cleared resharding flags for "
+           "bucket " << bucket_id << dendl;
+         continue; // if we apparently succeed immediately test again
+       } // if clear resharding succeeded
+      } // if taking of lock succeeded
+    } // block to encapsulate recovery from incomplete reshard
+
+    ret = reshard_wait->wait(y);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << __func__ <<
+       " ERROR: bucket is still resharding, please retry" << dendl;
+      return ret;
+    }
+  } // for loop
+
+  ldpp_dout(dpp, 0) << __func__ <<
+    " ERROR: bucket is still resharding, please retry" << dendl;
+  return -ERR_BUSY_RESHARDING;
+}
+
+// Link an object instance (or a delete marker) into the bucket index OLH
+// entry.  The index write is guarded against concurrent resharding and, on
+// success, recorded in the data log for multisite sync.
+int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+                                    RGWObjState& olh_state, const rgw_obj& obj_instance,
+                                    bool delete_marker, const string& op_tag,
+                                    struct rgw_bucket_dir_entry_meta *meta,
+                                    uint64_t olh_epoch,
+                                    real_time unmod_since, bool high_precision_time,
+                                    rgw_zone_set *_zones_trace, bool log_data_change)
+{
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  rgw_zone_set zones_trace;
+  if (_zones_trace) {
+    zones_trace = *_zones_trace;
+  }
+  // record our own zone so peers don't sync this change back to us
+  zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
+
+  BucketShard bs(this);
+
+  r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+                   [&](BucketShard *bs) -> int {
+                     cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
+                     auto& ref = bs->bucket_obj.get_ref();
+                     librados::ObjectWriteOperation op;
+                     op.assert_exists(); // bucket index shard must exist
+                     cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+                     cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
+                                              delete_marker, op_tag, meta, olh_epoch,
+                                             unmod_since, high_precision_time,
+                                             svc.zone->get_zone().log_data, zones_trace);
+                      return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+                    });
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
+    return r;
+  }
+
+  add_datalog_entry(dpp, svc.datalog_rados, bucket_info, bs.shard_id);
+
+  return 0;
+}
+
+// Guard `op` against concurrent OLH rewrites: the operation only proceeds
+// when the on-disk OLH id tag still equals the one cached in olh_state.
+void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op)
+{
+  const auto& tag = olh_state.olh_tag;
+  ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(tag.c_str(), tag.length()) << dendl;
+  op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
+}
+
+int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
+                                           RGWBucketInfo& bucket_info,
+                                           const rgw_obj& obj_instance,
+                                           const string& op_tag, const string& olh_tag,
+                                           uint64_t olh_epoch, rgw_zone_set *_zones_trace)
+{
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  rgw_zone_set zones_trace;
+  if (_zones_trace) {
+    zones_trace = *_zones_trace;
+  }
+  zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
+
+  BucketShard bs(this);
+
+  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
+  r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+                   [&](BucketShard *bs) -> int {
+                     auto& ref = bs->bucket_obj.get_ref();
+                     librados::ObjectWriteOperation op;
+                     op.assert_exists(); // bucket index shard must exist
+                     cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+                     cls_rgw_bucket_unlink_instance(op, key, op_tag,
+                                                    olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
+                      return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+                    });
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Read the OLH pending-operation log entries with version > ver_marker from
+// the bucket index shard that owns the object.  The read is guarded by the
+// cached olh_tag so a concurrently-replaced OLH is detected by the cls op.
+int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
+                                        RGWBucketInfo& bucket_info, RGWObjState& state,
+                                        const rgw_obj& obj_instance, uint64_t ver_marker,
+                                        std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log,
+                                        bool *is_truncated)
+{
+  // NOTE(review): `ref` is not used below; get_obj_head_ref() is presumably
+  // kept for its resolution/validation side effects -- confirm.
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  BucketShard bs(this);
+  int ret =
+    bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
+
+  // OLH entries are keyed by name only, with no instance
+  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
+
+  auto& shard_ref = bs.bucket_obj.get_ref();
+  ObjectReadOperation op;
+
+  // op_ret receives the cls-side result; log_ret the decoded payload
+  rgw_cls_read_olh_log_ret log_ret;
+  int op_ret = 0;
+  cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret); 
+  bufferlist outbl;
+  r =  rgw_rados_operate(dpp, shard_ref.pool.ioctx(), shard_ref.obj.oid, &op, &outbl, null_yield);
+  if (r < 0) {
+    return r;
+  }
+  if (op_ret < 0) {
+    ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned op_ret=" << op_ret << dendl;
+    return op_ret;
+  }
+
+  *log = std::move(log_ret.log);
+  *is_truncated = log_ret.is_truncated;
+
+  return 0;
+}
+
+// a multisite sync bug resulted in the OLH head attributes being overwritten by
+// the attributes from another zone, causing link_olh() to fail endlessly due to
+// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
+// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
+//
+// On success the head object's RGW_ATTR_OLH_ID_TAG and RGW_ATTR_OLH_INFO
+// xattrs are rewritten from the authoritative bucket-index olh entry; the
+// write is guarded by the currently-known (stale) olh tag so a concurrent
+// fix loses the race cleanly (-ECANCELED from the guard).
+int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
+                         const rgw_obj& obj)
+{
+  // fetch the current olh entry from the bucket index
+  rgw_bucket_olh_entry olh;
+  int r = bi_get_olh(dpp, bucket_info, obj, &olh);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
+    return r;
+  }
+  if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
+    return 0;
+  }
+
+  ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag
+      << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
+
+  // rewrite OLH_ID_TAG and OLH_INFO from current olh
+  ObjectWriteOperation op;
+  // assert this is the same olh tag we think we're fixing
+  bucket_index_guard_olh_op(dpp, *state, op);
+  // preserve existing mtime
+  struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
+  op.mtime2(&mtime_ts);
+  {
+    // replace the id tag with the one from the bucket index
+    bufferlist bl;
+    bl.append(olh.tag.c_str(), olh.tag.size());
+    op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
+  }
+  {
+    // rebuild the olh info (current target + delete-marker flag)
+    RGWOLHInfo info;
+    info.target = rgw_obj(bucket_info.bucket, olh.key);
+    info.removed = olh.delete_marker;
+    bufferlist bl;
+    encode(info, bl);
+    op.setxattr(RGW_ATTR_OLH_INFO, bl);
+  }
+  rgw_rados_ref ref;
+  r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with "
+        << cpp_strerror(r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
+int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp,
+                                        RGWBucketInfo& bucket_info,
+                                        RGWObjState& state,
+                                        const rgw_obj& obj_instance, uint64_t ver)
+{
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  BucketShard bs(this);
+  int ret =
+    bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
+
+  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
+
+  ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+                     [&](BucketShard *pbs) -> int {
+                       ObjectWriteOperation op;
+                       op.assert_exists(); // bucket index shard must exist
+                       cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+                       cls_rgw_trim_olh_log(op, key, ver, olh_tag);
+                        return pbs->bucket_obj.operate(dpp, &op, null_yield);
+                      });
+  if (ret < 0) {
+    ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp,
+                                     RGWBucketInfo& bucket_info,
+                                     RGWObjState& state,
+                                     const rgw_obj& obj_instance)
+{
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  BucketShard bs(this);
+
+  string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
+
+  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
+
+  int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+                         [&](BucketShard *pbs) -> int {
+                           ObjectWriteOperation op;
+                           op.assert_exists(); // bucket index shard must exist
+                           auto& ref = pbs->bucket_obj.get_ref();
+                           cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+                           cls_rgw_clear_olh(op, key, olh_tag);
+                            return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+                          });
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Decode an RGWOLHInfo from its xattr-encoded bufferlist form.
+// Returns 0 on success, -EIO if the buffer cannot be decoded.
+// (cct is unused here but kept for the existing call sites.)
+static int decode_olh_info(const DoutPrefixProvider *dpp, CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
+{
+  auto p = bl.cbegin();
+  try {
+    decode(*olh, p);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode olh info" << dendl;
+    return -EIO;
+  }
+  return 0;
+}
+
+// Apply a batch of OLH log entries (as read by bucket_index_read_olh_log())
+// to the olh head object and the bucket index:
+//  - replays LINK/UNLINK/REMOVE_INSTANCE ops in epoch order to decide the
+//    final target (or removal) of the olh,
+//  - updates the head object's OLH xattrs in a single guarded write,
+//  - deletes superseded object instances,
+//  - trims the consumed log entries, and, if the olh ended up unlinked,
+//    removes the head object and clears the index olh entry.
+// *plast_ver is set to the highest epoch consumed so the caller can page.
+int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp,
+                           RGWObjState& state,
+                           RGWBucketInfo& bucket_info,
+                           const rgw::sal::Object* obj,
+                           bufferlist& olh_tag,
+                           std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
+                           uint64_t *plast_ver,
+                           rgw_zone_set* zones_trace)
+{
+  if (log.empty()) {
+    return 0;
+  }
+
+  librados::ObjectWriteOperation op;
+
+  uint64_t last_ver = log.rbegin()->first;
+  *plast_ver = last_ver;
+
+  map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
+
+  // guard: fail with -ECANCELED if the olh tag changed, or if a newer
+  // version was already applied (VER must still be >= what we read)
+  op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
+  op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
+
+  // record the highest epoch we are applying
+  bufferlist ver_bl;
+  string last_ver_s = to_string(last_ver);
+  ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
+  op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
+
+  // preserve the head object's mtime across the xattr update
+  struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
+  op.mtime2(&mtime_ts);
+
+  bool need_to_link = false;
+  uint64_t link_epoch = 0;
+  cls_rgw_obj_key key;
+  bool delete_marker = false;
+  list<cls_rgw_obj_key> remove_instances;
+  bool need_to_remove = false;
+
+  // decode current epoch and instance
+  auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
+  if (olh_ver != state.attrset.end()) {
+    std::string str = olh_ver->second.to_str();
+    std::string err;
+    link_epoch = strict_strtoll(str.c_str(), 10, &err);
+  }
+  auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
+  if (olh_info != state.attrset.end()) {
+    RGWOLHInfo info;
+    int r = decode_olh_info(dpp, cct, olh_info->second, &info);
+    if (r < 0) {
+      return r;
+    }
+    info.target.key.get_index_key(&key);
+    delete_marker = info.removed;
+  }
+
+  // replay the log entries in epoch order; the last LINK/UNLINK wins,
+  // REMOVE_INSTANCE entries accumulate into remove_instances
+  for (iter = log.begin(); iter != log.end(); ++iter) {
+    vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
+    for (; viter != iter->second.end(); ++viter) {
+      rgw_bucket_olh_log_entry& entry = *viter;
+
+      ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
+                     << " key=" << entry.key.name << "[" << entry.key.instance << "] "
+                     << (entry.delete_marker ? "(delete)" : "") << dendl;
+      switch (entry.op) {
+      case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
+        remove_instances.push_back(entry.key);
+        break;
+      case CLS_RGW_OLH_OP_LINK_OLH:
+        // only overwrite a link of the same epoch if its key sorts before
+        if (link_epoch < iter->first || key.instance.empty() ||
+            key.instance > entry.key.instance) {
+          ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
+              << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
+          need_to_link = true;
+          need_to_remove = false;
+          key = entry.key;
+          delete_marker = entry.delete_marker;
+        } else {
+          ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
+              << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
+        }
+        break;
+      case CLS_RGW_OLH_OP_UNLINK_OLH:
+        need_to_remove = true;
+        need_to_link = false;
+        break;
+      default:
+        ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
+        return -EIO;
+      }
+      // each consumed entry also clears its pending-op xattr on the head
+      string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
+      attr_name.append(entry.op_tag);
+      op.rmxattr(attr_name.c_str());
+    }
+  }
+
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  rgw::sal::Bucket* bucket = obj->get_bucket();
+
+  if (need_to_link) {
+    // point the olh info at the winning instance
+    rgw_obj target(bucket->get_key(), key);
+    RGWOLHInfo info;
+    info.target = target;
+    info.removed = delete_marker;
+    bufferlist bl;
+    encode(info, bl);
+    op.setxattr(RGW_ATTR_OLH_INFO, bl);
+  }
+
+  /* first remove object instances */
+  for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
+       liter != remove_instances.end(); ++liter) {
+    cls_rgw_obj_key& key = *liter;  // note: shadows the olh target 'key' above
+    std::unique_ptr<rgw::sal::Object> obj_instance = bucket->get_object(key);
+    int ret = delete_obj(dpp, bucket_info, obj_instance.get(), 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
+    if (ret < 0 && ret != -ENOENT) {  // already-gone instances are fine
+      ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
+      return ret;
+    }
+  }
+
+  /* update olh object */
+  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
+    return r;
+  }
+
+  r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj->get_obj(), last_ver);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
+    return r;
+  }
+
+  if (need_to_remove) {
+    // the olh ended up unlinked: remove the head object, guarded on the same
+    // tag/version and on there being no newly-pending operations
+    ObjectWriteOperation rm_op;
+
+    rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
+    rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
+    cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
+    rm_op.remove();
+
+    r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
+    if (r == -ECANCELED) {
+      return 0; /* someone else won this race */
+    } else {
+      // NOTE(review): errors other than -ECANCELED also reach this branch and
+      // proceed to clear the index olh entry despite the comment below —
+      // confirm whether a failed remove should really clear the index
+      /* 
+       * only clear if was successful, otherwise we might clobber pending operations on this object
+       */
+      r = bucket_index_clear_olh(dpp, bucket_info, state, obj->get_obj());
+      if (r < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
+        return r;
+      }
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * read olh log and apply it
+ */
+int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, rgw_zone_set *zones_trace)
+{
+  map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
+  bool is_truncated;
+  uint64_t ver_marker = 0;
+
+  do {
+    int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj->get_obj(), ver_marker, &log, &is_truncated);
+    if (ret < 0) {
+      return ret;
+    }
+    ret = apply_olh_log(dpp, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
+    if (ret < 0) {
+      return ret;
+    }
+  } while (is_truncated);
+
+  return 0;
+}
+
+// Set/update the OLH (object logical head) for 'target_obj': register a
+// pending modification on the olh head, link the instance (or delete marker)
+// in the bucket index, then apply the resulting olh log. -ECANCELED from any
+// step means we raced with a concurrent writer; the whole sequence is retried
+// (with freshly-read object state) up to MAX_ECANCELED_RETRY times.
+int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
+                     RGWBucketInfo& bucket_info,
+                     rgw::sal::Object* target_obj, bool delete_marker,
+                     rgw_bucket_dir_entry_meta *meta,
+                      uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
+                      optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
+{
+  string op_tag;
+
+  // the olh object is the same key with the instance stripped
+  std::unique_ptr<rgw::sal::Object> olh_obj = target_obj->clone();
+  olh_obj->clear_instance();
+
+  RGWObjState *state = NULL;
+  RGWObjManifest *manifest = nullptr;
+
+  int ret = 0;
+  int i;
+
+#define MAX_ECANCELED_RETRY 100
+  for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
+    if (ret == -ECANCELED) {
+      // drop cached state so the retry sees the competing update
+      olh_obj->invalidate();
+    }
+
+    ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj.get(), &state, &manifest, false, y); /* don't follow olh */
+    if (ret < 0) {
+      return ret;
+    }
+
+    ret = olh_init_modification(dpp, bucket_info, *state, olh_obj->get_obj(), &op_tag);
+    if (ret < 0) {
+      ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
+      if (ret == -ECANCELED) {
+        continue;
+      }
+      return ret;
+    }
+    ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj->get_obj(),
+                               delete_marker, op_tag, meta, olh_epoch, unmod_since,
+                               high_precision_time, zones_trace, log_data_change);
+    if (ret < 0) {
+      ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
+      if (ret == -ECANCELED) {
+        // the bucket index rejected the link_olh() due to olh tag mismatch;
+        // attempt to reconstruct olh head attributes based on the bucket index
+        int r2 = repair_olh(dpp, state, bucket_info, olh_obj->get_obj());
+        if (r2 < 0 && r2 != -ECANCELED) {
+          return r2;
+        }
+        continue;
+      }
+      return ret;
+    }
+    break;
+  }
+
+  if (i == MAX_ECANCELED_RETRY) {
+    ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
+    return -EIO;
+  }
+
+  ret = update_olh(dpp, state, bucket_info, olh_obj.get());
+  if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
+    ret = 0;
+  }
+  if (ret < 0) {
+    ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Unlink a specific object instance from its OLH: register a pending
+// modification on the olh head, remove the instance from the bucket index,
+// then apply the resulting olh log. Like set_olh(), -ECANCELED races retry
+// the whole sequence up to MAX_ECANCELED_RETRY times.
+int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj,
+                                  uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
+{
+  string op_tag;
+
+  // the olh object is the same key with the instance stripped
+  std::unique_ptr<rgw::sal::Object> olh_obj = target_obj->clone();
+  olh_obj->clear_instance();
+
+  RGWObjState *state = NULL;
+
+  int ret = 0;
+  int i;
+
+  for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
+    if (ret == -ECANCELED) {
+      // drop cached state so the retry sees the competing update
+      olh_obj->invalidate();
+    }
+
+    ret = olh_obj->get_obj_state(dpp, &state, y, false); /* don't follow olh */
+    if (ret < 0)
+      return ret;
+
+    ret = olh_init_modification(dpp, bucket_info, *state, olh_obj->get_obj(), &op_tag);
+    if (ret < 0) {
+      ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
+      if (ret == -ECANCELED) {
+        continue;
+      }
+      return ret;
+    }
+
+    string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
+
+    ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj->get_obj(), op_tag, olh_tag, olh_epoch, zones_trace);
+    if (ret < 0) {
+      ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
+      if (ret == -ECANCELED) {
+        continue;
+      }
+      return ret;
+    }
+    break;
+  }
+
+  if (i == MAX_ECANCELED_RETRY) {
+    ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
+    return -EIO;
+  }
+
+  ret = update_olh(dpp, state, bucket_info, olh_obj.get(), zones_trace);
+  if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
+    return 0;
+  }
+  if (ret < 0) {
+    ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Generate a random 32-char alphanumeric instance id for a versioned key.
+void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
+{
+#define OBJ_INSTANCE_LEN 32
+  char instance[OBJ_INSTANCE_LEN + 1];
+
+  // no url-escapable characters, and no underscore, because of the way raw
+  // index keys are encoded
+  gen_rand_alphanumeric_no_underscore(cct, instance, OBJ_INSTANCE_LEN);
+
+  target_key->set_instance(instance);
+}
+
+// Convenience overload: generate the random instance id directly into an
+// rgw_obj's key.
+void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
+{
+  gen_rand_obj_instance_name(&target_obj->key);
+}
+
+int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
+{
+  map<string, bufferlist> attrset;
+
+  ObjectReadOperation op;
+  op.getxattrs(&attrset, NULL);
+
+  int r = obj_operate(dpp, bucket_info, obj, &op);
+  if (r < 0) {
+    return r;
+  }
+
+  auto iter = attrset.find(RGW_ATTR_OLH_INFO);
+  if (iter == attrset.end()) { /* not an olh */
+    return -EINVAL;
+  }
+
+  return decode_olh_info(dpp, cct, iter->second, olh);
+}
+
+// Scan decoded pending-olh xattr entries and move the ones older than
+// rgw_olh_pending_timeout_sec into *rm_pending_entries (erasing them from
+// pending_entries). Entry names sort by time, so the scan stops at the first
+// entry that is still within the timeout; undecodable entries are skipped
+// (not removed, in case they hide a bug).
+void RGWRados::check_pending_olh_entries(const DoutPrefixProvider *dpp,
+                                        map<string, bufferlist>& pending_entries,
+                                         map<string, bufferlist> *rm_pending_entries)
+{
+  map<string, bufferlist>::iterator iter = pending_entries.begin();
+
+  real_time now = real_clock::now();
+
+  while (iter != pending_entries.end()) {
+    auto biter = iter->second.cbegin();
+    RGWOLHPendingInfo pending_info;
+    try {
+      decode(pending_info, biter);
+    } catch (buffer::error& err) {
+      /* skipping bad entry, we could remove it but it might hide a bug */
+      ldpp_dout(dpp, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
+      ++iter;
+      continue;
+    }
+
+    // advance before a possible erase so the iterator stays valid
+    map<string, bufferlist>::iterator cur_iter = iter;
+    ++iter;
+    if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
+      (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
+      pending_entries.erase(cur_iter);
+    } else {
+      /* entries names are sorted by time (rounded to a second) */
+      break;
+    }
+  }
+}
+
+int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
+{
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  // trim no more than 1000 entries per osd op
+  constexpr int max_entries = 1000;
+
+  auto i = pending_attrs.begin();
+  while (i != pending_attrs.end()) {
+    ObjectWriteOperation op;
+    bucket_index_guard_olh_op(dpp, state, op);
+
+    for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
+      op.rmxattr(i->first.c_str());
+    }
+
+    r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+    if (r == -ENOENT || r == -ECANCELED) {
+      /* raced with some other change, shouldn't sweat about it */
+      return 0;
+    }
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
+      return r;
+    }
+  }
+  return 0;
+}
+
+// Resolve an olh object to its current target instance (*target). First
+// housekeeps pending entries found in the head's xattrs: expired ones are
+// removed, and any remaining pending entries force an update_olh() pass so
+// the target reflects completed operations. Returns -EINVAL if the object is
+// not an olh, -ENOENT if the olh was removed (or its tag changed underneath
+// us, see below).
+int RGWRados::follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target)
+{
+  map<string, bufferlist> pending_entries;
+  rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
+
+  map<string, bufferlist> rm_pending_entries;
+  check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries);
+
+  if (!rm_pending_entries.empty()) {
+    int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj->get_obj(), rm_pending_entries);
+    if (ret < 0) {
+      ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
+      return ret;
+    }
+  }
+  if (!pending_entries.empty()) {
+    ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj->get_bucket() << dendl;
+
+    int ret = update_olh(dpp, state, bucket_info, olh_obj);
+    if (ret < 0) {
+      if (ret == -ECANCELED) {
+        // In this context, ECANCELED means that the OLH tag changed in either the bucket index entry or the OLH object.
+        // If the OLH tag changed, it indicates that a previous OLH entry was removed since this request started. We
+        // return ENOENT to indicate that the OLH object was removed.
+        ret = -ENOENT;
+      }
+      return ret;
+    }
+  }
+
+  auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
+  if (iter == state->attrset.end()) {
+    return -EINVAL;
+  }
+
+  RGWOLHInfo olh;
+  int ret = decode_olh_info(dpp, cct, iter->second, &olh);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // a delete marker on top means the object logically does not exist
+  if (olh.removed) {
+    return -ENOENT;
+  }
+
+  *target = olh.target;
+
+  return 0;
+}
+
+// Stat a raw rados object in a single compound read op. All output pointers
+// are optional; only the requested pieces (size/mtime, xattrs, first chunk,
+// objv check) are added to the op. Note that *epoch is filled from the
+// ioctx's last version even when the op itself failed.
+int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
+                           rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
+                           map<string, bufferlist> *attrs, bufferlist *first_chunk,
+                           RGWObjVersionTracker *objv_tracker, optional_yield y)
+{
+  rgw_rados_ref ref;
+  int r = get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  map<string, bufferlist> unfiltered_attrset;
+  uint64_t size = 0;
+  struct timespec mtime_ts;
+
+  ObjectReadOperation op;
+  if (objv_tracker) {
+    objv_tracker->prepare_op_for_read(&op);
+  }
+  if (attrs) {
+    op.getxattrs(&unfiltered_attrset, NULL);
+  }
+  if (psize || pmtime) {
+    op.stat2(&size, &mtime_ts, NULL);
+  }
+  if (first_chunk) {
+    op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
+  }
+  bufferlist outbl;
+  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, y);
+
+  // intentionally before the error check: callers may want the epoch anyway
+  if (epoch) {
+    *epoch = ref.pool.ioctx().get_last_version();
+  }
+
+  if (r < 0)
+    return r;
+
+  if (psize)
+    *psize = size;
+  if (pmtime)
+    *pmtime = ceph::real_clock::from_timespec(mtime_ts);
+  if (attrs) {
+    // expose only rgw-namespaced xattrs
+    rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
+  }
+
+  return 0;
+}
+
+// Aggregate per-category storage stats across the bucket's index shards
+// (or a single shard when shard_id >= 0). bucket_ver/master_ver are encoded
+// per-shard version strings; max_marker is either the single shard's marker
+// or an encoded per-shard marker set. syncstopped (optional) reports the
+// last shard header's flag.
+int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp,
+                              RGWBucketInfo& bucket_info,
+                              const rgw::bucket_index_layout_generation& idx_layout,
+                              int shard_id, string *bucket_ver, string *master_ver,
+                              map<RGWObjCategory, RGWStorageStats>& stats,
+                              string *max_marker, bool *syncstopped)
+{
+  vector<rgw_bucket_dir_header> headers;
+  map<int, string> bucket_instance_ids;
+  int r = cls_bucket_head(dpp, bucket_info, idx_layout, shard_id, headers, &bucket_instance_ids);
+  if (r < 0) {
+    return r;
+  }
+
+  // headers and instance ids come back in matching shard order
+  ceph_assert(headers.size() == bucket_instance_ids.size());
+
+  auto iter = headers.begin();
+  map<int, string>::iterator viter = bucket_instance_ids.begin();
+  BucketIndexShardsManager ver_mgr;
+  BucketIndexShardsManager master_ver_mgr;
+  BucketIndexShardsManager marker_mgr;
+  char buf[64];
+  for(; iter != headers.end(); ++iter, ++viter) {
+    accumulate_raw_stats(*iter, stats);
+    snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
+    ver_mgr.add(viter->first, string(buf));
+    snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
+    master_ver_mgr.add(viter->first, string(buf));
+    if (shard_id >= 0) {
+      // single-shard query: report that shard's marker directly
+      *max_marker = iter->max_marker;
+    } else {
+      marker_mgr.add(viter->first, iter->max_marker);
+    }
+    if (syncstopped != NULL)
+      *syncstopped = iter->syncstopped;
+  }
+  ver_mgr.to_string(bucket_ver);
+  master_ver_mgr.to_string(master_ver);
+  if (shard_id < 0) {
+    marker_mgr.to_string(max_marker);
+  }
+  return 0;
+}
+
+// Aggregates async per-shard dir-header responses for get_bucket_stats_async().
+// Created with the number of expected responses ('pendings'); when the last
+// one arrives, the user callback is fired with either the accumulated stats
+// or the first error seen, and the callback's reference is dropped.
+// handle_response() may be called from multiple completion threads, hence
+// the mutex. unset_cb() suppresses any further callback delivery (used when
+// dispatching the requests failed partway).
+class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
+  RGWGetBucketStats_CB *cb;    // user callback; we hold one reference
+  uint32_t pendings;           // responses still outstanding
+  map<RGWObjCategory, RGWStorageStats> stats;
+  int ret_code;                // first error seen, 0 if none
+  bool should_cb;              // false once unset_cb() was called
+  ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
+
+public:
+  RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
+    : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
+  {}
+
+  void handle_response(int r, rgw_bucket_dir_header& header) override {
+    std::lock_guard l{lock};
+    if (should_cb) {
+      if ( r >= 0) {
+        accumulate_raw_stats(header, stats);
+      } else {
+        ret_code = r;
+      }
+
+      // Are we all done?
+      if (--pendings == 0) {
+        if (!ret_code) {
+          cb->set_response(&stats);
+        }
+        cb->handle_response(ret_code);
+        cb->put();
+      }
+    }
+  }
+
+  void unset_cb() {
+    std::lock_guard l{lock};
+    should_cb = false;
+  }
+};
+
+// Kick off async stat collection across the bucket's index shards; 'ctx'
+// is invoked (see RGWGetBucketStatsContext) once all shard headers arrive.
+// NOTE(review): the pending count uses current_index num_shards (with the
+// GNU "?:" defaulting 0 shards to 1) while the requests are issued against
+// 'idx_layout' — confirm these always agree. The ceph_assert after new is
+// redundant (operator new throws on failure).
+int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *ctx)
+{
+  int num_aio = 0;
+  RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1);
+  ceph_assert(get_ctx);
+  int r = cls_bucket_head_async(dpp, bucket_info, idx_layout, shard_id, get_ctx, &num_aio);
+  if (r < 0) {
+    // dispatch failed: drop the caller's ref ourselves and, if some shard
+    // requests were already in flight, stop them from firing the callback
+    ctx->put();
+    if (num_aio) {
+      get_ctx->unset_cb();
+    }
+  }
+  // drop our own ref; in-flight responses keep the context alive
+  get_ctx->put();
+  return r;
+}
+
+int RGWRados::get_bucket_instance_info(const string& meta_key,
+                                      RGWBucketInfo& info,
+                                       real_time *pmtime,
+                                      map<string, bufferlist> *pattrs,
+                                      optional_yield y,
+                                       const DoutPrefixProvider *dpp)
+{
+  rgw_bucket bucket;
+  rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
+
+  return get_bucket_instance_info(bucket, info, pmtime, pattrs, y, dpp);
+}
+
+int RGWRados::get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info,
+                                       real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y,
+                                       const DoutPrefixProvider *dpp)
+{
+  return ctl.bucket->read_bucket_instance_info(bucket, &info,
+                                              y,
+                                               dpp,
+                                              RGWBucketCtl::BucketInstance::GetParams()
+                                              .set_mtime(pmtime)
+                                              .set_attrs(pattrs));
+}
+
+int RGWRados::get_bucket_info(RGWServices *svc,
+                              const string& tenant, const string& bucket_name,
+                              RGWBucketInfo& info,
+                              real_time *pmtime,
+                              optional_yield y,
+                              const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs)
+{
+  rgw_bucket bucket;
+  bucket.tenant = tenant;
+  bucket.name = bucket_name;
+  return ctl.bucket->read_bucket_info(bucket, &info, y, dpp,
+                                     RGWBucketCtl::BucketInstance::GetParams()
+                                     .set_mtime(pmtime)
+                                     .set_attrs(pattrs));
+}
+
+int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
+                                      ceph::real_time *pmtime,
+                                      const DoutPrefixProvider *dpp,
+                                      map<string, bufferlist> *pattrs)
+{
+  rgw_bucket bucket = info.bucket;
+  bucket.bucket_id.clear();
+
+  auto rv = info.objv_tracker.read_version;
+
+  return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp,
+                                     RGWBucketCtl::BucketInstance::GetParams()
+                                     .set_mtime(pmtime)
+                                     .set_attrs(pattrs)
+                                     .set_refresh_version(rv));
+}
+
+int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
+                              real_time mtime, map<string, bufferlist> *pattrs,
+                              const DoutPrefixProvider *dpp)
+{
+  return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield, dpp,
+                                               RGWBucketCtl::BucketInstance::PutParams()
+                                               .set_exclusive(exclusive)
+                                               .set_mtime(mtime)
+                                               .set_attrs(pattrs));
+}
+
int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
                                     map<string, bufferlist> *pattrs, bool create_entry_point,
                                     const DoutPrefixProvider *dpp)
{
  // Write the bucket instance record and, when required, the bucket
  // entrypoint record that links the bucket name to this instance.
  // The entrypoint is (re)written when the instance object did not
  // exist yet, or when the caller explicitly asks for it.
  bool create_head = !info.has_instance_obj || create_entry_point;

  int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp);
  if (ret < 0) {
    return ret;
  }

  if (!create_head)
    return 0; /* done! */

  // build an entrypoint that mirrors the instance's identity
  RGWBucketEntryPoint entry_point;
  entry_point.bucket = info.bucket;
  entry_point.owner = info.owner;
  entry_point.creation_time = info.creation_time;
  entry_point.linked = true;
  RGWObjVersionTracker ot;
  if (pep_objv && !pep_objv->tag.empty()) {
    // caller supplied an explicit version to write the entrypoint under
    ot.write_version = *pep_objv;
  } else {
    // otherwise generate a fresh write version and, if the caller gave
    // us an out-param, report the generated version back
    ot.generate_new_write_ver(cct);
    if (pep_objv) {
      *pep_objv = ot.write_version;
    }
  }
  ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, dpp, RGWBucketCtl::Bucket::PutParams()
                                                                         .set_exclusive(exclusive)
                                                                         .set_objv_tracker(&ot)
                                                                         .set_mtime(mtime));
  if (ret < 0)
    return ret;

  return 0;
}
+
+int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp)
+{
+  map<string, RGWBucketEnt>::iterator iter;
+  for (iter = m.begin(); iter != m.end(); ++iter) {
+    RGWBucketEnt& ent = iter->second;
+    rgw_bucket& bucket = ent.bucket;
+    ent.count = 0;
+    ent.size = 0;
+    ent.size_rounded = 0;
+
+    vector<rgw_bucket_dir_header> headers;
+
+    RGWBucketInfo bucket_info;
+    int ret = get_bucket_instance_info(bucket, bucket_info, NULL, NULL, null_yield, dpp);
+    if (ret < 0) {
+      return ret;
+    }
+
+    int r = cls_bucket_head(dpp, bucket_info, bucket_info.layout.current_index, RGW_NO_SHARD, headers);
+    if (r < 0)
+      return r;
+
+    auto hiter = headers.begin();
+    for (; hiter != headers.end(); ++hiter) {
+      RGWObjCategory category = main_category;
+      auto iter = (hiter->stats).find(category);
+      if (iter != hiter->stats.end()) {
+        struct rgw_bucket_category_stats& stats = iter->second;
+        ent.count += stats.num_entries;
+        ent.size += stats.total_size;
+        ent.size_rounded += stats.total_size_rounded;
+      }
+    }
+
+    // fill in placement_rule from the bucket instance for use in swift's
+    // per-storage policy statistics
+    ent.placement_rule = std::move(bucket_info.placement_rule);
+  }
+
+  return m.size();
+}
+
+int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl)
+{
+  rgw_rados_ref ref;
+  int r = get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+  librados::Rados *rad = get_rados_handle();
+  librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
+
+  r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
+  completion->release();
+  return r;
+}
+
+int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx)
+{
+  librados::IoCtx& io_ctx = ctx.io_ctx;
+  librados::NObjectIterator& iter = ctx.iter;
+
+  int r = open_pool_ctx(dpp, pool, io_ctx, false);
+  if (r < 0)
+    return r;
+
+  iter = io_ctx.nobjects_begin();
+
+  return 0;
+}
+
+int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
+{
+  librados::IoCtx& io_ctx = ctx.io_ctx;
+  librados::NObjectIterator& iter = ctx.iter;
+
+  int r = open_pool_ctx(dpp, pool, io_ctx, false);
+  if (r < 0)
+    return r;
+
+  librados::ObjectCursor oc;
+  if (!oc.from_str(cursor)) {
+    ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl;
+    return -EINVAL;
+  }
+
+  try {
+    iter = io_ctx.nobjects_begin(oc);
+    return 0;
+  } catch (const std::system_error& e) {
+    r = -e.code().value();
+    ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+       << ", returning " << r << dendl;
+    return r;
+  } catch (const std::exception& e) {
+    ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+       << ", returning -5" << dendl;
+    return -EIO;
+  }
+}
+
+string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
+{
+  return ctx.iter.get_cursor().to_str();
+}
+
// Pull up to `num` object names from the pool iterator in ctx into objs,
// applying the optional name filter.  Returns the number of entries in
// objs (note: callers are expected to pass objs in empty), or -ENOENT
// once the iterator is exhausted.  May propagate exceptions from
// NObjectIterator::operator++(); pool_iterate() wraps this to catch them.
static int do_pool_iterate(const DoutPrefixProvider *dpp, CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
                           vector<rgw_bucket_dir_entry>& objs,
                           bool *is_truncated, RGWAccessListFilter *filter)
{
  librados::IoCtx& io_ctx = ctx.io_ctx;
  librados::NObjectIterator& iter = ctx.iter;

  if (iter == io_ctx.nobjects_end())
    return -ENOENT;

  uint32_t i;

  for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
    rgw_bucket_dir_entry e;

    string oid = iter->get_oid();
    ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;

    // fill it in with initial values; we may correct later
    // the oid serves as both the name and the key presented to the filter
    if (filter && !filter->filter(oid, oid))
      continue;

    e.key = oid;
    objs.push_back(e);
  }

  if (is_truncated)
    *is_truncated = (iter != io_ctx.nobjects_end());

  return objs.size();
}
+
// Exception-safe wrapper around do_pool_iterate(): librados iterator
// advancement can throw, and this converts any exception into a plain
// negative error code for callers.
int RGWRados::pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
                           bool *is_truncated, RGWAccessListFilter *filter)
{
  // catch exceptions from NObjectIterator::operator++()
  try {
    return do_pool_iterate(dpp, cct, ctx, num, objs, is_truncated, filter);
  } catch (const std::system_error& e) {
    // preserve the underlying errno from the system_error
    int r = -e.code().value();
    ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
       << ", returning " << r << dendl;
    return r;
  } catch (const std::exception& e) {
    // no errno available; fall back to -EIO (-5)
    ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
       << ", returning -5" << dendl;
    return -EIO;
  }
}
+
+int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
+{
+  if (!ctx->initialized) {
+    int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx);
+    if (r < 0) {
+      ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
+      return r;
+    }
+    ctx->initialized = true;
+  }
+  return 0;
+}
+
+int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max,
+                                    RGWListRawObjsCtx& ctx, list<string>& oids,
+                                    bool *is_truncated)
+{
+  if (!ctx.initialized) {
+    return -EINVAL;
+  }
+  RGWAccessListFilterPrefix filter(prefix_filter);
+  vector<rgw_bucket_dir_entry> objs;
+  int r = pool_iterate(dpp, ctx.iter_ctx, max, objs, is_truncated, &filter);
+  if (r < 0) {
+    if(r != -ENOENT)
+      ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
+    return r;
+  }
+
+  vector<rgw_bucket_dir_entry>::iterator iter;
+  for (iter = objs.begin(); iter != objs.end(); ++iter) {
+    oids.push_back(iter->key.name);
+  }
+
+  return oids.size();
+}
+
+int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter,
+                              int max, RGWListRawObjsCtx& ctx, list<string>& oids,
+                              bool *is_truncated)
+{
+  if (!ctx.initialized) {
+    int r = list_raw_objects_init(dpp, pool, string(), &ctx);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated);
+}
+
// Return the current position of a raw-object listing as a resumable
// cursor string.
string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
{
  return pool_iterate_get_cursor(ctx.iter_ctx);
}
+
+int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+                              rgw_bucket_dir_entry *dirent)
+{
+  rgw_cls_bi_entry bi_entry;
+  int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry);
+  if (r < 0 && r != -ENOENT) {
+    ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
+  }
+  if (r < 0) {
+    return r;
+  }
+  auto iter = bi_entry.data.cbegin();
+  try {
+    decode(*dirent, iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+                         rgw_bucket_olh_entry *olh)
+{
+  rgw_cls_bi_entry bi_entry;
+  int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry);
+  if (r < 0 && r != -ENOENT) {
+    ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
+  }
+  if (r < 0) {
+    return r;
+  }
+  auto iter = bi_entry.data.cbegin();
+  try {
+    decode(*olh, iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+                     BIIndexType index_type, rgw_cls_bi_entry *entry)
+{
+  BucketShard bs(this);
+  int ret = bs.init(dpp, bucket_info, obj);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
+
+  auto& ref = bs.bucket_obj.get_ref();
+  
+  return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
+}
+
// Append a bucket-index put for `entry` to an existing write op; the
// caller is responsible for submitting `op` against the shard object.
void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
{
  auto& ref = bs.bucket_obj.get_ref();
  cls_rgw_bi_put(op, ref.obj.oid, entry);
}
+
+int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
+{
+  auto& ref = bs.bucket_obj.get_ref();
+  int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
// Write a raw bucket-index entry, resolving the target shard from the
// bucket and object key.  May mutate obj.index_hash_source for
// multipart-namespace objects so shard selection matches the entry's
// eventual home object.
int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
{
  // make sure incomplete multipart uploads are hashed correctly
  if (obj.key.ns == RGW_OBJ_NS_MULTIPART) {
    RGWMPObj mp;
    mp.from_meta(obj.key.name);
    obj.index_hash_source = mp.get_key();
  }
  BucketShard bs(this);

  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
  if (ret < 0) {
    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
    return ret;
  }

  return bi_put(bs, entry);
}
+
+int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket,
+                     const string& obj_name_filter, const string& marker, uint32_t max,
+                     list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+  rgw_obj obj(bucket, obj_name_filter);
+  BucketShard bs(this);
+  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  auto& ref = bs.bucket_obj.get_ref();
+  ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
+  if (ret == -ENOENT) {
+    *is_truncated = false;
+  }
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+int RGWRados::bi_list(BucketShard& bs, const string& obj_name_filter, const string& marker, uint32_t max,
+                     list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+  auto& ref = bs.bucket_obj.get_ref();
+  int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+int RGWRados::bi_list(const DoutPrefixProvider *dpp,
+                     const RGWBucketInfo& bucket_info, int shard_id, const string& obj_name_filter, const string& marker, uint32_t max,
+                     list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+  BucketShard bs(this);
+  int ret = bs.init(dpp, bucket_info,
+                   bucket_info.layout.current_index,
+                   shard_id);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  return bi_list(bs, obj_name_filter, marker, max, entries, is_truncated);
+}
+
+int RGWRados::bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs)
+{
+  auto& ref = bs.bucket_obj.get_ref();
+  int ret = ref.pool.ioctx().remove(ref.obj.oid);
+  if (ret == -ENOENT) {
+    ret = 0;
+  }
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
// Synchronously execute a write operation against an object in the
// garbage-collection pool.
int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op)
{
  return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield);
}
+
// Asynchronously execute a write operation against an object in the
// garbage-collection pool; the caller owns the completion.
int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
                             librados::ObjectWriteOperation *op)
{
  return gc_pool_ctx.aio_operate(oid, c, op);
}
+
// Synchronously execute a read operation against an object in the
// garbage-collection pool, returning any output data in *pbl.
int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
{
  return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield);
}
+
// Forward to the GC subsystem's listing of pending garbage-collection
// entries; *index/marker track the iteration position across calls.
int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
{
  return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
}
+
// Run a garbage-collection pass; when expired_only is set, only entries
// whose grace period has elapsed are processed.
int RGWRados::process_gc(bool expired_only)
{
  return gc->process(expired_only);
}
+
// Forward to the lifecycle subsystem's progress listing, starting from
// marker and reading up to max_entries; index tracks the LC shard.
int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
                              vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
                              int& index)
{
  return lc->list_lc_progress(marker, max_entries, progress_map, index);
}
+
// Run a single one-shot lifecycle pass using a temporary RGWLC instance
// and worker (not the long-running LC processor).  When optional_bucket
// is set, processing is limited to that bucket.
int RGWRados::process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket)
{
  RGWLC lc;
  lc.initialize(cct, this->driver);
  // worker index 0; the worker holds the lc instance both as processor
  // and as its dout prefix provider
  RGWLC::LCWorker worker(&lc, cct, &lc, 0);
  auto ret = lc.process(&worker, optional_bucket, true /* once */);
  lc.stop_processor(); // sets down_flag, but returns immediately
  return ret;
}
+
// Run the object expirer over all of its shards, processing entries due
// up to the current time.
bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp)
{
  return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now());
}
+
// Phase one of the two-phase bucket-index update protocol: record a
// pending operation (keyed by tag) on obj's index shard.  A matching
// cls_obj_complete_* call finalizes or cancels it later.
int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag,
                                 rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
{
  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs << " obj=" << obj << " tag=" << tag << " op=" << op << dendl_bitx;
  ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;

  // copy the caller's trace (if any) and add the local zone so sync
  // doesn't loop this change back to us
  rgw_zone_set zones_trace;
  if (_zones_trace) {
    zones_trace = *_zones_trace;
  }
  zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());

  ObjectWriteOperation o;
  o.assert_exists(); // bucket index shard must exist

  cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
  // fail fast with -ERR_BUSY_RESHARDING if the bucket is mid-reshard
  cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
  cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
  int ret = bs.bucket_obj.operate(dpp, &o, y);
  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
  return ret;
}
+
// Phase two of the two-phase bucket-index update protocol: finalize the
// pending operation identified by tag on obj's index shard, submitted
// asynchronously.  The completion callback machinery lives in
// index_completion_manager; arg must not be touched after aio_operate()
// is issued.
int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
                                  int64_t pool, uint64_t epoch,
                                  rgw_bucket_dir_entry& ent, RGWObjCategory category,
                                 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
{
  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
  ldout_bitx_c(bitx, cct, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs <<
    " obj=" << obj << " tag=" << tag << " op=" << op <<
    ", remove_objs=" << (remove_objs ? *remove_objs : std::list<rgw_obj_index_key>()) << dendl_bitx;
  ldout_bitx_c(bitx, cct, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;

  ObjectWriteOperation o;
  o.assert_exists(); // bucket index shard must exist

  // the dir entry metadata to record, re-categorized per the caller
  rgw_bucket_dir_entry_meta dir_meta;
  dir_meta = ent.meta;
  dir_meta.category = category;

  // copy the caller's trace (if any) and add the local zone so sync
  // doesn't loop this change back to us
  rgw_zone_set zones_trace;
  if (_zones_trace) {
    zones_trace = *_zones_trace;
  }
  zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());

  // the data-pool/epoch version of the head object being indexed
  rgw_bucket_entry_ver ver;
  ver.pool = pool;
  ver.epoch = epoch;
  cls_rgw_obj_key key(ent.key.name, ent.key.instance);
  // fail fast with -ERR_BUSY_RESHARDING if the bucket is mid-reshard
  cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
  cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
                             svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
  complete_op_data *arg;
  index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
                                              svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
  librados::AioCompletion *completion = arg->rados_completion;
  int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
  completion->release(); /* can't reference arg here, as it might have already been released */

  ldout_bitx_c(bitx, cct, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
  return ret;
}
+
// Complete a pending "add object" bucket-index transaction (see
// cls_obj_prepare_op), recording ent under the given category.
int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
                                   int64_t pool, uint64_t epoch,
                                   rgw_bucket_dir_entry& ent, RGWObjCategory category,
                                   list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
{
  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
}
+
// Complete a pending "delete object" bucket-index transaction.  The
// removed object's mtime is carried in a synthesized dir entry so the
// deletion time is recorded in the index/bilog.
int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
                                   int64_t pool, uint64_t epoch,
                                   rgw_obj& obj,
                                   real_time& removed_mtime,
                                   list<rgw_obj_index_key> *remove_objs,
                                   uint16_t bilog_flags,
                                   rgw_zone_set *zones_trace)
{
  rgw_bucket_dir_entry ent;
  ent.meta.mtime = removed_mtime;
  obj.key.get_index_key(&ent.key);
  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
                            ent, RGWObjCategory::None, remove_objs,
                            bilog_flags, zones_trace);
}
+
// Cancel a pending bucket-index transaction identified by tag, clearing
// the prepared-op state without applying a change.  pool/epoch are not
// meaningful for a cancel, hence the -1/0 placeholders.
int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj,
                                      list<rgw_obj_index_key> *remove_objs,
                                      uint16_t bilog_flags, rgw_zone_set *zones_trace)
{
  rgw_bucket_dir_entry ent;
  obj.key.get_index_key(&ent.key);
  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
                            -1 /* pool id */, 0, ent,
                            RGWObjCategory::None, remove_objs, bilog_flags,
                            zones_trace);
}
+
+int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout)
+{
+  RGWSI_RADOS::Pool index_pool;
+  map<int, string> bucket_objs;
+  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+  if (r < 0)
+    return r;
+
+  return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
+}
+
+
+// returns 0 if there is an error in calculation
+uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+                                                     uint32_t num_shards)
+{
+  if (num_shards == 0) {
+    // we'll get a floating point exception since we divide by
+    // num_shards
+    return 0;
+  }
+
+  // We want to minimize the chances that when num_shards >>
+  // num_entries that we return much fewer than num_entries to the
+  // client. Given all the overhead of making a cls call to the osd,
+  // returning a few entries is not much more work than returning one
+  // entry. This minimum might be better tuned based on future
+  // experiments where num_shards >> num_entries. (Note: ">>" should
+  // be interpreted as "much greater than".)
+  constexpr uint32_t min_read = 8;
+
+  // The following is based on _"Balls into Bins" -- A Simple and
+  // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
+  // cases when num_shards >> num_entries (it almost serves as a
+  // ceiling calculation). We also assume alpha is 1.0 and extract it
+  // from the calculation. Future work could involve memoizing some of
+  // the transcendental functions to minimize repeatedly re-calling
+  // them with the same parameters, which we expect to be the case the
+  // majority of the time.
+  uint32_t calc_read =
+    1 +
+    static_cast<uint32_t>((num_entries / num_shards) +
+                         sqrt((2 * num_entries) *
+                              log(num_shards) / num_shards));
+
+  return std::max(min_read, calc_read);
+}
+
+
+int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
+                                      RGWBucketInfo& bucket_info,
+                                      const rgw::bucket_index_layout_generation& idx_layout,
+                                      const int shard_id,
+                                     const rgw_obj_index_key& start_after,
+                                     const std::string& prefix,
+                                     const std::string& delimiter,
+                                     const uint32_t num_entries,
+                                     const bool list_versions,
+                                     const uint16_t expansion_factor,
+                                     ent_map_t& m,
+                                     bool* is_truncated,
+                                     bool* cls_filtered,
+                                     rgw_obj_index_key* last_entry,
+                                      optional_yield y,
+                                     RGWBucketListNameFilter force_check_filter)
+{
+  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+
+  /* expansion_factor allows the number of entries to read to grow
+   * exponentially; this is used when earlier reads are producing too
+   * few results, perhaps due to filtering or to a series of
+   * namespaced entries */
+
+  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
+    " start_after=\"" << start_after.to_string() <<
+    "\", prefix=\"" << prefix <<
+    ", delimiter=\"" << delimiter <<
+    "\", shard_id=" << shard_id <<
+    "\", num_entries=" << num_entries <<
+    ", shard_id=" << shard_id <<
+    ", list_versions=" << list_versions <<
+    ", expansion_factor=" << expansion_factor <<
+    ", force_check_filter is " <<
+    (force_check_filter ? "set" : "unset") << dendl_bitx;
+  ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+  m.clear();
+
+  RGWSI_RADOS::Pool index_pool;
+  // key   - oid (for different shards if there is any)
+  // value - list result for the corresponding oid (shard), it is filled by
+  //         the AIO callback
+  std::map<int, std::string> shard_oids;
+  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout,
+                                         &index_pool, &shard_oids,
+                                         nullptr);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << __func__ <<
+      ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl;
+    return r;
+  }
+
+  const uint32_t shard_count = shard_oids.size();
+  if (shard_count == 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+      ": the bucket index shard count appears to be 0, "
+      "which is an illegal value" << dendl;
+    return -ERR_INVALID_BUCKET_STATE;
+  }
+
+  uint32_t num_entries_per_shard;
+  if (expansion_factor == 0) {
+    num_entries_per_shard =
+      calc_ordered_bucket_list_per_shard(num_entries, shard_count);
+  } else if (expansion_factor <= 11) {
+    // we'll max out the exponential multiplication factor at 1024 (2<<10)
+    num_entries_per_shard =
+      std::min(num_entries,
+              (uint32_t(1 << (expansion_factor - 1)) *
+               calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
+  } else {
+    num_entries_per_shard = num_entries;
+  }
+
+  if (num_entries_per_shard == 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+      ": unable to calculate the number of entries to read from each "
+      "bucket index shard" << dendl;
+    return -ERR_INVALID_BUCKET_STATE;
+  }
+
+  ldpp_dout(dpp, 10) << __func__ <<
+    ": request from each of " << shard_count <<
+    " shard(s) for " << num_entries_per_shard << " entries to get " <<
+    num_entries << " total entries" << dendl;
+
+  auto& ioctx = index_pool.ioctx();
+  std::map<int, rgw_cls_list_ret> shard_list_results;
+  cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
+  r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
+                           num_entries_per_shard,
+                           list_versions, shard_oids, shard_list_results,
+                           cct->_conf->rgw_bucket_index_max_aio)();
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << __func__ <<
+      ": CLSRGWIssueBucketList for " << bucket_info.bucket <<
+      " failed" << dendl;
+    return r;
+  }
+
+  // to manage the iterators through each shard's list results
+  struct ShardTracker {
+    const size_t shard_idx;
+    rgw_cls_list_ret& result;
+    const std::string& oid_name;
+    RGWRados::ent_map_t::iterator cursor;
+    RGWRados::ent_map_t::iterator end;
+
+    // manages an iterator through a shard and provides other
+    // accessors
+    ShardTracker(size_t _shard_idx,
+                rgw_cls_list_ret& _result,
+                const std::string& _oid_name):
+      shard_idx(_shard_idx),
+      result(_result),
+      oid_name(_oid_name),
+      cursor(_result.dir.m.begin()),
+      end(_result.dir.m.end())
+    {}
+
+    inline const std::string& entry_name() const {
+      return cursor->first;
+    }
+    rgw_bucket_dir_entry& dir_entry() const {
+      return cursor->second;
+    }
+    inline bool is_truncated() const {
+      return result.is_truncated;
+    }
+    inline ShardTracker& advance() {
+      ++cursor;
+      // return a self-reference to allow for chaining of calls, such
+      // as x.advance().at_end()
+      return *this;
+    }
+    inline bool at_end() const {
+      return cursor == end;
+    }
+  }; // ShardTracker
+
+  // add the next unique candidate, or return false if we reach the end
+  auto next_candidate = [] (CephContext *cct, ShardTracker& t,
+                            std::multimap<std::string, size_t>& candidates,
+                            size_t tracker_idx) {
+    if (!t.at_end()) {
+      candidates.emplace(t.entry_name(), tracker_idx);
+    }
+    return;
+  };
+
+  // one tracker per shard requested (may not be all shards)
+  std::vector<ShardTracker> results_trackers;
+  results_trackers.reserve(shard_list_results.size());
+  for (auto& r : shard_list_results) {
+    results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
+
+    // if any *one* shard's result is trucated, the entire result is
+    // truncated
+    *is_truncated = *is_truncated || r.second.is_truncated;
+
+    // unless *all* are shards are cls_filtered, the entire result is
+    // not filtered
+    *cls_filtered = *cls_filtered && r.second.cls_filtered;
+  }
+
+  // create a map to track the next candidate entry from ShardTracker
+  // (key=candidate, value=index into results_trackers); as we consume
+  // entries from shards, we replace them with the next entries in the
+  // shards until we run out
+  std::multimap<std::string, size_t> candidates;
+  size_t tracker_idx = 0;
+  std::vector<size_t> vidx;
+  vidx.reserve(shard_list_results.size());
+  for (auto& t : results_trackers) {
+    // it's important that the values in the map refer to the index
+    // into the results_trackers vector, which may not be the same
+    // as the shard number (i.e., when not all shards are requested)
+    next_candidate(cct, t, candidates, tracker_idx);
+    ++tracker_idx;
+  }
+
+  rgw_bucket_dir_entry*
+    last_entry_visited = nullptr; // to set last_entry (marker)
+  std::map<std::string, bufferlist> updates;
+  uint32_t count = 0;
+  while (count < num_entries && !candidates.empty()) {
+    r = 0;
+    // select the next entry in lexical order (first key in map);
+    // again tracker_idx is not necessarily shard number, but is index
+    // into results_trackers vector
+    tracker_idx = candidates.begin()->second;
+    auto& tracker = results_trackers.at(tracker_idx);
+
+    const std::string& name = tracker.entry_name();
+    rgw_bucket_dir_entry& dirent = tracker.dir_entry();
+
+    ldpp_dout(dpp, 20) << __func__ << ": currently processing " <<
+      dirent.key << " from shard " << tracker.shard_idx << dendl;
+
+    const bool force_check =
+      force_check_filter && force_check_filter(dirent.key.name);
+
+    if ((!dirent.exists &&
+        !dirent.is_delete_marker() &&
+        !dirent.is_common_prefix()) ||
+        !dirent.pending_map.empty() ||
+        force_check) {
+      /* there are uncommitted ops. We need to check the current
+       * state, and if the tags are old we need to do clean-up as
+       * well. */
+      librados::IoCtx sub_ctx;
+      sub_ctx.dup(ioctx);
+      ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
+       " calling check_disk_state bucket=" << bucket_info.bucket <<
+       " entry=" << dirent.key << dendl_bitx;
+      r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
+                          updates[tracker.oid_name], y);
+      if (r < 0 && r != -ENOENT) {
+       ldpp_dout(dpp, 0) << __func__ <<
+         ": check_disk_state for \"" << dirent.key <<
+         "\" failed with r=" << r << dendl;
+       return r;
+      }
+    } else {
+      r = 0;
+    }
+
+    // at this point either r >= 0 or r == -ENOENT
+    if (r >= 0) { // i.e., if r != -ENOENT
+      ldpp_dout(dpp, 10) << __func__ << ": got " <<
+       dirent.key << dendl;
+
+      auto [it, inserted] = m.insert_or_assign(name, std::move(dirent));
+      last_entry_visited = &it->second;
+      if (inserted) {
+       ++count;
+      } else {
+       ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
+         " reassigned map value at \"" << name <<
+         "\", which should not happen" << dendl;
+      }
+    } else {
+      ldpp_dout(dpp, 10) << __func__ << ": skipping " <<
+       dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
+      last_entry_visited = &tracker.dir_entry();
+    }
+
+    // refresh the candidates map
+    vidx.clear();
+    bool need_to_stop = false;
+    auto range = candidates.equal_range(name);
+    for (auto i = range.first; i != range.second; ++i) {
+      vidx.push_back(i->second);
+    } 
+    candidates.erase(range.first, range.second);
+    for (auto idx : vidx) {
+      auto& tracker_match = results_trackers.at(idx);
+      tracker_match.advance();
+      next_candidate(cct, tracker_match, candidates, idx);
+      if (tracker_match.at_end() && tracker_match.is_truncated()) {
+        need_to_stop = true;
+        break;
+      }
+    }
+    if (need_to_stop) {
+      // once we exhaust one shard that is truncated, we need to stop,
+      // as we cannot be certain that one of the next entries needs to
+      // come from that shard; S3 and swift protocols allow returning
+      // fewer than what was requested
+      ldpp_dout(dpp, 10) << __func__ <<
+       ": stopped accumulating results at count=" << count <<
+       ", dirent=\"" << dirent.key <<
+       "\", because its shard is truncated and exhausted" << dendl;
+      break;
+    }
+  } // while we haven't provided requested # of result entries
+
+  // suggest updates if there are any
+  for (auto& miter : updates) {
+    if (miter.second.length()) {
+      ObjectWriteOperation o;
+      cls_rgw_suggest_changes(o, miter.second);
+      // we don't care if we lose suggested updates, send them off blindly
+      AioCompletion *c =
+       librados::Rados::aio_create_completion(nullptr, nullptr);
+
+      ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+       ": doing dir_suggest on " << miter.first << dendl_bitx;
+      ioctx.aio_operate(miter.first, c, &o);
+      c->release();
+    }
+  } // updates loop
+
+  // determine truncation by checking if all the returned entries are
+  // consumed or not
+  *is_truncated = false;
+  for (const auto& t : results_trackers) {
+    if (!t.at_end() || t.is_truncated()) {
+      *is_truncated = true;
+      break;
+    }
+  }
+
+  ldpp_dout(dpp, 20) << __func__ <<
+    ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
+    dendl;
+
+  if (*is_truncated && count < num_entries) {
+    ldpp_dout(dpp, 10) << __func__ <<
+      ": requested " << num_entries << " entries but returning " <<
+      count << ", which is truncated" << dendl;
+  }
+
+  if (last_entry_visited != nullptr && last_entry) {
+    *last_entry = last_entry_visited->key;
+    ldpp_dout(dpp, 20) << __func__ <<
+      ": returning, last_entry=" << *last_entry << dendl;
+  } else {
+    ldpp_dout(dpp, 20) << __func__ <<
+      ": returning, last_entry NOT SET" << dendl;
+  }
+
+  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
+  return 0;
+} // RGWRados::cls_bucket_list_ordered
+
+
// Extract the hash source from an incomplete multipart entry's oid
// (namespace already stripped) by truncating at the second-to-last
// period. For example "objname.2~uploadid.meta" yields "objname".
// Returns 0 on success, -EINVAL if there are fewer than two usable
// periods (or a period sits at position 0).
static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
  const auto last_dot = oid_wo_ns.rfind('.');
  if (last_dot == std::string::npos || last_dot == 0) {
    return -EINVAL;
  }
  const auto second_last_dot = oid_wo_ns.rfind('.', last_dot - 1);
  if (second_last_dot == std::string::npos || second_last_dot == 0) {
    return -EINVAL;
  }
  *index_hash_source = oid_wo_ns.substr(0, second_last_dot);
  return 0;
}
+
+
// Unordered bucket-index listing: reads index shards one at a time
// (instead of merging all shards in sorted order as the ordered variant
// does), so results are only ordered within a shard. Collects up to
// num_entries entries into ent_list.
//
// dpp                - logging prefix provider
// bucket_info        - bucket whose index is listed
// idx_layout         - index layout generation to open
// shard_id           - a single shard to list, or negative for all shards
// start_after        - marker; listing resumes after this key
// prefix             - only keys with this prefix are returned by the cls op
// num_entries        - maximum number of entries to return
// list_versions      - whether to list all object versions
// ent_list           - out: collected entries (cleared on entry)
// is_truncated       - out: true if more results remain
// last_entry         - out (optional): key of the last entry seen
// y                  - optional yield context
// force_check_filter - optional predicate forcing a head-object check
//
// Returns 0 on success or a negative error code.
int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
                                        RGWBucketInfo& bucket_info,
                                        const rgw::bucket_index_layout_generation& idx_layout,
                                        int shard_id,
                                        const rgw_obj_index_key& start_after,
                                        const std::string& prefix,
                                        uint32_t num_entries,
                                        bool list_versions,
                                        std::vector<rgw_bucket_dir_entry>& ent_list,
                                        bool *is_truncated,
                                        rgw_obj_index_key *last_entry,
                                        optional_yield y,
                                        RGWBucketListNameFilter force_check_filter) {
  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;

  // NOTE(review): this log line emits a stray '"' after shard_id and no
  // label in front of the force_check_filter set/unset indicator.
  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
    " start_after=\"" << start_after <<
    "\", prefix=\"" << prefix <<
    "\", shard_id=" << shard_id <<
    "\", num_entries=" << num_entries <<
    ", list_versions=" << list_versions <<
    (force_check_filter ? "set" : "unset") << dendl_bitx;
  ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;

  ent_list.clear();
  static MultipartMetaFilter multipart_meta_filter;

  *is_truncated = false;
  RGWSI_RADOS::Pool index_pool;

  // map of shard number -> index-shard rados object name
  std::map<int, std::string> oids;
  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, nullptr);
  if (r < 0) {
    return r;
  }

  auto& ioctx = index_pool.ioctx();

  const uint32_t num_shards = oids.size();

  // determine the shard at which to start reading
  rgw_obj_index_key marker = start_after;
  uint32_t current_shard;
  if (shard_id >= 0) {
    current_shard = shard_id;
  } else if (start_after.empty()) {
    current_shard = 0u;
  } else {
    // at this point we have a marker (start_after) that has something
    // in it, so we need to get to the bucket shard index, so we can
    // start reading from there

    // now convert the key (oid) to an rgw_obj_key since that will
    // separate out the namespace, name, and instance
    rgw_obj_key obj_key;
    bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key);
    if (!parsed) {
      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
	" received an invalid start marker: \"" << start_after << "\"" <<
	dendl;
      return -EINVAL;
    } else if (obj_key.name.empty()) {
      // if the name is empty that means the object name came in with
      // a namespace only, and therefore we need to start our scan at
      // the first bucket index shard
      current_shard = 0u;
    } else {
      // so now we have the key used to compute the bucket index shard
      // and can extract the specific shard from it
      if (obj_key.ns == RGW_OBJ_NS_MULTIPART) {
        // Use obj_key.ns == RGW_OBJ_NS_MULTIPART instead of
        // the implementation relying on MultipartMetaFilter
        // because MultipartMetaFilter only checks .meta suffix, which may
        // exclude data multiparts but include some regular objects with .meta suffix
        // by mistake.
        string index_hash_source;
        r = parse_index_hash_source(obj_key.name, &index_hash_source);
        if (r < 0) {
	  ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
	    " parse_index_hash_source unable to parse \"" << obj_key.name <<
	    "\", r=" << r << dendl;
          return r;
        }
        current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards);
      } else {
        current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
      }
    }
  }

  uint32_t count = 0u;
  std::map<std::string, bufferlist> updates; // suggested index fixes, keyed by shard oid
  rgw_obj_index_key last_added_entry;
  // keep reading shards until we've filled the page, run out of shards,
  // or (shard_id >= 0) the single requested shard is exhausted; the <=
  // lets us see one entry past the page to detect truncation
  while (count <= num_entries &&
	 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
	  current_shard < num_shards)) {
    const std::string& oid = oids[current_shard];
    rgw_cls_list_ret result;

    librados::ObjectReadOperation op;
    const std::string empty_delimiter;
    cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
			   num_entries,
                           list_versions, &result);
    r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
	": error in rgw_rados_operate (bucket list op), r=" << r << dendl;
      return r;
    }

    for (auto& entry : result.dir.m) {
      rgw_bucket_dir_entry& dirent = entry.second;

      // entries with uncommitted operations (or that the caller forces
      // us to check) are verified against the head object before use
      bool force_check = force_check_filter &&
	force_check_filter(dirent.key.name);
      if ((!dirent.exists && !dirent.is_delete_marker()) ||
	  !dirent.pending_map.empty() ||
	  force_check) {
	/* there are uncommitted ops. We need to check the current state,
	 * and if the tags are old we need to do cleanup as well. */
	librados::IoCtx sub_ctx;
	sub_ctx.dup(ioctx);
	ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
	  ": calling check_disk_state bucket=" << bucket_info.bucket <<
	  " entry=" << dirent.key << dendl_bitx;
	// note: dirent intentionally passed as both list_state and object
	r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
	if (r < 0 && r != -ENOENT) {
	  ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
	    ": error in check_disk_state, r=" << r << dendl;
	  return r;
	}
      } else {
        r = 0;
      }

      // at this point either r >= 0 or r == -ENOENT
      if (r >= 0) { // i.e., if r != -ENOENT
	ldpp_dout(dpp, 10) << __func__ << ": got " <<
	  dirent.key << dendl;

	if (count < num_entries) {
	  marker = last_added_entry = dirent.key; // double assign
	  ent_list.emplace_back(std::move(dirent));
	  ++count;
	} else {
	  // page already full; one more surviving entry proves truncation
	  last_added_entry = dirent.key;
	  *is_truncated = true;
	  ldpp_dout(dpp, 10) << "INFO: " << __func__ <<
	    ": reached max entries (" << num_entries << ") to return at \"" <<
	    dirent.key << "\"" << dendl;
	  goto check_updates;
	}
      } else { // r == -ENOENT
	// in the case of -ENOENT, make sure we're advancing marker
	// for possible next call to CLSRGWIssueBucketList
	marker = dirent.key;
      }
    } // entry for loop

    if (!result.is_truncated) {
      // if we reached the end of the shard read next shard
      ++current_shard;
      marker = rgw_obj_index_key();
    }
  } // shard loop

check_updates:

  // suggest updates if there is any
  std::map<std::string, bufferlist>::iterator miter = updates.begin();
  for (; miter != updates.end(); ++miter) {
    if (miter->second.length()) {
      ObjectWriteOperation o;
      cls_rgw_suggest_changes(o, miter->second);
      // we don't care if we lose suggested updates, send them off blindly
      AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);

      ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
	" doing dir_suggest on " << miter->first << dendl_bitx;
      ioctx.aio_operate(miter->first, c, &o);
      c->release();
    }
  }

  if (last_entry && !ent_list.empty()) {
    *last_entry = last_added_entry;
  }

  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
  return 0;
} // RGWRados::cls_bucket_list_unordered
+
+
+int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid,
+                                   rgw_usage_log_info& info)
+{
+  rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+  rgw_rados_ref ref;
+  int r = get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  ObjectWriteOperation op;
+  cls_rgw_usage_log_add(op, info);
+
+  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+  return r;
+}
+
+int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
+                                     uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+                                     string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
+                                    bool *is_truncated)
+{
+  rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+  rgw_rados_ref ref;
+  int r = get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  *is_truncated = false;
+
+  r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
+                            max_entries, read_iter, usage, is_truncated);
+
+  return r;
+}
+
+static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
+{
+  bool done = false;
+  do {
+    librados::ObjectWriteOperation op;
+    cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
+    int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+    if (r == -ENODATA)
+      done = true;
+    else if (r < 0)
+      return r;
+  } while (!done);
+
+  return 0;
+}
+
+int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
+                                    uint64_t start_epoch, uint64_t end_epoch)
+{
+  rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+  rgw_rados_ref ref;
+  int r = get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch);
+  return r;
+}
+
+int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid)
+{
+  rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+  rgw_rados_ref ref;
+  int r = get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+  librados::ObjectWriteOperation op;
+  cls_rgw_usage_log_clear(op);
+  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+  return r;
+}
+
+
// note: this removes entries from the rados bucket index objects
// without going through CLS; this is known to be called from
// "radosgw-admin unlink" and "radosgw-admin bucket check --fix"
//
// Each key in entry_key_list is mapped to its bucket index shard and
// removed directly from that shard's omap. Returns 0 on success,
// -EINVAL for indexless buckets, or the first omap/open error.
int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp,
				     RGWBucketInfo& bucket_info,
				     const std::list<rgw_obj_index_key>& entry_key_list)
{
  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << bucket_info.bucket <<
    " entry_key_list.size()=" << entry_key_list.size() << dendl_bitx;
  ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;

  // direct omap removal is meaningless for an indexless layout
  const auto& current_index = bucket_info.get_current_index();
  if (is_layout_indexless(current_index)) {
    return -EINVAL;
  }
  const uint32_t num_shards = current_index.layout.normal.num_shards;

  RGWSI_RADOS::Pool index_pool;
  std::map<int, std::string> index_oids;
  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt,
					  bucket_info.layout.current_index,
					  &index_pool, &index_oids, nullptr);
  if (r < 0) {
    ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
      " open_bucket_index returned " << r << dendl_bitx;
    return r;
  }

  // split up removals by shard
  std::map<int, std::set<std::string>> sharded_removals;
  for (const auto& entry_key : entry_key_list) {
    const rgw_obj_key obj_key(entry_key);
    const uint32_t shard =
      RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards);

    // entry_key already combines namespace and name, so we first have
    // to break that apart before we can then combine with instance
    std::string name;
    std::string ns; // namespace
    rgw_obj_key::parse_index_key(entry_key.name, &name, &ns);
    rgw_obj_key full_key(name, entry_key.instance, ns);
    std::string combined_key = full_key.get_oid();

    sharded_removals[shard].insert(combined_key);

    ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
      ": removal from bucket index, bucket=" << bucket_info.bucket <<
      " key=" << combined_key << " designated for shard " << shard <<
      dendl_bitx;
  }

  // issue one omap_rm_keys per shard; stop at the first failure
  for (const auto& removals : sharded_removals) {
    const int shard = removals.first;
    const std::string& oid = index_oids[shard];

    ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
      ": removal from bucket index, bucket=" << bucket_info.bucket <<
      ", shard=" << shard << ", oid=" << oid << ", num_keys=" <<
      removals.second.size() << dendl_bitx;

    r = index_pool.ioctx().omap_rm_keys(oid, removals.second);
    if (r < 0) {
      ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
	": omap_rm_keys returned ret=" << r <<
	dendl_bitx;
      return r;
    }
  }

  // r is 0 here (all removals succeeded, or there were none)
  ldout_bitx(bitx, dpp, 5) <<
    "EXITING " << __func__ << " and returning " << r << dendl_bitx;

  return r;
}
+
// Reconcile a bucket-index entry (list_state) against the actual state
// of the object's head on disk. Metadata read from the head object's
// attrs is copied into `object`, and a suggested index update — or a
// suggested removal when the object no longer exists — is appended to
// suggested_updates for later cls_rgw_suggest_changes processing.
// Returns -ENOENT when the object is gone and a removal was suggested,
// 0 on success, or a negative error from get_obj_state/ioctx lookups.
//
// NOTE(review): callers may pass the same rgw_bucket_dir_entry for both
// list_state and object (see the cls_bucket_list_* callers), so these
// two reference parameters can alias.
// NOTE(review): io_ctx is taken by value — presumably deliberate since
// locator_set_key() mutates it locally; TODO confirm.
int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
                               librados::IoCtx io_ctx,
                               RGWBucketInfo& bucket_info,
                               rgw_bucket_dir_entry& list_state,
                               rgw_bucket_dir_entry& object,
                               bufferlist& suggested_updates,
                               optional_yield y)
{
  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" <<
    bucket_info.bucket << " dir_entry=" << list_state.key << dendl_bitx;

  std::unique_ptr<rgw::sal::Bucket> bucket;
  driver->get_bucket(nullptr, bucket_info, &bucket);
  // suggested changes are flagged for data-log recording on multisite zones
  uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);

  std::string loc;

  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(list_state.key);
  // multipart meta entries live in the extra-data pool
  MultipartMetaFilter multipart_meta_filter;
  string temp_key;
  if (multipart_meta_filter.filter(list_state.key.name, temp_key)) {
    obj->set_in_extra_data(true);
  }

  string oid;
  get_obj_bucket_and_oid_loc(obj->get_obj(), oid, loc);

  if (loc != list_state.locator) {
    ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
  }

  io_ctx.locator_set_key(list_state.locator);

  RGWObjState *astate = NULL;
  RGWObjManifest *manifest = nullptr;
  RGWObjectCtx rctx(this->driver);
  int r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y);
  if (r < 0)
    return r;

  list_state.pending_map.clear(); // we don't need this and it inflates size
  if (!list_state.is_delete_marker() && !astate->exists) {
    // NOTE(review): this message reads "disk state exists" but the
    // branch is taken when the head object does NOT exist on disk.
    ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": disk state exists" << dendl_bitx;
      /* object doesn't exist right now -- hopefully because it's
       * marked as !exists and got deleted */
    if (list_state.exists) {
      ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": index list state exists" << dendl_bitx;
      /* FIXME: what should happen now? Work out if there are any
       * non-bad ways this could happen (there probably are, but annoying
       * to handle!) */
    }

    // encode a suggested removal of that key
    list_state.ver.epoch = io_ctx.get_last_version();
    list_state.ver.pool = io_ctx.get_id();
    ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": encoding remove of " << list_state.key << " on suggested_updates" << dendl_bitx;
    cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | suggest_flag, list_state, suggested_updates);
    return -ENOENT;
  }

  string etag;
  string content_type;
  string storage_class;
  ACLOwner owner;
  bool appendable = false;

  object.meta.size = astate->size;
  object.meta.accounted_size = astate->accounted_size;
  object.meta.mtime = astate->mtime;

  // pull the listed metadata out of the head object's xattrs
  map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
  if (iter != astate->attrset.end()) {
    etag = rgw_bl_str(iter->second);
  }
  iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
  if (iter != astate->attrset.end()) {
    content_type = rgw_bl_str(iter->second);
  }
  iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
  if (iter != astate->attrset.end()) {
    storage_class = rgw_bl_str(iter->second);
  }
  iter = astate->attrset.find(RGW_ATTR_ACL);
  if (iter != astate->attrset.end()) {
    r = decode_policy(dpp, iter->second, &owner);
    if (r < 0) {
      // non-fatal: continue with an empty owner
      ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl;
    }
  }
  iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
  if (iter != astate->attrset.end()) {
    appendable = true;
  }

  if (manifest) {
    // walk the manifest and drop any multipart-namespace parts from the
    // index; best effort — failures are only logged
    RGWObjManifest::obj_iterator miter;
    for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
      const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(driver);
      rgw_obj loc;
      RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_obj().bucket, raw_loc, &loc);

      if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
	ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << " removing manifest part from index loc=" << loc << dendl_bitx;
	r = delete_obj_index(loc, astate->mtime, dpp);
	if (r < 0) {
	  ldout_bitx(bitx, dpp, 0) <<
	    "WARNING: " << __func__ << ": delete_obj_index returned r=" << r << dendl_bitx;
	}
      }
    }
  }

  object.meta.etag = etag;
  object.meta.content_type = content_type;
  object.meta.storage_class = storage_class;
  object.meta.owner = owner.get_id().to_str();
  object.meta.owner_display_name = owner.get_display_name();
  object.meta.appendable = appendable;

  // encode suggested updates

  list_state.meta.size = object.meta.size;
  list_state.meta.accounted_size = object.meta.accounted_size;
  list_state.meta.mtime = object.meta.mtime;
  list_state.meta.category = main_category;
  list_state.meta.etag = etag;
  list_state.meta.appendable = appendable;
  list_state.meta.content_type = content_type;
  list_state.meta.storage_class = storage_class;

  librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id
  r = get_obj_head_ioctx(dpp, bucket_info, obj->get_obj(), &head_obj_ctx);
  if (r < 0) {
    // non-fatal: the suggestion just keeps the old version pool/epoch
    ldpp_dout(dpp, 0) << __func__ <<
      " WARNING: unable to find head object data pool for \"" <<
      obj << "\", not updating version pool/epoch" << dendl;
  } else {
    list_state.ver.pool = head_obj_ctx.get_id();
    list_state.ver.epoch = astate->epoch;
  }

  if (astate->obj_tag.length() > 0) {
    list_state.tag = astate->obj_tag.c_str();
  }

  list_state.meta.owner = owner.get_id().to_str();
  list_state.meta.owner_display_name = owner.get_display_name();

  list_state.exists = true;

  ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
    ": encoding update of " << list_state.key << " on suggested_updates" << dendl_bitx;
  cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);

  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
  return 0;
} // RGWRados::check_disk_state
+
+int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
+{
+  RGWSI_RADOS::Pool index_pool;
+  map<int, string> oids;
+  map<int, struct rgw_cls_list_ret> list_results;
+  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, bucket_instance_ids);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned "
+                   << r << dendl;
+    return r;
+  }
+
+  r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
+                   << r << dendl;
+    return r;
+  }
+
+  map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
+  for(; iter != list_results.end(); ++iter) {
+    headers.push_back(std::move(iter->second.dir.header));
+  }
+  return 0;
+}
+
// Asynchronously request the directory header of every bucket index
// shard (or the single shard given by shard_id >= 0). *num_aio is
// incremented once per successfully dispatched operation; on a dispatch
// failure the loop stops and the error is returned (operations already
// dispatched remain in flight).
int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
{
  RGWSI_RADOS::Pool index_pool;
  map<int, string> bucket_objs;
  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &bucket_objs, nullptr);
  if (r < 0)
    return r;

  map<int, string>::iterator iter = bucket_objs.begin();
  for (; iter != bucket_objs.end(); ++iter) {
    // ctx->get()/put() appear to manage a reference per in-flight op:
    // one ref is taken per dispatch and dropped again if dispatch fails
    // — TODO confirm against RGWGetDirHeader_CB's refcount semantics.
    r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
    if (r < 0) {
      ctx->put();
      break;
    } else {
      (*num_aio)++;
    }
  }
  return r;
}
+
+int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
+                                 const rgw_bucket& bucket,
+                                 uint64_t num_objs,
+                                  const DoutPrefixProvider *dpp)
+{
+  if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
+      return 0;
+  }
+
+  bool need_resharding = false;
+  uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
+  const uint32_t max_dynamic_shards =
+    uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
+
+  if (num_source_shards >= max_dynamic_shards) {
+    return 0;
+  }
+
+  uint32_t suggested_num_shards = 0;
+  const uint64_t max_objs_per_shard =
+    cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
+
+  // TODO: consider per-bucket sync policy here?
+  const bool is_multisite = svc.zone->get_zone().log_data;
+
+  quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards,
+                                    num_objs, is_multisite, need_resharding,
+                                    &suggested_num_shards);
+  if (! need_resharding) {
+    return 0;
+  }
+
+  const uint32_t final_num_shards =
+    RGWBucketReshard::get_preferred_shards(suggested_num_shards,
+                                          max_dynamic_shards);
+  // final verification, so we don't reduce number of shards
+  if (final_num_shards <= num_source_shards) {
+    return 0;
+  }
+
+  ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
+    " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards <<
+    "; new num shards " << final_num_shards << " (suggested " <<
+    suggested_num_shards << ")" << dendl;
+
+  return add_bucket_to_reshard(dpp, bucket_info, final_num_shards);
+}
+
+int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
+{
+  RGWReshard reshard(this->driver, dpp);
+
+  uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
+
+  new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
+  if (new_num_shards <= num_source_shards) {
+    ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
+    return 0;
+  }
+
+  cls_rgw_reshard_entry entry;
+  entry.time = real_clock::now();
+  entry.tenant = bucket_info.owner.tenant;
+  entry.bucket_name = bucket_info.bucket.name;
+  entry.bucket_id = bucket_info.bucket.bucket_id;
+  entry.old_num_shards = num_source_shards;
+  entry.new_num_shards = new_num_shards;
+
+  return reshard.add(dpp, entry);
+}
+
+int RGWRados::check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
+                          RGWQuota& quota,
+                         uint64_t obj_size, optional_yield y,
+                         bool check_size_only)
+{
+  // if we only check size, then num_objs will set to 0
+  if(check_size_only)
+    return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 0, obj_size, y);
+
+  return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 1, obj_size, y);
+}
+
+int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key,
+                                  int *shard_id)
+{
+  int r = 0;
+  switch (layout.hash_type) {
+    case rgw::BucketHashType::Mod:
+      if (!layout.num_shards) {
+        if (shard_id) {
+          *shard_id = -1;
+        }
+      } else {
+        uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards);
+        if (shard_id) {
+          *shard_id = (int)sid;
+        }
+      }
+      break;
+    default:
+      r = -ENOTSUP;
+  }
+  return r;
+}
+
// Return the instance id reported by the underlying librados client
// handle.
uint64_t RGWRados::instance_id()
{
  return get_rados_handle()->get_instance_id();
}
+
// Return the next bucket id from a monotonically increasing counter;
// the increment is serialized by bucket_id_lock.
uint64_t RGWRados::next_bucket_id()
{
  std::lock_guard l{bucket_id_lock};
  return ++max_bucket_id;
}
+
// Accessor for the embedded librados client handle (non-owning pointer).
librados::Rados* RGWRados::get_rados_handle()
{
  return &rados;
}
+
+int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
+{
+  rgw_rados_ref ref;
+  int ret = get_raw_obj_ref(dpp, obj, &ref);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
+    return ret;
+  }
+
+  ObjectWriteOperation op;
+  list<string> prefixes;
+  cls_rgw_remove_obj(op, prefixes);
+
+  AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+  ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
+    c->release();
+    return ret;
+  }
+
+  handles.push_back(c);
+
+  return 0;
+}
+
// Asynchronously delete an object's head via cls_rgw_remove_obj,
// optionally keeping the bucket index consistent: an index op is
// prepared (pending delete) before dispatch, and the index entry is
// deleted afterwards. The aio completion is appended to `handles` for
// the caller to reap.
int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj,
                             RGWBucketInfo& bucket_info, RGWObjState *astate,
                             list<librados::AioCompletion *>& handles, bool keep_index_consistent,
                             optional_yield y)
{
  rgw_rados_ref ref;
  int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref);
  if (ret < 0) {
    ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
    return ret;
  }

  if (keep_index_consistent) {
    // mark the index entry with a pending delete before issuing the
    // actual head removal
    RGWRados::Bucket bop(this, bucket_info);
    RGWRados::Bucket::UpdateIndex index_op(&bop, obj);

    ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y);
    if (ret < 0) {
      ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
      return ret;
    }
  }

  ObjectWriteOperation op;
  list<string> prefixes;
  cls_rgw_remove_obj(op, prefixes);

  AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
  ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
  if (ret < 0) {
    ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
    c->release();
    return ret;
  }

  handles.push_back(c);

  if (keep_index_consistent) {
    // NOTE(review): the index entry is removed as soon as the head
    // delete is *dispatched*, not when it completes — TODO confirm this
    // ordering is intended.
    ret = delete_obj_index(obj, astate->mtime, dpp);
    if (ret < 0) {
      ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
      return ret;
    }
  }
  return ret;
}
+
+void objexp_hint_entry::generate_test_instances(list<objexp_hint_entry*>& o)
+{
+  auto it = new objexp_hint_entry;
+  it->tenant = "tenant1";
+  it->bucket_name = "bucket1";
+  it->bucket_id = "1234";
+  it->obj_key = rgw_obj_key("obj");
+  o.push_back(it);
+  o.push_back(new objexp_hint_entry);
+}
+
// Dump this expiration hint as a JSON object; exp_time is converted to
// utime_t so it is formatted like other timestamps.
void objexp_hint_entry::dump(Formatter *f) const
{
  f->open_object_section("objexp_hint_entry");
  encode_json("tenant", tenant, f);
  encode_json("bucket_name", bucket_name, f);
  encode_json("bucket_id", bucket_id, f);
  encode_json("rgw_obj_key", obj_key, f);
  utime_t ut(exp_time);
  encode_json("exp_time", ut, f);
  f->close_section();
}
+
+void RGWOLHInfo::generate_test_instances(list<RGWOLHInfo*> &o)
+{
+  RGWOLHInfo *olh = new RGWOLHInfo;
+  olh->removed = false;
+  o.push_back(olh);
+  o.push_back(new RGWOLHInfo);
+}
+
// Dump this OLH info as JSON. Note: only the target is emitted; the
// `removed` flag is not included in the output.
void RGWOLHInfo::dump(Formatter *f) const
{
  encode_json("target", target, f);
}
+
// Dump pending-OLH info as JSON; time is converted to utime_t for
// consistent timestamp formatting.
void RGWOLHPendingInfo::dump(Formatter *f) const
{
  utime_t ut(time);
  encode_json("time", ut, f);
}
+
diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h
new file mode 100644 (file)
index 0000000..a3258ac
--- /dev/null
@@ -0,0 +1,1632 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <iostream>
+#include <functional>
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "include/rados/librados.hpp"
+#include "include/Context.h"
+#include "include/random.h"
+#include "common/RefCountedObj.h"
+#include "common/ceph_time.h"
+#include "common/Timer.h"
+#include "rgw_common.h"
+#include "cls/rgw/cls_rgw_types.h"
+#include "cls/version/cls_version_types.h"
+#include "cls/log/cls_log_types.h"
+#include "cls/timeindex/cls_timeindex_types.h"
+#include "cls/otp/cls_otp_types.h"
+#include "rgw_quota.h"
+#include "rgw_log.h"
+#include "rgw_metadata.h"
+#include "rgw_meta_sync_status.h"
+#include "rgw_period_puller.h"
+#include "rgw_obj_manifest.h"
+#include "rgw_sync_module.h"
+#include "rgw_trim_bilog.h"
+#include "rgw_service.h"
+#include "rgw_sal.h"
+#include "rgw_aio.h"
+#include "rgw_d3n_cacherequest.h"
+
+#include "services/svc_rados.h"
+#include "services/svc_bi_rados.h"
+#include "common/Throttle.h"
+#include "common/ceph_mutex.h"
+#include "rgw_cache.h"
+#include "rgw_sal_fwd.h"
+
+struct D3nDataCache;
+
+class RGWWatcher;
+class ACLOwner;
+class RGWGC;
+class RGWMetaNotifier;
+class RGWDataNotifier;
+class RGWLC;
+class RGWObjectExpirer;
+class RGWMetaSyncProcessorThread;
+class RGWDataSyncProcessorThread;
+class RGWSyncLogTrimThread;
+class RGWSyncTraceManager;
+struct RGWZoneGroup;
+struct RGWZoneParams;
+class RGWReshard;
+class RGWReshardWait;
+
+struct get_obj_data;
+
+/* flags for put_obj_meta() */
+#define PUT_OBJ_CREATE      0x01
+#define PUT_OBJ_EXCL        0x02
+#define PUT_OBJ_CREATE_EXCL (PUT_OBJ_CREATE | PUT_OBJ_EXCL)
+
+// Build a head-object oid as "<bucket marker>_<orig_oid>".
+// An empty marker (or an empty input oid) leaves the oid unchanged.
+static inline void prepend_bucket_marker(const rgw_bucket& bucket, const std::string& orig_oid, std::string& oid)
+{
+  if (bucket.marker.empty() || orig_oid.empty()) {
+    oid = orig_oid;
+    return;
+  }
+  oid.assign(bucket.marker);
+  oid.push_back('_');
+  oid.append(orig_oid);
+}
+
+// Derive the RADOS oid and locator for an rgw_obj, applying the bucket
+// marker prefix to both. The locator is cleared when the key has none.
+static inline void get_obj_bucket_and_oid_loc(const rgw_obj& obj, std::string& oid, std::string& locator)
+{
+  prepend_bucket_marker(obj.bucket, obj.get_oid(), oid);
+  const std::string& loc = obj.key.get_loc();
+  if (loc.empty()) {
+    locator.clear();
+  } else {
+    prepend_bucket_marker(obj.bucket, loc, locator);
+  }
+}
+
+// OLH (object logical head) state for a versioned object: the instance
+// the logical head currently points at, and whether it has been removed.
+// encode()/decode() order is the on-wire format -- do not reorder.
+struct RGWOLHInfo {
+  rgw_obj target;  // instance the OLH currently resolves to
+  bool removed;    // true once the logical head was deleted
+
+  RGWOLHInfo() : removed(false) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(target, bl);
+    encode(removed, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+     DECODE_START(1, bl);
+     decode(target, bl);
+     decode(removed, bl);
+     DECODE_FINISH(bl);
+  }
+  static void generate_test_instances(std::list<RGWOLHInfo*>& o);
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOLHInfo)
+
+// A pending OLH log entry; records when the pending operation was queued.
+// encode()/decode() order is the on-wire format -- do not reorder.
+struct RGWOLHPendingInfo {
+  ceph::real_time time;  // timestamp of the pending operation
+
+  RGWOLHPendingInfo() {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(time, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+     DECODE_START(1, bl);
+     decode(time, bl);
+     DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOLHPendingInfo)
+
+// Batch of usage-log entries keyed by timestamp; entries sharing a
+// timestamp are aggregated into a single record.
+struct RGWUsageBatch {
+  std::map<ceph::real_time, rgw_usage_log_entry> m;
+
+  // Merge `entry` into the slot for time `t`. *account is set to true
+  // only when a new slot was created (this timestamp had not been seen
+  // before), letting callers count distinct timestamps.
+  void insert(ceph::real_time& t, rgw_usage_log_entry& entry, bool *account) {
+    // single lookup: try_emplace reports whether the slot is new
+    // (original did find() followed by operator[], i.e. two lookups)
+    auto [it, inserted] = m.try_emplace(t);
+    *account = inserted;
+    it->second.aggregate(entry);
+  }
+};
+
+// Describes a byte-range clone from a source object into a destination.
+// Default member initializers keep the scalar fields from being read
+// uninitialized when the struct is stack-allocated.
+struct RGWCloneRangeInfo {
+  rgw_obj src;        // source object
+  off_t src_ofs{0};   // byte offset within the source
+  off_t dst_ofs{0};   // byte offset within the destination
+  uint64_t len{0};    // number of bytes to clone
+};
+
+// Hook invoked while fetching a remote object: lets an implementation
+// override the destination owner and select the placement rule before
+// the object is written locally.
+class RGWFetchObjFilter {
+public:
+  virtual ~RGWFetchObjFilter() {}
+
+  // May set *poverride_owner and point *prule at the placement rule to
+  // use for the destination. `obj_attrs` are the source object's attrs.
+  virtual int filter(CephContext *cct,
+                     const rgw_obj_key& source_key,
+                     const RGWBucketInfo& dest_bucket_info,
+                     std::optional<rgw_placement_rule> dest_placement_rule,
+                     const std::map<std::string, bufferlist>& obj_attrs,
+                     std::optional<rgw_user> *poverride_owner,
+                     const rgw_placement_rule **prule) = 0;
+};
+
+// Default fetch filter; holds the destination placement rule it resolves
+// (implementation in rgw_rados.cc).
+class RGWFetchObjFilter_Default : public RGWFetchObjFilter {
+protected:
+  rgw_placement_rule dest_rule;  // rule chosen for the destination
+public:
+  RGWFetchObjFilter_Default() {}
+
+  int filter(CephContext *cct,
+             const rgw_obj_key& source_key,
+             const RGWBucketInfo& dest_bucket_info,
+             std::optional<rgw_placement_rule> dest_placement_rule,
+             const std::map<std::string, bufferlist>& obj_attrs,
+             std::optional<rgw_user> *poverride_owner,
+             const rgw_placement_rule **prule) override;
+};
+
+// Cached object state together with its (optional) manifest; the unit
+// stored per-object by RGWObjectCtx below.
+struct RGWObjStateManifest {
+  RGWObjState state;
+  std::optional<RGWObjManifest> manifest;
+};
+
+class RGWObjectCtx {
+  rgw::sal::Driver* driver;
+  ceph::shared_mutex lock = ceph::make_shared_mutex("RGWObjectCtx");
+
+  std::map<rgw_obj, RGWObjStateManifest> objs_state;
+public:
+  explicit RGWObjectCtx(rgw::sal::Driver* _driver) : driver(_driver) {}
+  RGWObjectCtx(RGWObjectCtx& _o) {
+    std::unique_lock wl{lock};
+    this->driver = _o.driver;
+    this->objs_state = _o.objs_state;
+  }
+
+  rgw::sal::Driver* get_driver() {
+    return driver;
+  }
+
+  RGWObjStateManifest *get_state(const rgw_obj& obj);
+
+  void set_compressed(const rgw_obj& obj);
+  void set_atomic(rgw_obj& obj);
+  void set_prefetch_data(const rgw_obj& obj);
+  void invalidate(const rgw_obj& obj);
+};
+
+
+// Cached state of a raw RADOS object: existence, size, mtime, tags and
+// optionally prefetched data.
+// NOTE(review): the copy constructor copies has_attrs but NOT attrset,
+// leaving the copy's attr map empty -- confirm whether that is intentional
+// (the warning comment below suggests the member list must be kept in sync).
+struct RGWRawObjState {
+  rgw_raw_obj obj;
+  bool has_attrs{false};
+  bool exists{false};
+  uint64_t size{0};
+  ceph::real_time mtime;
+  uint64_t epoch{0};
+  bufferlist obj_tag;
+  bool has_data{false};
+  bufferlist data;           // prefetched object data, valid when has_data
+  bool prefetch_data{false};
+  uint64_t pg_ver{0};
+
+  /* important! don't forget to update copy constructor */
+
+  RGWObjVersionTracker objv_tracker;
+
+  std::map<std::string, bufferlist> attrset;
+  RGWRawObjState() {}
+  RGWRawObjState(const RGWRawObjState& rhs) : obj (rhs.obj) {
+    has_attrs = rhs.has_attrs;
+    exists = rhs.exists;
+    size = rhs.size;
+    mtime = rhs.mtime;
+    epoch = rhs.epoch;
+    if (rhs.obj_tag.length()) {
+      obj_tag = rhs.obj_tag;
+    }
+    has_data = rhs.has_data;
+    if (rhs.data.length()) {
+      data = rhs.data;
+    }
+    prefetch_data = rhs.prefetch_data;
+    pg_ver = rhs.pg_ver;
+    objv_tracker = rhs.objv_tracker;
+  }
+};
+
+// Iteration state for enumerating the objects of a single pool.
+struct RGWPoolIterCtx {
+  librados::IoCtx io_ctx;
+  librados::NObjectIterator iter;
+};
+
+// Cursor state threaded through list_raw_objects_init()/_next().
+struct RGWListRawObjsCtx {
+  bool initialized = false;  // set once the pool iterator has been opened
+  RGWPoolIterCtx iter_ctx;
+
+  RGWListRawObjsCtx() = default;
+};
+
+// Hint telling the object expirer which object to expire and when.
+// Wire format note: `tenant` was appended in encoding v2, which is why it
+// is encoded last and decoded only when struct_v >= 2. Do not reorder.
+struct objexp_hint_entry {
+  std::string tenant;
+  std::string bucket_name;
+  std::string bucket_id;
+  rgw_obj_key obj_key;
+  ceph::real_time exp_time;  // when the object should expire
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(bucket_name, bl);
+    encode(bucket_id, bl);
+    encode(obj_key, bl);
+    encode(exp_time, bl);
+    encode(tenant, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    // XXX Do we want DECODE_START_LEGACY_COMPAT_LEN(2, 1, 1, bl); ?
+    DECODE_START(2, bl);
+    decode(bucket_name, bl);
+    decode(bucket_id, bl);
+    decode(obj_key, bl);
+    decode(exp_time, bl);
+    if (struct_v >= 2) {
+      decode(tenant, bl);
+    } else {
+      // v1 encodings predate multi-tenancy: no tenant on the wire
+      tenant.clear();
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<objexp_hint_entry*>& o);
+};
+WRITE_CLASS_ENCODER(objexp_hint_entry)
+
+class RGWMetaSyncStatusManager;
+class RGWDataSyncStatusManager;
+class RGWCoroutinesManagerRegistry;
+
+class RGWGetDirHeader_CB;
+class RGWGetUserHeader_CB;
+namespace rgw { namespace sal {
+  class RadosStore;
+  class MPRadosSerializer;
+  class LCRadosSerializer;
+} }
+
+class RGWAsyncRadosProcessor;
+
+template <class T>
+class RGWChainedCacheImpl;
+
+// Bucket-info cache record: the bucket metadata plus its modification
+// time and raw attributes (used with RGWChainedCacheImpl below).
+struct bucket_info_entry {
+  RGWBucketInfo info;
+  real_time mtime;
+  std::map<std::string, bufferlist> attrs;
+};
+
+struct tombstone_entry;
+
+template <class K, class V>
+class lru_map;
+using tombstone_cache_t = lru_map<rgw_obj, tombstone_entry>;
+
+class RGWIndexCompletionManager;
+
+class RGWRados
+{
+  friend class RGWGC;
+  friend class RGWMetaNotifier;
+  friend class RGWDataNotifier;
+  friend class RGWObjectExpirer;
+  friend class RGWMetaSyncProcessorThread;
+  friend class RGWDataSyncProcessorThread;
+  friend class RGWReshard;
+  friend class RGWBucketReshard;
+  friend class RGWBucketReshardLock;
+  friend class BucketIndexLockGuard;
+  friend class rgw::sal::MPRadosSerializer;
+  friend class rgw::sal::LCRadosSerializer;
+  friend class rgw::sal::RadosStore;
+
+  /** Open the pool used as root for this gateway */
+  int open_root_pool_ctx(const DoutPrefixProvider *dpp);
+  int open_gc_pool_ctx(const DoutPrefixProvider *dpp);
+  int open_lc_pool_ctx(const DoutPrefixProvider *dpp);
+  int open_objexp_pool_ctx(const DoutPrefixProvider *dpp);
+  int open_reshard_pool_ctx(const DoutPrefixProvider *dpp);
+  int open_notif_pool_ctx(const DoutPrefixProvider *dpp);
+
+  int open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx&  io_ctx,
+                   bool mostly_omap);
+
+
+  ceph::mutex lock = ceph::make_mutex("rados_timer_lock");
+  SafeTimer *timer;
+
+  rgw::sal::RadosStore* driver = nullptr;
+  RGWGC *gc = nullptr;
+  RGWLC *lc;
+  RGWObjectExpirer *obj_expirer;
+  bool use_gc_thread;
+  bool use_lc_thread;
+  bool quota_threads;
+  bool run_sync_thread;
+  bool run_reshard_thread;
+
+  RGWMetaNotifier *meta_notifier;
+  RGWDataNotifier *data_notifier;
+  RGWMetaSyncProcessorThread *meta_sync_processor_thread;
+  RGWSyncTraceManager *sync_tracer = nullptr;
+  std::map<rgw_zone_id, RGWDataSyncProcessorThread *> data_sync_processor_threads;
+
+  boost::optional<rgw::BucketTrimManager> bucket_trim;
+  RGWSyncLogTrimThread *sync_log_trimmer{nullptr};
+
+  ceph::mutex meta_sync_thread_lock = ceph::make_mutex("meta_sync_thread_lock");
+  ceph::mutex data_sync_thread_lock = ceph::make_mutex("data_sync_thread_lock");
+
+  librados::IoCtx root_pool_ctx;      // .rgw
+
+  double inject_notify_timeout_probability = 0;
+  unsigned max_notify_retries = 0;
+
+  friend class RGWWatcher;
+
+  ceph::mutex bucket_id_lock = ceph::make_mutex("rados_bucket_id");
+
+  // This field represents the number of bucket index object shards
+  uint32_t bucket_index_max_shards;
+
+  std::string get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y);
+
+  int get_obj_head_ref(const DoutPrefixProvider *dpp, const rgw_placement_rule& target_placement_rule, const rgw_obj& obj, rgw_rados_ref *ref);
+  int get_obj_head_ref(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref);
+  int get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
+  uint64_t max_bucket_id;
+
+  int get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& rctx,
+                          RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+                          RGWObjState *olh_state, RGWObjState **target_state,
+                          RGWObjManifest **target_manifest, optional_yield y);
+  int get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
+                         bool follow_olh, optional_yield y, bool assume_noent = false);
+  int append_atomic_test(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+                         librados::ObjectOperation& op, RGWObjState **state,
+                        RGWObjManifest** pmanifest, optional_yield y);
+  
+  int update_placement_map();
+  int store_bucket_info(RGWBucketInfo& info, std::map<std::string, bufferlist> *pattrs, RGWObjVersionTracker *objv_tracker, bool exclusive);
+
+  void remove_rgw_head_obj(librados::ObjectWriteOperation& op);
+  void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const std::string& prefix, bool fail_if_exist);
+  void cls_obj_check_mtime(librados::ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type);
+protected:
+  CephContext *cct;
+
+  librados::Rados rados;
+
+  using RGWChainedCacheImpl_bucket_info_entry = RGWChainedCacheImpl<bucket_info_entry>;
+  RGWChainedCacheImpl_bucket_info_entry *binfo_cache;
+
+  tombstone_cache_t *obj_tombstone_cache;
+
+  librados::IoCtx gc_pool_ctx;        // .rgw.gc
+  librados::IoCtx lc_pool_ctx;        // .rgw.lc
+  librados::IoCtx objexp_pool_ctx;
+  librados::IoCtx reshard_pool_ctx;
+  librados::IoCtx notif_pool_ctx;     // .rgw.notif
+
+  bool pools_initialized;
+
+  RGWQuotaHandler *quota_handler;
+
+  RGWCoroutinesManagerRegistry *cr_registry;
+
+  RGWSyncModuleInstanceRef sync_module;
+  bool writeable_zone{false};
+
+  RGWIndexCompletionManager *index_completion_manager{nullptr};
+
+  bool use_cache{false};
+  bool use_gc{true};
+  bool use_datacache{false};
+
+  int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx);
+public:
+  RGWRados(): timer(NULL),
+               gc(NULL), lc(NULL), obj_expirer(NULL), use_gc_thread(false), use_lc_thread(false), quota_threads(false),
+               run_sync_thread(false), run_reshard_thread(false), meta_notifier(NULL),
+               data_notifier(NULL), meta_sync_processor_thread(NULL),
+               bucket_index_max_shards(0),
+               max_bucket_id(0), cct(NULL),
+               binfo_cache(NULL), obj_tombstone_cache(nullptr),
+               pools_initialized(false),
+               quota_handler(NULL),
+               cr_registry(NULL),
+               pctl(&ctl),
+               reshard(NULL) {}
+
+  RGWRados& set_use_cache(bool status) {
+    use_cache = status;
+    return *this;
+  }
+
+  RGWRados& set_use_gc(bool status) {
+    use_gc = status;
+    return *this;
+  }
+
+  RGWRados& set_use_datacache(bool status) {
+    use_datacache = status;
+    return *this;
+  }
+
+  bool get_use_datacache() {
+    return use_datacache;
+  }
+
+  RGWLC *get_lc() {
+    return lc;
+  }
+
+  RGWGC *get_gc() {
+    return gc;
+  }
+
+  RGWRados& set_run_gc_thread(bool _use_gc_thread) {
+    use_gc_thread = _use_gc_thread;
+    return *this;
+  }
+
+  RGWRados& set_run_lc_thread(bool _use_lc_thread) {
+    use_lc_thread = _use_lc_thread;
+    return *this;
+  }
+
+  RGWRados& set_run_quota_threads(bool _run_quota_threads) {
+    quota_threads = _run_quota_threads;
+    return *this;
+  }
+
+  RGWRados& set_run_sync_thread(bool _run_sync_thread) {
+    run_sync_thread = _run_sync_thread;
+    return *this;
+  }
+
+  RGWRados& set_run_reshard_thread(bool _run_reshard_thread) {
+    run_reshard_thread = _run_reshard_thread;
+    return *this;
+  }
+
+  librados::IoCtx* get_lc_pool_ctx() {
+    return &lc_pool_ctx;
+  }
+
+  librados::IoCtx& get_notif_pool_ctx() {
+    return notif_pool_ctx;
+  }
+
+  void set_context(CephContext *_cct) {
+    cct = _cct;
+  }
+  void set_store(rgw::sal::RadosStore* _driver) {
+    driver = _driver;
+  }
+
+  RGWServices svc;
+  RGWCtl ctl;
+
+  RGWCtl *pctl{nullptr};
+
+  /**
+   * AmazonS3 errors contain a HostId string, but is an opaque base64 blob; we
+   * try to be more transparent. This has a wrapper so we can update it when zonegroup/zone are changed.
+   */
+  std::string host_id;
+
+  RGWReshard *reshard;
+  std::shared_ptr<RGWReshardWait> reshard_wait;
+
+  virtual ~RGWRados() = default;
+
+  tombstone_cache_t *get_tombstone_cache() {
+    return obj_tombstone_cache;
+  }
+  const RGWSyncModuleInstanceRef& get_sync_module() {
+    return sync_module;
+  }
+  RGWSyncTraceManager *get_sync_tracer() {
+    return sync_tracer;
+  }
+
+  int get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment);
+  void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size);
+  int get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
+  int get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
+
+  uint32_t get_max_bucket_shards() {
+    return RGWSI_BucketIndex_RADOS::shards_max();
+  }
+
+
+  int get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
+
+  int list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& marker, RGWListRawObjsCtx *ctx);
+  int list_raw_objects_next(const DoutPrefixProvider *dpp, const std::string& prefix_filter, int max,
+                            RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
+                            bool *is_truncated);
+  int list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& prefix_filter, int max,
+                       RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
+                       bool *is_truncated);
+  std::string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx);
+
+  CephContext *ctx() { return cct; }
+  /** do all necessary setup of the storage device */
+  int init_begin(CephContext *_cct, const DoutPrefixProvider *dpp) {
+    set_context(_cct);
+    return init_begin(dpp);
+  }
+  /** Initialize the RADOS instance and prepare to do other ops */
+  int init_svc(bool raw, const DoutPrefixProvider *dpp);
+  int init_ctl(const DoutPrefixProvider *dpp);
+  virtual int init_rados();
+  int init_begin(const DoutPrefixProvider *dpp);
+  int init_complete(const DoutPrefixProvider *dpp);
+  void finalize();
+
+  int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, const std::map<std::string, std::string>& meta);
+  int update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status);
+
+  /// list logs
+  int log_list_init(const DoutPrefixProvider *dpp, const std::string& prefix, RGWAccessHandle *handle);
+  int log_list_next(RGWAccessHandle handle, std::string *name);
+
+  /// remove log
+  int log_remove(const DoutPrefixProvider *dpp, const std::string& name);
+
+  /// show log
+  int log_show_init(const DoutPrefixProvider *dpp, const std::string& name, RGWAccessHandle *handle);
+  int log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry);
+
+  // log bandwidth info
+  int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info);
+  int read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
+                 uint32_t max_entries, bool *is_truncated, RGWUsageIter& read_iter, std::map<rgw_user_bucket,
+                rgw_usage_log_entry>& usage);
+  int trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch);
+  int clear_usage(const DoutPrefixProvider *dpp);
+
+  int create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool);
+
+  void create_bucket_id(std::string *bucket_id);
+
+  bool get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool);
+  bool obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj);
+
+  int create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
+                   const std::string& zonegroup_id,
+                   const rgw_placement_rule& placement_rule,
+                   const std::string& swift_ver_location,
+                   const RGWQuotaInfo * pquota_info,
+                   std::map<std::string,bufferlist>& attrs,
+                   RGWBucketInfo& bucket_info,
+                   obj_version *pobjv,
+                   obj_version *pep_objv,
+                   ceph::real_time creation_time,
+                   rgw_bucket *master_bucket,
+                   uint32_t *master_num_shards,
+                   optional_yield y,
+                    const DoutPrefixProvider *dpp,
+                   bool exclusive = true);
+
+  RGWCoroutinesManagerRegistry *get_cr_registry() { return cr_registry; }
+
+  struct BucketShard {
+    RGWRados *store;
+    rgw_bucket bucket;
+    int shard_id;
+    RGWSI_RADOS::Obj bucket_obj;
+
+    explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {}
+    int init(const rgw_bucket& _bucket, const rgw_obj& obj,
+             RGWBucketInfo* out, const DoutPrefixProvider *dpp);
+    int init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj);
+    int init(const DoutPrefixProvider *dpp,
+            const RGWBucketInfo& bucket_info,
+            const rgw::bucket_index_layout_generation& index, int sid);
+
+    friend std::ostream& operator<<(std::ostream& out, const BucketShard& bs) {
+      out << "BucketShard:{ bucket=" << bs.bucket <<
+       ", shard_id=" << bs.shard_id <<
+       ", bucket_ojb=" << bs.bucket_obj << "}";
+      return out;
+    }
+  };
+
+  // Low-level accessor for a single RADOS-backed object within a bucket.
+  // Wraps the store/bucket/object handles plus cached state, and nests the
+  // Read / Write / Delete / Stat operation helpers below.
+  class Object {
+    RGWRados *store;
+    rgw::sal::Bucket* bucket;
+    RGWObjectCtx& ctx;
+    rgw::sal::Object* obj;
+
+    BucketShard bs;                 // lazily initialized, see get_bucket_shard()
+
+    RGWObjState *state;
+    RGWObjManifest *manifest;
+
+    bool versioning_disabled;       // per-operation override, see versioning_enabled()
+
+    bool bs_initialized;
+
+    const rgw_placement_rule *pmeta_placement_rule;
+
+  protected:
+    int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent = false);
+    void invalidate_state();
+
+    int prepare_atomic_modification(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation& op, bool reset_obj, const std::string *ptag,
+                                    const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail, optional_yield y);
+    int complete_atomic_modification(const DoutPrefixProvider *dpp);
+
+  public:
+    Object(RGWRados *_store, rgw::sal::Bucket* _bucket, RGWObjectCtx& _ctx, rgw::sal::Object* _obj) : store(_store), bucket(_bucket),
+                                                                                               ctx(_ctx), obj(_obj), bs(store),
+                                                                                               state(NULL), manifest(nullptr), versioning_disabled(false),
+                                                                                               bs_initialized(false),
+                                                                                               pmeta_placement_rule(nullptr) {}
+
+    RGWRados *get_store() { return store; }
+    rgw_obj get_obj() { return obj->get_obj(); }
+    RGWObjectCtx& get_ctx() { return ctx; }
+    RGWBucketInfo& get_bucket_info() { return bucket->get_info(); }
+    const std::string& get_instance() { return obj->get_instance(); }
+    rgw::sal::Object* get_target() { return obj; }
+    int get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y);
+
+    // Lazily initialize and return the bucket index shard for this object.
+    int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
+      if (!bs_initialized) {
+        int r =
+         bs.init(bucket->get_key(), obj->get_obj(), nullptr /* no RGWBucketInfo */, dpp);
+        if (r < 0) {
+          return r;
+        }
+        bs_initialized = true;
+      }
+      *pbs = &bs;
+      return 0;
+    }
+
+    void set_versioning_disabled(bool status) {
+      versioning_disabled = status;
+    }
+
+    // Versioning counts as enabled only when not explicitly disabled here
+    // AND the bucket itself has versioning enabled.
+    bool versioning_enabled() {
+      return (!versioning_disabled && bucket->versioning_enabled());
+    }
+
+    void set_meta_placement_rule(const rgw_placement_rule *p) {
+        pmeta_placement_rule = p;
+    }
+
+    // Explicit meta placement rule if one was set, else the bucket's rule.
+    const rgw_placement_rule& get_meta_placement_rule() {
+        return pmeta_placement_rule ? *pmeta_placement_rule : bucket->get_placement_rule();
+    }
+
+    // Read operation: prepare() resolves object state and conditionals,
+    // then read()/iterate() fetch the data.
+    struct Read {
+      RGWRados::Object *source;
+
+      struct GetObjState {
+        std::map<rgw_pool, librados::IoCtx> io_ctxs;
+        rgw_pool cur_pool;
+        librados::IoCtx *cur_ioctx{nullptr};
+        rgw_obj obj;
+        rgw_raw_obj head_obj;
+      } state;
+
+      // Conditional-read inputs (If-(None-)Match, time predicates).
+      struct ConditionParams {
+        const ceph::real_time *mod_ptr;
+        const ceph::real_time *unmod_ptr;
+        bool high_precision_time;
+        uint32_t mod_zone_id;
+        uint64_t mod_pg_ver;
+        const char *if_match;
+        const char *if_nomatch;
+
+        ConditionParams() :
+                 mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0),
+                 if_match(NULL), if_nomatch(NULL) {}
+      } conds;
+
+      // Output slots filled for the caller (any may be null).
+      struct Params {
+        ceph::real_time *lastmod;
+        uint64_t *obj_size;
+        std::map<std::string, bufferlist> *attrs;
+        rgw_obj *target_obj;
+
+        Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr),
+                target_obj(nullptr) {}
+      } params;
+
+      explicit Read(RGWRados::Object *_source) : source(_source) {}
+
+      int prepare(optional_yield y, const DoutPrefixProvider *dpp);
+      static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end);
+      int read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp);
+      int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, optional_yield y);
+      int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y);
+    };
+
+    // Write operation: carries the metadata for a head-object write.
+    struct Write {
+      RGWRados::Object *target;
+
+      struct MetaParams {
+        ceph::real_time *mtime;
+        std::map<std::string, bufferlist>* rmattrs;
+        const bufferlist *data;
+        RGWObjManifest *manifest;
+        const std::string *ptag;
+        std::list<rgw_obj_index_key> *remove_objs;
+        ceph::real_time set_mtime;
+        rgw_user owner;
+        RGWObjCategory category;
+        int flags;
+        const char *if_match;
+        const char *if_nomatch;
+        std::optional<uint64_t> olh_epoch;
+        ceph::real_time delete_at;
+        bool canceled;
+        const std::string *user_data;
+        rgw_zone_set *zones_trace;
+        bool modify_tail;
+        bool completeMultipart;
+        bool appendable;
+
+        MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
+                 remove_objs(NULL), category(RGWObjCategory::Main), flags(0),
+                 if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr),
+                 modify_tail(false),  completeMultipart(false), appendable(false) {}
+      } meta;
+
+      explicit Write(RGWRados::Object *_target) : target(_target) {}
+
+      int _do_write_meta(const DoutPrefixProvider *dpp,
+                     uint64_t size, uint64_t accounted_size,
+                     std::map<std::string, bufferlist>& attrs,
+                     bool modify_tail, bool assume_noent,
+                     void *index_op, optional_yield y);
+      int write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
+                     std::map<std::string, bufferlist>& attrs, optional_yield y);
+      int write_data(const char *data, uint64_t ofs, uint64_t len, bool exclusive);
+      const req_state* get_req_state() {
+        return nullptr;  /* XXX dang Only used by LTTng, and it handles null anyway */
+      }
+    };
+
+    // Delete operation: versioning-aware object removal.
+    struct Delete {
+      RGWRados::Object *target;
+
+      struct DeleteParams {
+        rgw_user bucket_owner;
+        int versioning_status; // versioning flags defined in enum RGWBucketFlags
+        ACLOwner obj_owner;    // needed for creation of deletion marker
+        uint64_t olh_epoch;
+        std::string marker_version_id;
+        uint32_t bilog_flags;
+        std::list<rgw_obj_index_key> *remove_objs;
+        ceph::real_time expiration_time;
+        ceph::real_time unmod_since;
+        ceph::real_time mtime; /* for setting delete marker mtime */
+        bool high_precision_time;
+        rgw_zone_set *zones_trace;
+       bool abortmp;
+       uint64_t parts_accounted_size;
+
+        DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {}
+      } params;
+
+      struct DeleteResult {
+        bool delete_marker;
+        std::string version_id;
+
+        DeleteResult() : delete_marker(false) {}
+      } result;
+
+      explicit Delete(RGWRados::Object *_target) : target(_target) {}
+
+      int delete_obj(optional_yield y, const DoutPrefixProvider *dpp);
+    };
+
+    // Stat operation: asynchronous head-object stat; stat_async() starts
+    // it, wait() collects the result.
+    struct Stat {
+      RGWRados::Object *source;
+
+      struct Result {
+        rgw_obj obj;
+       std::optional<RGWObjManifest> manifest;
+        uint64_t size{0};
+       struct timespec mtime {};
+        std::map<std::string, bufferlist> attrs;
+      } result;
+
+      struct State {
+        librados::IoCtx io_ctx;
+        librados::AioCompletion *completion;
+        int ret;
+
+        State() : completion(NULL), ret(0) {}
+      } state;
+
+
+      explicit Stat(RGWRados::Object *_source) : source(_source) {}
+
+      int stat_async(const DoutPrefixProvider *dpp);
+      int wait(const DoutPrefixProvider *dpp);
+      int stat();
+    private:
+      int finish(const DoutPrefixProvider *dpp);
+    };
+  };
+
+  // Wrapper binding an RGWRados store to one bucket (and optionally one
+  // shard). Note `bucket` is a reference into this object's own
+  // bucket_info member.
+  class Bucket {
+    RGWRados *store;
+    RGWBucketInfo bucket_info;
+    rgw_bucket& bucket;
+    int shard_id;  // RGW_NO_SHARD until set_shard_id()
+
+  public:
+    Bucket(RGWRados *_store, const RGWBucketInfo& _bucket_info) : store(_store), bucket_info(_bucket_info), bucket(bucket_info.bucket),
+                                                            shard_id(RGW_NO_SHARD) {}
+    RGWRados *get_store() { return store; }
+    rgw_bucket& get_bucket() { return bucket; }
+    RGWBucketInfo& get_bucket_info() { return bucket_info; }
+
+    int update_bucket_id(const std::string& new_bucket_id, const DoutPrefixProvider *dpp);
+
+    int get_shard_id() { return shard_id; }
+    void set_shard_id(int id) {
+      shard_id = id;
+    }
+
+    // Keeps the bucket index in sync across a head-object mutation:
+    // prepare() before the write, complete()/complete_del() on success,
+    // cancel() on failure. `blind` is set for indexless bucket layouts
+    // (presumably making index updates no-ops -- confirm in rgw_rados.cc).
+    class UpdateIndex {
+      RGWRados::Bucket *target;
+      std::string optag;
+      rgw_obj obj;
+      uint16_t bilog_flags{0};
+      BucketShard bs;
+      bool bs_initialized{false};
+      bool blind;      // true when the bucket's index layout is Indexless
+      bool prepared{false};
+      rgw_zone_set *zones_trace{nullptr};
+
+      // Bind `bs` to this bucket/object's index shard.
+      int init_bs(const DoutPrefixProvider *dpp) {
+        int r =
+         bs.init(target->get_bucket(), obj, &target->bucket_info, dpp);
+        if (r < 0) {
+          return r;
+        }
+        bs_initialized = true;
+        return 0;
+      }
+
+      void invalidate_bs() {
+        bs_initialized = false;
+      }
+
+      int guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call);
+    public:
+
+      UpdateIndex(RGWRados::Bucket *_target, const rgw_obj& _obj) : target(_target), obj(_obj),
+                                                              bs(target->get_store()) {
+                                                                blind = (target->get_bucket_info().layout.current_index.layout.type == rgw::BucketIndexType::Indexless);
+                                                              }
+
+      // Lazily initialize and return the index shard.
+      int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
+        if (!bs_initialized) {
+          int r = init_bs(dpp);
+          if (r < 0) {
+            return r;
+          }
+        }
+        *pbs = &bs;
+        return 0;
+      }
+
+      void set_bilog_flags(uint16_t flags) {
+        bilog_flags = flags;
+      }
+
+      void set_zones_trace(rgw_zone_set *_zones_trace) {
+        zones_trace = _zones_trace;
+      }
+
+      int prepare(const DoutPrefixProvider *dpp, RGWModifyOp, const std::string *write_tag, optional_yield y);
+      int complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, uint64_t size,
+                   uint64_t accounted_size, ceph::real_time& ut,
+                   const std::string& etag, const std::string& content_type,
+                   const std::string& storage_class,
+                   bufferlist *acl_bl, RGWObjCategory category,
+                  std::list<rgw_obj_index_key> *remove_objs, const std::string *user_data = nullptr, bool appendable = false);
+      int complete_del(const DoutPrefixProvider *dpp,
+                       int64_t poolid, uint64_t epoch,
+                       ceph::real_time& removed_mtime, /* mtime of removed object */
+                       std::list<rgw_obj_index_key> *remove_objs);
+      int cancel(const DoutPrefixProvider *dpp,
+                 std::list<rgw_obj_index_key> *remove_objs);
+
+      const std::string *get_optag() { return &optag; }
+
+      bool is_prepared() { return prepared; }
+    }; // class UpdateIndex
+
+    // Bucket listing helper: fills Params, then call list_objects(), which
+    // dispatches to the ordered or unordered implementation based on
+    // params.allow_unordered. next_marker records where to resume.
+    class List {
+    protected:
+      // absolute maximum number of objects that
+      // list_objects_(un)ordered can return
+      static constexpr int64_t bucket_list_objects_absolute_max = 25000;
+
+      RGWRados::Bucket *target;
+      rgw_obj_key next_marker;
+
+      int list_objects_ordered(const DoutPrefixProvider *dpp,
+                               int64_t max,
+                              std::vector<rgw_bucket_dir_entry> *result,
+                              std::map<std::string, bool> *common_prefixes,
+                              bool *is_truncated,
+                               optional_yield y);
+      int list_objects_unordered(const DoutPrefixProvider *dpp,
+                                 int64_t max,
+                                std::vector<rgw_bucket_dir_entry> *result,
+                                std::map<std::string, bool> *common_prefixes,
+                                bool *is_truncated,
+                                 optional_yield y);
+
+    public:
+
+      // Listing knobs (S3/Swift-style prefix/delimiter/marker semantics;
+      // exact filter behavior lives in the .cc implementations).
+      struct Params {
+        std::string prefix;
+        std::string delim;
+        rgw_obj_key marker;
+        rgw_obj_key end_marker;
+        std::string ns;
+        bool enforce_ns;
+        RGWAccessListFilter* access_list_filter;
+       RGWBucketListNameFilter force_check_filter;
+        bool list_versions;
+       bool allow_unordered;
+
+        Params() :
+         enforce_ns(true),
+         access_list_filter(nullptr),
+         list_versions(false),
+         allow_unordered(false)
+       {}
+      } params;
+
+      explicit List(RGWRados::Bucket *_target) : target(_target) {}
+
+      // Dispatch to the unordered or ordered listing path per params.
+      int list_objects(const DoutPrefixProvider *dpp, int64_t max,
+                      std::vector<rgw_bucket_dir_entry> *result,
+                      std::map<std::string, bool> *common_prefixes,
+                      bool *is_truncated,
+                       optional_yield y) {
+       if (params.allow_unordered) {
+         return list_objects_unordered(dpp, max, result, common_prefixes,
+                                       is_truncated, y);
+       } else {
+         return list_objects_ordered(dpp, max, result, common_prefixes,
+                                     is_truncated, y);
+       }
+      }
+      // Marker to resume a truncated listing from.
+      rgw_obj_key& get_next_marker() {
+        return next_marker;
+      }
+    }; // class List
+  }; // class Bucket
+
+  // Invoke `handler` on the last entry of a (prefix/delimiter-scoped)
+  // bucket listing (declaration; body in the .cc).
+  int on_last_entry_in_listing(const DoutPrefixProvider *dpp,
+                               RGWBucketInfo& bucket_info,
+                               const std::string& obj_prefix,
+                               const std::string& obj_delim,
+                               std::function<int(const rgw_bucket_dir_entry&)> handler);
+
+  // Swift object-versioning helpers (X-Versions-Location semantics).
+  bool swift_versioning_enabled(rgw::sal::Bucket* bucket) const;
+
+  int swift_versioning_copy(RGWObjectCtx& obj_ctx,              /* in/out */
+                            const rgw_user& user,               /* in */
+                            rgw::sal::Bucket* bucket,        /* in */
+                            rgw::sal::Object* obj,           /* in */
+                            const DoutPrefixProvider *dpp,      /* in/out */ 
+                            optional_yield y);                  /* in */                
+  int swift_versioning_restore(RGWObjectCtx& obj_ctx,           /* in/out */
+                               const rgw_user& user,            /* in */
+                               rgw::sal::Bucket* bucket,     /* in */
+                               rgw::sal::Object* obj,        /* in */
+                               bool& restored,                 /* out */
+                               const DoutPrefixProvider *dpp);     /* in/out */                
+  int copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
+                              RGWObjState *astate,
+                              std::map<std::string, bufferlist>& src_attrs,
+                              RGWRados::Object::Read& read_op,
+                              const rgw_user& user_id,
+                              rgw::sal::Object* dest_obj,
+                              ceph::real_time *mtime);
+
+  // Attribute-merge policy for copy/fetch; see copy_obj() comment below
+  // for the meaning of each value.
+  enum AttrsMod {
+    ATTRSMOD_NONE    = 0,
+    ATTRSMOD_REPLACE = 1,
+    ATTRSMOD_MERGE   = 2
+  };
+
+  // D3N read cache; nullptr when the datacache is not configured.
+  D3nDataCache* d3n_data_cache{nullptr};
+
+  int rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y);
+
+  // HEAD an object that lives in a remote zone, returning its size, mtime,
+  // attrs and headers through the out-pointers; honors conditional
+  // if_match/if_nomatch and mod/unmod time checks (declaration only).
+  int stat_remote_obj(const DoutPrefixProvider *dpp,
+               RGWObjectCtx& obj_ctx,
+               const rgw_user& user_id,
+               req_info *info,
+               const rgw_zone_id& source_zone,
+               rgw::sal::Object* src_obj,
+               const RGWBucketInfo *src_bucket_info,
+               real_time *src_mtime,
+               uint64_t *psize,
+               const real_time *mod_ptr,
+               const real_time *unmod_ptr,
+               bool high_precision_time,
+               const char *if_match,
+               const char *if_nomatch,
+               std::map<std::string, bufferlist> *pattrs,
+               std::map<std::string, std::string> *pheaders,
+               std::string *version_id,
+               std::string *ptag,
+               std::string *petag);
+
+  int fetch_remote_obj(RGWObjectCtx& obj_ctx,
+                       const rgw_user& user_id,
+                       req_info *info,
+                       const rgw_zone_id& source_zone,
+                       rgw::sal::Object* dest_obj,
+                       rgw::sal::Object* src_obj,
+                      rgw::sal::Bucket* dest_bucket,
+                      rgw::sal::Bucket* src_bucket,
+                      std::optional<rgw_placement_rule> dest_placement,
+                       ceph::real_time *src_mtime,
+                       ceph::real_time *mtime,
+                       const ceph::real_time *mod_ptr,
+                       const ceph::real_time *unmod_ptr,
+                       bool high_precision_time,
+                       const char *if_match,
+                       const char *if_nomatch,
+                       AttrsMod attrs_mod,
+                       bool copy_if_newer,
+                       rgw::sal::Attrs& attrs,
+                       RGWObjCategory category,
+                       std::optional<uint64_t> olh_epoch,
+                      ceph::real_time delete_at,
+                       std::string *ptag,
+                       std::string *petag,
+                       void (*progress_cb)(off_t, void *),
+                       void *progress_data,
+                       const DoutPrefixProvider *dpp,
+                       RGWFetchObjFilter *filter,
+                       rgw_zone_set *zones_trace= nullptr,
+                       std::optional<uint64_t>* bytes_transferred = 0);
+  /**
+   * Copy an object.
+   * dest_obj: the object to copy into
+   * src_obj: the object to copy from
+   * attrs: usage depends on attrs_mod parameter
+   * attrs_mod: the modification mode of the attrs, may have the following values:
+   *            ATTRSMOD_NONE - the attributes of the source object will be
+   *                            copied without modifications, attrs parameter is ignored;
+   *            ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
+   *                               parameter, source object attributes are not copied;
+   *            ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
+   *                             are overwritten by values contained in attrs parameter.
+   * progress_cb/progress_data: optional progress callback and its cookie.
+   * version_id/ptag/petag: optional out-parameters; may be null.
+   * Returns: 0 on success, -ERR# otherwise.
+   */
+  int copy_obj(RGWObjectCtx& obj_ctx,
+               const rgw_user& user_id,
+               req_info *info,
+               const rgw_zone_id& source_zone,
+               rgw::sal::Object* dest_obj,
+               rgw::sal::Object* src_obj,
+               rgw::sal::Bucket* dest_bucket,
+               rgw::sal::Bucket* src_bucket,
+               const rgw_placement_rule& dest_placement,
+               ceph::real_time *src_mtime,
+               ceph::real_time *mtime,
+               const ceph::real_time *mod_ptr,
+               const ceph::real_time *unmod_ptr,
+               bool high_precision_time,
+               const char *if_match,
+               const char *if_nomatch,
+               AttrsMod attrs_mod,
+               bool copy_if_newer,
+               std::map<std::string, bufferlist>& attrs,
+               RGWObjCategory category,
+               uint64_t olh_epoch,
+              ceph::real_time delete_at,
+               std::string *version_id,
+               std::string *ptag,
+               std::string *petag,
+               void (*progress_cb)(off_t, void *),
+               void *progress_data,
+               const DoutPrefixProvider *dpp,
+               optional_yield y);
+
+  // Copy object data (and attrs) through a prepared Read op into dest_obj,
+  // up to offset `end` (declaration; body in the .cc).
+  int copy_obj_data(RGWObjectCtx& obj_ctx,
+               rgw::sal::Bucket* bucket,
+               const rgw_placement_rule& dest_placement,
+              RGWRados::Object::Read& read_op, off_t end,
+               rgw::sal::Object* dest_obj,
+              ceph::real_time *mtime,
+              ceph::real_time set_mtime,
+               std::map<std::string, bufferlist>& attrs,
+               uint64_t olh_epoch,
+              ceph::real_time delete_at,
+               std::string *petag,
+               const DoutPrefixProvider *dpp,
+               optional_yield y);
+  
+  // Transition an object to another placement rule (e.g. lifecycle
+  // storage-class transition).
+  int transition_obj(RGWObjectCtx& obj_ctx,
+                     rgw::sal::Bucket* bucket,
+                     rgw::sal::Object& obj,
+                     const rgw_placement_rule& placement_rule,
+                     const real_time& mtime,
+                     uint64_t olh_epoch,
+                     const DoutPrefixProvider *dpp,
+                     optional_yield y);
+
+  int check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y);
+
+  /**
+   * Delete a bucket.
+   * bucket: the name of the bucket to delete
+   * check_empty: when true, refuse to delete a non-empty bucket.
+   * Returns 0 on success, -ERR# otherwise.
+   */
+  int delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty = true);
+
+  // Nudge metadata/data sync to re-examine the given shards.
+  void wakeup_meta_sync_shards(std::set<int>& shard_ids);
+
+  void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries);
+
+  RGWMetaSyncStatusManager* get_meta_sync_manager();
+  RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone);
+
+  int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp);
+  int set_buckets_enabled(std::vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp);
+  int bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended);
+
+  /** Delete an object.*/
+  int delete_obj(rgw::sal::Driver* driver,
+                const DoutPrefixProvider *dpp,
+                const RGWBucketInfo& bucket_owner,
+                const rgw_obj& src_obj,
+                int versioning_status,  // versioning flags defined in enum RGWBucketFlags
+                uint16_t bilog_flags = 0,
+                const ceph::real_time& expiration_time = ceph::real_time(),
+                rgw_zone_set *zones_trace = nullptr);
+  // Overload taking a sal::Object instead of a raw rgw_obj.
+  int delete_obj(const DoutPrefixProvider *dpp,
+                const RGWBucketInfo& bucket_owner,
+                rgw::sal::Object* src_obj,
+                int versioning_status,  // versioning flags defined in enum RGWBucketFlags
+                uint16_t bilog_flags = 0,
+                const ceph::real_time& expiration_time = ceph::real_time(),
+                rgw_zone_set *zones_trace = nullptr);
+
+  // Delete a raw rados object directly (no bucket-index bookkeeping here).
+  int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj);
+
+  /** Remove an object from the bucket index */
+  int delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp);
+
+  /**
+   * Set an attr on an object.
+   * bucket: name of the bucket holding the object
+   * obj: name of the object to set the attr on
+   * name: the attr to set
+   * bl: the contents of the attr
+   * Returns: 0 on success, -ERR# otherwise.
+   */
+  int set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl);
+
+  // Batch variant: set `attrs` and remove `rmattrs` (may be null) in one op.
+  int set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+                        std::map<std::string, bufferlist>& attrs,
+                        std::map<std::string, bufferlist>* rmattrs,
+                        optional_yield y);
+
+  // Load (or fetch cached) object state/manifest; the convenience overload
+  // below always follows OLH indirection (follow_olh == true).
+  int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
+                    bool follow_olh, optional_yield y, bool assume_noent = false);
+  int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, optional_yield y) {
+    return get_obj_state(dpp, rctx, bucket_info, obj, state, manifest, true, y);
+  }
+
+  // Callback signature used by iterate_obj(): invoked per raw-object chunk
+  // with (dpp, raw obj, obj offset, read offset, len, is_head_obj, state, arg).
+  using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const rgw_raw_obj&, off_t, off_t,
+                                 off_t, bool, RGWObjState*, void*);
+
+  // Walk the raw objects backing [ofs, end] of `obj` in chunks of at most
+  // max_chunk_size, calling cb(arg) for each.
+  int iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& ctx, RGWBucketInfo& bucket_info,
+                  rgw::sal::Object* obj, off_t ofs, off_t end,
+                  uint64_t max_chunk_size, iterate_obj_cb cb, void *arg,
+                  optional_yield y);
+
+  int append_atomic_test(const DoutPrefixProvider *dpp, const RGWObjState* astate, librados::ObjectOperation& op);
+
+  virtual int get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+                         const rgw_raw_obj& read_obj, off_t obj_ofs,
+                         off_t read_ofs, off_t len, bool is_head_obj,
+                         RGWObjState *astate, void *arg);
+
+  /**
+   * a simple object read without keeping state
+   */
+
+  int raw_obj_stat(const DoutPrefixProvider *dpp,
+                   rgw_raw_obj& obj, uint64_t *psize, ceph::real_time *pmtime, uint64_t *epoch,
+                   std::map<std::string, bufferlist> *attrs, bufferlist *first_chunk,
+                   RGWObjVersionTracker *objv_tracker, optional_yield y);
+
+  // Apply a raw write/read operation to the head object.
+  int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op);
+  int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op);
+
+  // Run `call` against a bucket shard, retrying/blocking appropriately when
+  // the bucket is being resharded (bodies in the .cc).
+  int guard_reshard(const DoutPrefixProvider *dpp,
+                    BucketShard *bs,
+                   const rgw_obj& obj_instance,
+                   RGWBucketInfo& bucket_info,
+                   std::function<int(BucketShard *)> call);
+  int block_while_resharding(RGWRados::BucketShard *bs,
+                             const rgw_obj& obj_instance,
+                            RGWBucketInfo& bucket_info,
+                             optional_yield y,
+                             const DoutPrefixProvider *dpp);
+
+  // --- OLH (object logical head / versioning) machinery -------------------
+  // These maintain the OLH object, its log, and the bucket-index entries
+  // that implement versioned buckets. Declarations only; see rgw_rados.cc.
+  void bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, librados::ObjectOperation& op);
+  int olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
+  int olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
+  int bucket_index_link_olh(const DoutPrefixProvider *dpp,
+                            RGWBucketInfo& bucket_info, RGWObjState& olh_state,
+                            const rgw_obj& obj_instance, bool delete_marker,
+                            const std::string& op_tag, struct rgw_bucket_dir_entry_meta *meta,
+                            uint64_t olh_epoch,
+                            ceph::real_time unmod_since, bool high_precision_time,
+                            rgw_zone_set *zones_trace = nullptr,
+                            bool log_data_change = false);
+  int bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
+                                   RGWBucketInfo& bucket_info,
+                                   const rgw_obj& obj_instance,
+                                   const std::string& op_tag, const std::string& olh_tag,
+                                   uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr);
+  int bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
+                                RGWBucketInfo& bucket_info, RGWObjState& state,
+                                const rgw_obj& obj_instance, uint64_t ver_marker,
+                                std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log, bool *is_truncated);
+  int bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& obj_state, const rgw_obj& obj_instance, uint64_t ver);
+  int bucket_index_clear_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance);
+  int apply_olh_log(const DoutPrefixProvider *dpp, RGWObjState& obj_state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj,
+                    bufferlist& obj_tag, std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
+                    uint64_t *plast_ver, rgw_zone_set *zones_trace = nullptr);
+  int update_olh(const DoutPrefixProvider *dpp, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, rgw_zone_set *zones_trace = nullptr);
+  int set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
+              uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time,
+              optional_yield y, rgw_zone_set *zones_trace = nullptr, bool log_data_change = false);
+  int repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
+                 const rgw_obj& obj);
+  int unlink_obj_instance(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj,
+                          uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace = nullptr);
+
+  void check_pending_olh_entries(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist>& pending_entries, std::map<std::string, bufferlist> *rm_pending_entries);
+  int remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::map<std::string, bufferlist>& pending_attrs);
+  int follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target);
+  int get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh);
+
+  // Generate a random instance (version) id for a new object version.
+  void gen_rand_obj_instance_name(rgw_obj_key *target_key);
+  void gen_rand_obj_instance_name(rgw_obj *target);
+
+  int update_containers_stats(std::map<std::string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp);
+  int append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl);
+
+public:
+  // Flag the object in the (type-erased) RGWObjectCtx as requiring atomic
+  // read-modify-write handling.
+  void set_atomic(void *ctx, rgw_obj& obj) {
+    RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
+    rctx->set_atomic(obj);
+  }
+  // Hint that the object's first chunk should be prefetched on stat/read.
+  void set_prefetch_data(void *ctx, const rgw_obj& obj) {
+    RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
+    rctx->set_prefetch_data(obj);
+  }
+  // Mark the object as compressed in the context.
+  void set_compressed(void *ctx, const rgw_obj& obj) {
+    RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
+    rctx->set_compressed(obj);
+  }
+  int decode_policy(const DoutPrefixProvider *dpp, bufferlist& bl, ACLOwner *owner);
+  int get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, std::string *bucket_ver, std::string *master_ver,
+      std::map<RGWObjCategory, RGWStorageStats>& stats, std::string *max_marker, bool* syncstopped = NULL);
+  int get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *cb);
+
+  int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, std::map<std::string, bufferlist> *pattrs, const DoutPrefixProvider *dpp);
+  /* xxx dang obj_ctx -> svc */
+  int get_bucket_instance_info(const std::string& meta_key, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
+  int get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
+
+  static void make_bucket_entry_name(const std::string& tenant_name, const std::string& bucket_name, std::string& bucket_entry);
+
+  int get_bucket_info(RGWServices *svc,
+                     const std::string& tenant_name, const std::string& bucket_name,
+                     RGWBucketInfo& info,
+                     ceph::real_time *pmtime, optional_yield y,
+                      const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *pattrs = NULL);
+
+  // Returns 0 on successful refresh. Returns error code if there was
+  // an error or the version stored on the OSD is the same as that
+  // presented in the BucketInfo structure.
+  //
+  int try_refresh_bucket_info(RGWBucketInfo& info,
+                             ceph::real_time *pmtime,
+                              const DoutPrefixProvider *dpp,
+                             std::map<std::string, bufferlist> *pattrs = nullptr);
+
+  int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv,
+                            std::map<std::string, bufferlist> *pattrs, bool create_entry_point,
+                             const DoutPrefixProvider *dpp);
+
+  int cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, std::string& tag, rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *zones_trace = nullptr);
+  int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, std::string& tag, int64_t pool, uint64_t epoch,
+                          rgw_bucket_dir_entry& ent, RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+  int cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, std::string& tag, int64_t pool, uint64_t epoch, rgw_bucket_dir_entry& ent,
+                           RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+  int cls_obj_complete_del(BucketShard& bs, std::string& tag, int64_t pool, uint64_t epoch, rgw_obj& obj,
+                           ceph::real_time& removed_mtime, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+  int cls_obj_complete_cancel(BucketShard& bs, std::string& tag, rgw_obj& obj,
+                              std::list<rgw_obj_index_key> *remove_objs,
+                              uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+  int cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout);
+
+  using ent_map_t =
+    boost::container::flat_map<std::string, rgw_bucket_dir_entry>;
+
+  int cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
+                              RGWBucketInfo& bucket_info,
+                              const rgw::bucket_index_layout_generation& idx_layout,
+                              const int shard_id,
+                             const rgw_obj_index_key& start_after,
+                             const std::string& prefix,
+                             const std::string& delimiter,
+                             const uint32_t num_entries,
+                             const bool list_versions,
+                             const uint16_t exp_factor, // 0 means ignore
+                             ent_map_t& m,
+                             bool* is_truncated,
+                             bool* cls_filtered,
+                             rgw_obj_index_key *last_entry,
+                              optional_yield y,
+                             RGWBucketListNameFilter force_check_filter = {});
+  int cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
+                                RGWBucketInfo& bucket_info,
+                                const rgw::bucket_index_layout_generation& idx_layout,
+                                int shard_id,
+                               const rgw_obj_index_key& start_after,
+                               const std::string& prefix,
+                               uint32_t num_entries,
+                               bool list_versions,
+                               std::vector<rgw_bucket_dir_entry>& ent_list,
+                               bool *is_truncated,
+                               rgw_obj_index_key *last_entry,
+                                optional_yield y,
+                               RGWBucketListNameFilter force_check_filter = {});
+  int cls_bucket_head(const DoutPrefixProvider *dpp,
+                     const RGWBucketInfo& bucket_info,
+                     const rgw::bucket_index_layout_generation& idx_layout,
+                     int shard_id, std::vector<rgw_bucket_dir_header>& headers,
+                     std::map<int, std::string> *bucket_instance_ids = NULL);
+  int cls_bucket_head_async(const DoutPrefixProvider *dpp,
+                           const RGWBucketInfo& bucket_info,
+                           const rgw::bucket_index_layout_generation& idx_layout,
+                           int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio);
+  // Bucket-index entry (bi_*) getters/putters/listing, usage-log ops, and
+  // GC / lifecycle processing entry points. Declarations only.
+  int bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_dir_entry *dirent);
+  int bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_olh_entry *olh);
+  int bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry);
+  void bi_put(librados::ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry);
+  int bi_put(BucketShard& bs, rgw_cls_bi_entry& entry);
+  int bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry);
+  int bi_list(const DoutPrefixProvider *dpp,
+             const RGWBucketInfo& bucket_info,
+             int shard_id,
+             const std::string& filter_obj,
+             const std::string& marker,
+             uint32_t max,
+             std::list<rgw_cls_bi_entry> *entries,
+             bool *is_truncated);
+  int bi_list(BucketShard& bs, const std::string& filter_obj, const std::string& marker, uint32_t max, std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
+  int bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, const std::string& obj_name, const std::string& marker, uint32_t max,
+              std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
+  int bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs);
+
+  // Usage-log add/read/trim/clear against the cls usage objects.
+  int cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const std::string& oid, rgw_usage_log_info& info);
+  int cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
+                             uint64_t end_epoch, uint32_t max_entries, std::string& read_iter,
+                            std::map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated);
+  int cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
+                             uint64_t end_epoch);
+  int cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, std::string& oid);
+
+  int get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const std::string& obj_key, int *shard_id);
+
+  // Cooperative advisory locking over a rados object.
+  int lock_exclusive(const rgw_pool& pool, const std::string& oid, ceph::timespan& duration, rgw_zone_id& zone_id, std::string& owner_id);
+  int unlock(const rgw_pool& pool, const std::string& oid, rgw_zone_id& zone_id, std::string& owner_id);
+
+  // Garbage-collection chain management and processing.
+  void update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain);
+  std::tuple<int, std::optional<cls_rgw_obj_chain>> send_chain_to_gc(cls_rgw_obj_chain& chain, const std::string& tag);
+  void delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const std::string& tag);
+  int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectWriteOperation *op);
+  int gc_aio_operate(const std::string& oid, librados::AioCompletion *c,
+                     librados::ObjectWriteOperation *op);
+  int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectReadOperation *op, bufferlist *pbl);
+
+  int list_gc_objs(int *index, std::string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue);
+  int process_gc(bool expired_only);
+  bool process_expire_objects(const DoutPrefixProvider *dpp);
+  int defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, optional_yield y);
+
+  // Lifecycle processing; optional_bucket narrows the run to one bucket.
+  int process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket);
+  int list_lc_progress(std::string& marker, uint32_t max_entries,
+                      std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
+                      int& index);
+
+  // Index consistency checking/rebuilding, reshard triggers, quota checks,
+  // and misc plumbing. Declarations only; bodies in rgw_rados.cc.
+  int bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+                         std::map<RGWObjCategory, RGWStorageStats> *existing_stats,
+                         std::map<RGWObjCategory, RGWStorageStats> *calculated_stats);
+  int bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info);
+  int bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry);
+  int remove_objs_from_index(const DoutPrefixProvider *dpp,
+                            RGWBucketInfo& bucket_info,
+                            const std::list<rgw_obj_index_key>& oid_list);
+  // Copy a raw rados object between ioctxs/locators (used by locator fixes).
+  int move_rados_obj(const DoutPrefixProvider *dpp,
+                     librados::IoCtx& src_ioctx,
+                    const std::string& src_oid, const std::string& src_locator,
+                    librados::IoCtx& dst_ioctx,
+                    const std::string& dst_oid, const std::string& dst_locator);
+  int fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key);
+  int fix_tail_obj_locator(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+                           rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y);
+
+  // Enforce user/bucket quota for a pending write of obj_size bytes.
+  int check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
+                  RGWQuota& quota, uint64_t obj_size,
+                 optional_yield y, bool check_size_only = false);
+
+  int check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
+                         uint64_t num_objs, const DoutPrefixProvider *dpp);
+
+  int add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards);
+
+  uint64_t instance_id();
+
+  librados::Rados* get_rados_handle();
+
+  // Async raw/object deletion; completions are returned through `handles`.
+  int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, std::list<librados::AioCompletion *>& handles);
+  int delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, RGWBucketInfo& info, RGWObjState *astate,
+                     std::list<librados::AioCompletion *>& handles, bool keep_index_consistent,
+                     optional_yield y);
+
+ private:
+  /**
+   * Check the actual on-disk state of the object specified
+   * by list_state, and fill in the time and size of object.
+   * Then append any changes to suggested_updates for
+   * the rgw class' dir_suggest_changes function.
+   *
+   * Note that this can maul list_state; don't use it afterwards. Also
+   * it expects object to already be filled in from list_state; it only
+   * sets the size and mtime.
+   *
+   * Returns 0 on success, -ENOENT if the object doesn't exist on disk,
+   * and -errno on other failures. (-ENOENT is not a failure, and it
+   * will encode that info as a suggested update.)
+   */
+  int check_disk_state(const DoutPrefixProvider *dpp,
+                       librados::IoCtx io_ctx,
+                       RGWBucketInfo& bucket_info,
+                       rgw_bucket_dir_entry& list_state,
+                       rgw_bucket_dir_entry& object,
+                       bufferlist& suggested_updates,
+                       optional_yield y);
+
+  /**
+   * Init pool iteration
+   * pool: pool to use for the ctx initialization
+   * ctx: context object to use for the iteration
+   * Returns: 0 on success, -ERR# otherwise.
+   */
+  int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx);
+
+  /**
+   * Init pool iteration
+   * pool: pool to use
+   * cursor: position to start iteration
+   * ctx: context object to use for the iteration
+   * Returns: 0 on success, -ERR# otherwise.
+   */
+  int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& cursor, RGWPoolIterCtx& ctx);
+
+  /**
+   * Get pool iteration position
+   * ctx: context object to use for the iteration
+   * Returns: std::string representation of position
+   */
+  std::string pool_iterate_get_cursor(RGWPoolIterCtx& ctx);
+
+  /**
+   * Iterate over pool return object names, use optional filter
+   * ctx: iteration context, initialized with pool_iterate_begin()
+   * num: max number of objects to return
+   * objs: a vector that the results will append into
+   * is_truncated: if not NULL, will hold true iff iteration is complete
+   * filter: if not NULL, will be used to filter returned objects
+   * Returns: 0 on success, -ERR# otherwise.
+   */
+  int pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num,
+                  std::vector<rgw_bucket_dir_entry>& objs,
+                   bool *is_truncated, RGWAccessListFilter *filter);
+
+  uint64_t next_bucket_id();
+
+  /**
+   * This is broken out to facilitate unit testing.
+   */
+  static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+                                                    uint32_t num_shards);
+};
+
+
+// Per-request state for a streaming object GET. Asynchronous reads are
+// issued through 'aio'; finished ranges accumulate in 'completed'
+// (kept sorted by offset) and are handed to 'client_cb' strictly in
+// offset order by flush().
+struct get_obj_data {
+  RGWRados* rgwrados;
+  RGWGetDataCB* client_cb = nullptr;
+  rgw::Aio* aio;
+  uint64_t offset; // next offset to write to client
+  rgw::AioResultList completed; // completed read results, sorted by offset
+  optional_yield yield;
+
+  get_obj_data(RGWRados* rgwrados, RGWGetDataCB* cb, rgw::Aio* aio,
+               uint64_t offset, optional_yield yield)
+               : rgwrados(rgwrados), client_cb(cb), aio(aio), offset(offset), yield(yield) {}
+  ~get_obj_data() {
+    if (rgwrados->get_use_datacache()) {
+      // acquire and immediately drop the d3n lock so destruction blocks
+      // until whoever holds it is done. NOTE(review): presumably this
+      // serializes against in-flight D3N cache callbacks that still
+      // reference this object -- confirm against D3nGetObjData usage.
+      const std::lock_guard l(d3n_get_data.d3n_lock);
+    }
+  }
+
+  D3nGetObjData d3n_get_data;
+  std::atomic_bool d3n_bypass_cache_write{false};
+
+  // deliver the given completed read results to client_cb in offset
+  // order; returns 0 on success or a negative error code
+  int flush(rgw::AioResultList&& results);
+
+  void cancel() {
+    // wait for all completions to drain and ignore the results
+    aio->drain();
+  }
+
+  // wait for every outstanding read, flushing each completed batch to
+  // the client; on a flush error, cancel the remainder and return it
+  int drain() {
+    auto c = aio->wait();
+    while (!c.empty()) {
+      int r = flush(std::move(c));
+      if (r < 0) {
+        cancel();
+        return r;
+      }
+      c = aio->wait();
+    }
+    // one last flush with the drained (now empty) result list
+    return flush(std::move(c));
+  }
+};
diff --git a/src/rgw/driver/rados/rgw_reshard.cc b/src/rgw/driver/rados/rgw_reshard.cc
new file mode 100644 (file)
index 0000000..b2dec7a
--- /dev/null
@@ -0,0 +1,1407 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <limits>
+#include <sstream>
+
+#include "rgw_zone.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_reshard.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/errno.h"
+#include "common/ceph_json.h"
+
+#include "common/dout.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_tier_rados.h"
+#include "services/svc_bilog_rados.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// object-name prefix for the reshard queue ("log") objects in the pool
+const string reshard_oid_prefix = "reshard.";
+// cls_lock name used when processing the reshard queue
+const string reshard_lock_name = "reshard_process";
+// cls_lock name used to lock an individual bucket instance
+const string bucket_instance_lock_name = "bucket_instance_lock";
+
+/* All primes up to 2000 used to attempt to make dynamic sharding use
+ * a prime numbers of shards. Note: this list also includes 1 for when
+ * 1 shard is the most appropriate, even though 1 is not prime.
+ */
+const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
+  1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61,
+  67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
+  139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211,
+  223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283,
+  293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379,
+  383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461,
+  463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563,
+  569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643,
+  647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739,
+  743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829,
+  839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937,
+  941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021,
+  1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093,
+  1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181,
+  1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259,
+  1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
+  1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433,
+  1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493,
+  1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579,
+  1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
+  1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741,
+  1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811, 1823, 1831,
+  1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913,
+  1931, 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997, 1999
+};
+
+// Collects bucket-index entries destined for one target shard and
+// writes them out in batches (rgw_reshard_batch_size) via asynchronous
+// rados write operations. The completion deque is shared with the
+// other shards, so the total number of writes in flight across the
+// whole reshard is capped at rgw_reshard_max_aio.
+class BucketReshardShard {
+  rgw::sal::RadosStore* store;
+  const RGWBucketInfo& bucket_info;
+  int shard_id;
+  RGWRados::BucketShard bs;
+  vector<rgw_cls_bi_entry> entries;
+  map<RGWObjCategory, rgw_bucket_category_stats> stats;
+  deque<librados::AioCompletion *>& aio_completions;
+  uint64_t max_aio_completions;
+  uint64_t reshard_shard_batch_size;
+
+  // block on the oldest outstanding completion and return its result;
+  // the completion object is released in every case
+  int wait_next_completion() {
+    librados::AioCompletion *c = aio_completions.front();
+    aio_completions.pop_front();
+
+    c->wait_for_complete();
+
+    int ret = c->get_return_value();
+    c->release();
+
+    if (ret < 0) {
+      derr << "ERROR: reshard rados operation failed: " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    return 0;
+  }
+
+  // hand out a fresh completion, first reaping the oldest one if we
+  // are already at the max_aio_completions limit
+  int get_completion(librados::AioCompletion **c) {
+    if (aio_completions.size() >= max_aio_completions) {
+      int ret = wait_next_completion();
+      if (ret < 0) {
+        return ret;
+      }
+    }
+
+    *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+    aio_completions.push_back(*c);
+
+    return 0;
+  }
+
+public:
+  BucketReshardShard(const DoutPrefixProvider *dpp,
+                    rgw::sal::RadosStore *_store, const RGWBucketInfo& _bucket_info,
+                     const rgw::bucket_index_layout_generation& index,
+                     int shard_id, deque<librados::AioCompletion *>& _completions) :
+    store(_store), bucket_info(_bucket_info), shard_id(shard_id),
+    bs(store->getRados()), aio_completions(_completions)
+  {
+    // NOTE(review): bs.init() return value is ignored here -- verify
+    // that an init failure surfaces later through bi_put()/aio_operate()
+    bs.init(dpp, bucket_info, index, shard_id);
+
+    max_aio_completions =
+      store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_max_aio");
+    reshard_shard_batch_size =
+      store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_batch_size");
+  }
+
+  int get_shard_id() const {
+    return shard_id;
+  }
+
+  // queue one index entry for this shard; if 'account' is set, fold
+  // its stats into the per-category totals. Flushes automatically once
+  // the batch reaches reshard_shard_batch_size.
+  int add_entry(rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
+                const rgw_bucket_category_stats& entry_stats) {
+    entries.push_back(entry);
+    if (account) {
+      rgw_bucket_category_stats& target = stats[category];
+      target.num_entries += entry_stats.num_entries;
+      target.total_size += entry_stats.total_size;
+      target.total_size_rounded += entry_stats.total_size_rounded;
+      target.actual_size += entry_stats.actual_size;
+    }
+    if (entries.size() >= reshard_shard_batch_size) {
+      int ret = flush();
+      if (ret < 0) {
+        return ret;
+      }
+    }
+
+    return 0;
+  }
+
+  // write all queued entries plus the accumulated stats update to the
+  // target shard object as a single async write operation
+  int flush() {
+    if (entries.size() == 0) {
+      return 0;
+    }
+
+    librados::ObjectWriteOperation op;
+    for (auto& entry : entries) {
+      store->getRados()->bi_put(op, bs, entry);
+    }
+    cls_rgw_bucket_update_stats(op, false, stats);
+
+    librados::AioCompletion *c;
+    int ret = get_completion(&c);
+    if (ret < 0) {
+      return ret;
+    }
+    ret = bs.bucket_obj.aio_operate(c, &op);
+    if (ret < 0) {
+      derr << "ERROR: failed to store entries in target bucket shard (bs=" << bs.bucket << "/" << bs.shard_id << ") error=" << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+    entries.clear();
+    stats.clear();
+    return 0;
+  }
+
+  // reap every outstanding completion; returns the last error seen
+  // (continues reaping even after an error)
+  int wait_all_aio() {
+    int ret = 0;
+    while (!aio_completions.empty()) {
+      int r = wait_next_completion();
+      if (r < 0) {
+        ret = r;
+      }
+    }
+    return ret;
+  }
+}; // class BucketReshardShard
+
+
+// Owns one BucketReshardShard writer per shard of the target index
+// layout and fans incoming entries out to them by shard index. All
+// writers share a single completion deque so in-flight aio is bounded
+// globally, not per shard.
+class BucketReshardManager {
+  rgw::sal::RadosStore *store;
+  deque<librados::AioCompletion *> completions;
+  vector<BucketReshardShard> target_shards;
+
+public:
+  BucketReshardManager(const DoutPrefixProvider *dpp,
+                      rgw::sal::RadosStore *_store,
+                      const RGWBucketInfo& bucket_info,
+                       const rgw::bucket_index_layout_generation& target)
+    : store(_store)
+  {
+    const int num_shards = target.layout.normal.num_shards;
+    target_shards.reserve(num_shards);
+    for (int i = 0; i < num_shards; ++i) {
+      target_shards.emplace_back(dpp, store, bucket_info, target, i, completions);
+    }
+  }
+
+  // safety net: if finish() was not reached, still drain each shard's
+  // outstanding aio before the shards are destroyed (errors only logged)
+  ~BucketReshardManager() {
+    for (auto& shard : target_shards) {
+      int ret = shard.wait_all_aio();
+      if (ret < 0) {
+        ldout(store->ctx(), 20) << __func__ <<
+         ": shard->wait_all_aio() returned ret=" << ret << dendl;
+      }
+    }
+  }
+
+  // route one entry to the writer for shard_index; see
+  // BucketReshardShard::add_entry for 'account'/'category' semantics
+  int add_entry(int shard_index,
+                rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
+                const rgw_bucket_category_stats& entry_stats) {
+    int ret = target_shards[shard_index].add_entry(entry, account, category,
+                                                  entry_stats);
+    if (ret < 0) {
+      derr << "ERROR: target_shards.add_entry(" << entry.idx <<
+       ") returned error: " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    return 0;
+  }
+
+  // flush every shard's pending batch, then wait for all aio to
+  // complete; returns the last error seen (keeps going on error)
+  int finish() {
+    int ret = 0;
+    for (auto& shard : target_shards) {
+      int r = shard.flush();
+      if (r < 0) {
+        derr << "ERROR: target_shards[" << shard.get_shard_id() << "].flush() returned error: " << cpp_strerror(-r) << dendl;
+        ret = r;
+      }
+    }
+    for (auto& shard : target_shards) {
+      int r = shard.wait_all_aio();
+      if (r < 0) {
+        derr << "ERROR: target_shards[" << shard.get_shard_id() << "].wait_all_aio() returned error: " << cpp_strerror(-r) << dendl;
+        ret = r;
+      }
+    }
+    target_shards.clear();
+    return ret;
+  }
+}; // class BucketReshardManager
+
+// Construct a resharder for one bucket. A per-bucket reshard lock is
+// created (not yet taken) from bucket_info; _outer_reshard_lock, if
+// non-null, is a lock already held by the caller. NOTE(review): the
+// 'true' argument to reshard_lock presumably selects an ephemeral
+// lock -- confirm against the RGWBucketReshardLock bucket overload.
+RGWBucketReshard::RGWBucketReshard(rgw::sal::RadosStore* _store,
+                                  const RGWBucketInfo& _bucket_info,
+                                  const std::map<std::string, bufferlist>& _bucket_attrs,
+                                  RGWBucketReshardLock* _outer_reshard_lock) :
+  store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs),
+  reshard_lock(store, bucket_info, true),
+  outer_reshard_lock(_outer_reshard_lock)
+{ }
+
+// sets reshard status of bucket index shards for the current index layout;
+// used to block (IN_PROGRESS) or unblock (NOT_RESHARDING) writes to the
+// current index while a reshard is running. Returns 0 or a negative error.
+static int set_resharding_status(const DoutPrefixProvider *dpp,
+                                rgw::sal::RadosStore* store,
+                                const RGWBucketInfo& bucket_info,
+                                 cls_rgw_reshard_status status)
+{
+  cls_rgw_bucket_instance_entry instance_entry;
+  instance_entry.set_status(status);
+
+  int ret = store->getRados()->bucket_set_reshard(dpp, bucket_info, instance_entry);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: "
+                 << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+// remove a leftover target bucket instance from the old-style reshard
+// scheme (where the reshard target was a separate bucket instance):
+// delete its index shard objects (best effort) and then its bucket
+// instance metadata. Returns 0 or a negative error.
+static int remove_old_reshard_instance(rgw::sal::RadosStore* store,
+                                       const rgw_bucket& bucket,
+                                       const DoutPrefixProvider* dpp)
+{
+  RGWBucketInfo info;
+  int r = store->getRados()->get_bucket_instance_info(bucket, info, nullptr,
+                                                      nullptr, null_yield, dpp);
+  if (r < 0) {
+    return r;
+  }
+
+  // delete its shard objects (ignore errors)
+  store->svc()->bi->clean_index(dpp, info, info.layout.current_index);
+  // delete the bucket instance metadata
+  return store->ctl()->bucket->remove_bucket_instance_info(bucket, info, null_yield, dpp);
+}
+
+// initialize the new bucket index shard objects for the given target
+// layout generation; if bucket sync is disabled for this bucket, also
+// stop the bilog on every new shard. On log_stop failure the freshly
+// created shards are cleaned up again. Returns 0 or a negative error.
+static int init_target_index(rgw::sal::RadosStore* store,
+                             RGWBucketInfo& bucket_info,
+                             const rgw::bucket_index_layout_generation& index,
+                             const DoutPrefixProvider* dpp)
+{
+  int ret = store->svc()->bi->init_index(dpp, bucket_info, index);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to initialize "
+       "target index shard objects: " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  if (!bucket_info.datasync_flag_enabled()) {
+    // if bucket sync is disabled, disable it on each of the new shards too
+    auto log = rgw::log_layout_from_index(0, index);
+    ret = store->svc()->bilog_rados->log_stop(dpp, bucket_info, log, -1);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to disable "
+          "bucket sync on the target index shard objects: "
+          << cpp_strerror(ret) << dendl;
+      // roll back: remove the shard objects we just created
+      store->svc()->bi->clean_index(dpp, bucket_info, index);
+      return ret;
+    }
+  }
+
+  return ret;
+}
+
+// initialize a target index layout, create its bucket index shard objects, and
+// write the target layout to the bucket instance metadata.
+// Racing metadata writes (-ECANCELED) are retried up to 10 times after
+// re-reading the bucket info; on any other failure the new shard
+// objects are removed and the previous in-memory layout restored.
+static int init_target_layout(rgw::sal::RadosStore* store,
+                              RGWBucketInfo& bucket_info,
+                             std::map<std::string, bufferlist>& bucket_attrs,
+                              ReshardFaultInjector& fault,
+                              uint32_t new_num_shards,
+                              const DoutPrefixProvider* dpp)
+{
+  auto prev = bucket_info.layout; // make a copy for cleanup
+  const auto current = prev.current_index;
+
+  // initialize a new normal target index layout generation
+  rgw::bucket_index_layout_generation target;
+  target.layout.type = rgw::BucketIndexType::Normal;
+  target.layout.normal.num_shards = new_num_shards;
+  target.gen = current.gen + 1;
+
+  if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) {
+    // backward-compatible cleanup of old reshards, where the target was in a
+    // different bucket instance
+    if (!bucket_info.new_bucket_instance_id.empty()) {
+      rgw_bucket new_bucket = bucket_info.bucket;
+      new_bucket.bucket_id = bucket_info.new_bucket_instance_id;
+      ldout(store->ctx(), 10) << __func__ << " removing target bucket instance "
+          "from a previous reshard attempt" << dendl;
+      // ignore errors
+      remove_old_reshard_instance(store, new_bucket, dpp);
+    }
+    bucket_info.reshard_status = cls_rgw_reshard_status::NOT_RESHARDING;
+  }
+
+  if (bucket_info.layout.target_index) {
+    // a previous reshard failed or stalled, and its reshard lock dropped
+    ldpp_dout(dpp, 10) << __func__ << " removing existing target index "
+        "objects from a previous reshard attempt" << dendl;
+    // delete its existing shard objects (ignore errors)
+    store->svc()->bi->clean_index(dpp, bucket_info, *bucket_info.layout.target_index);
+    // don't reuse this same generation in the new target layout, in case
+    // something is still trying to operate on its shard objects
+    target.gen = bucket_info.layout.target_index->gen + 1;
+  }
+
+  // create the index shard objects
+  int ret = init_target_index(store, bucket_info, target, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // retry in case of racing writes to the bucket instance metadata
+  static constexpr auto max_retries = 10;
+  int tries = 0;
+  do {
+    // update resharding state
+    bucket_info.layout.target_index = target;
+    bucket_info.layout.resharding = rgw::BucketReshardState::InProgress;
+
+    if (ret = fault.check("set_target_layout");
+        ret == 0) { // no fault injected, write the bucket instance metadata
+      ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
+                                                        real_time(), &bucket_attrs, dpp);
+    } else if (ret == -ECANCELED) {
+      fault.clear(); // clear the fault so a retry can succeed
+    }
+
+    if (ret == -ECANCELED) {
+      // racing write detected, read the latest bucket info and try again
+      int ret2 = store->getRados()->get_bucket_instance_info(
+          bucket_info.bucket, bucket_info,
+          nullptr, &bucket_attrs, null_yield, dpp);
+      if (ret2 < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
+            "bucket info: " << cpp_strerror(ret2) << dendl;
+        ret = ret2;
+        break;
+      }
+
+      // check that we're still in the reshard state we started in
+      if (bucket_info.layout.resharding != rgw::BucketReshardState::None ||
+          bucket_info.layout.current_index != current) {
+        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+            "another reshard" << dendl;
+        break;
+      }
+
+      prev = bucket_info.layout; // update the copy
+    }
+    ++tries;
+  } while (ret == -ECANCELED && tries < max_retries);
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to write "
+        "target index layout to bucket info: " << cpp_strerror(ret) << dendl;
+
+    bucket_info.layout = std::move(prev);  // restore in-memory layout
+
+    // delete the target shard objects (ignore errors)
+    store->svc()->bi->clean_index(dpp, bucket_info, target);
+    return ret;
+  }
+  return 0;
+} // init_target_layout
+
+// delete the bucket index shards associated with the target layout and remove
+// it from the bucket instance metadata.
+// A failure to remove the shard objects is non-fatal; racing metadata
+// writes (-ECANCELED) are retried up to 10 times, and a concurrent
+// cancel or reshard aborts with -ECANCELED.
+static int revert_target_layout(rgw::sal::RadosStore* store,
+                                RGWBucketInfo& bucket_info,
+                               std::map<std::string, bufferlist>& bucket_attrs,
+                                ReshardFaultInjector& fault,
+                                const DoutPrefixProvider* dpp)
+{
+  auto prev = bucket_info.layout; // make a copy for cleanup
+
+  // remove target index shard objects
+  int ret = store->svc()->bi->clean_index(dpp, bucket_info, *prev.target_index);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to remove "
+        "target index with: " << cpp_strerror(ret) << dendl;
+    ret = 0; // non-fatal error
+  }
+
+  // retry in case of racing writes to the bucket instance metadata
+  static constexpr auto max_retries = 10;
+  int tries = 0;
+  do {
+    // clear target_index and resharding state
+    bucket_info.layout.target_index = std::nullopt;
+    bucket_info.layout.resharding = rgw::BucketReshardState::None;
+
+    if (ret = fault.check("revert_target_layout");
+        ret == 0) { // no fault injected, revert the bucket instance metadata
+      ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
+                                                        real_time(),
+                                                        &bucket_attrs, dpp);
+    } else if (ret == -ECANCELED) {
+      fault.clear(); // clear the fault so a retry can succeed
+    }
+
+    if (ret == -ECANCELED) {
+      // racing write detected, read the latest bucket info and try again
+      int ret2 = store->getRados()->get_bucket_instance_info(
+          bucket_info.bucket, bucket_info,
+          nullptr, &bucket_attrs, null_yield, dpp);
+      if (ret2 < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
+            "bucket info: " << cpp_strerror(ret2) << dendl;
+        ret = ret2;
+        break;
+      }
+
+      // check that we're still in the reshard state we started in
+      if (bucket_info.layout.resharding == rgw::BucketReshardState::None) {
+        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+            "reshard cancel" << dendl;
+        return -ECANCELED;
+      }
+      if (bucket_info.layout.current_index != prev.current_index ||
+          bucket_info.layout.target_index != prev.target_index) {
+        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+            "another reshard" << dendl;
+        return -ECANCELED;
+      }
+
+      prev = bucket_info.layout; // update the copy
+    }
+    ++tries;
+  } while (ret == -ECANCELED && tries < max_retries);
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to clear "
+        "target index layout in bucket info: " << cpp_strerror(ret) << dendl;
+
+    bucket_info.layout = std::move(prev);  // restore in-memory layout
+    return ret;
+  }
+  return 0;
+} // revert_target_layout
+
+// start a reshard: create the target index layout (shard objects +
+// bucket instance metadata), then block writes to the current index
+// shards. If blocking writes fails, the target layout is reverted
+// (best effort). Returns 0 or a negative error.
+static int init_reshard(rgw::sal::RadosStore* store,
+                        RGWBucketInfo& bucket_info,
+                       std::map<std::string, bufferlist>& bucket_attrs,
+                        ReshardFaultInjector& fault,
+                        uint32_t new_num_shards,
+                        const DoutPrefixProvider *dpp)
+{
+  int ret = init_target_layout(store, bucket_info, bucket_attrs, fault, new_num_shards, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (ret = fault.check("block_writes");
+      ret == 0) { // no fault injected, block writes to the current index shards
+    ret = set_resharding_status(dpp, store, bucket_info,
+                                cls_rgw_reshard_status::IN_PROGRESS);
+  }
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to pause "
+        "writes to the current index: " << cpp_strerror(ret) << dendl;
+    // clean up the target layout (ignore errors)
+    revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+    return ret;
+  }
+  return 0;
+} // init_reshard
+
+// abort an in-progress reshard: re-enable writes on the current index
+// shards (failure here is non-fatal) and, if a target layout exists,
+// revert it. Returns 0 or a negative error from the revert.
+static int cancel_reshard(rgw::sal::RadosStore* store,
+                          RGWBucketInfo& bucket_info,
+                         std::map<std::string, bufferlist>& bucket_attrs,
+                          ReshardFaultInjector& fault,
+                          const DoutPrefixProvider *dpp)
+{
+  // unblock writes to the current index shard objects
+  int ret = set_resharding_status(dpp, store, bucket_info,
+                                  cls_rgw_reshard_status::NOT_RESHARDING);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
+        "writes to current index objects: " << cpp_strerror(ret) << dendl;
+    ret = 0; // non-fatal error
+  }
+
+  if (bucket_info.layout.target_index) {
+    return revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+  }
+  // there is nothing to revert
+  return 0;
+} // cancel_reshard
+
+// promote the target index layout to current in the in-memory
+// bucket_info (appending a new in-index log generation) and persist it
+// to the bucket instance metadata. Mutates bucket_info.layout even on
+// a failed write; the caller (commit_reshard) restores it on failure.
+static int commit_target_layout(rgw::sal::RadosStore* store,
+                                RGWBucketInfo& bucket_info,
+                                std::map<std::string, bufferlist>& bucket_attrs,
+                                ReshardFaultInjector& fault,
+                                const DoutPrefixProvider *dpp)
+{
+  auto& layout = bucket_info.layout;
+  const auto next_log_gen = layout.logs.empty() ? 1 :
+      layout.logs.back().gen + 1;
+
+  if (!store->svc()->zone->need_to_log_data()) {
+    // if we're not syncing data, we can drop any existing logs
+    layout.logs.clear();
+  }
+
+  // use the new index layout as current
+  ceph_assert(layout.target_index);
+  layout.current_index = std::move(*layout.target_index);
+  layout.target_index = std::nullopt;
+  layout.resharding = rgw::BucketReshardState::None;
+  // add the in-index log layout
+  layout.logs.push_back(log_layout_from_index(next_log_gen, layout.current_index));
+
+  int ret = fault.check("commit_target_layout");
+  if (ret == 0) { // no fault injected, write the bucket instance metadata
+    ret = store->getRados()->put_bucket_instance_info(
+        bucket_info, false, real_time(), &bucket_attrs, dpp);
+  } else if (ret == -ECANCELED) {
+    fault.clear(); // clear the fault so a retry can succeed
+  }
+  return ret;
+} // commit_target_layout
+
+// finalize a reshard: commit the target layout as current (retrying
+// racing metadata writes up to 10 times), then notify data sync about
+// the now-retired index generation and, if no bilog still references
+// the old index, delete its shard objects. On commit failure the
+// previous layout is restored and writes to the current index are
+// unblocked again.
+static int commit_reshard(rgw::sal::RadosStore* store,
+                          RGWBucketInfo& bucket_info,
+                         std::map<std::string, bufferlist>& bucket_attrs,
+                          ReshardFaultInjector& fault,
+                          const DoutPrefixProvider *dpp)
+{
+  auto prev = bucket_info.layout; // make a copy for cleanup
+
+  // retry in case of racing writes to the bucket instance metadata
+  static constexpr auto max_retries = 10;
+  int tries = 0;
+  int ret = 0;
+  do {
+    ret = commit_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+    if (ret == -ECANCELED) {
+      // racing write detected, read the latest bucket info and try again
+      int ret2 = store->getRados()->get_bucket_instance_info(
+          bucket_info.bucket, bucket_info,
+          nullptr, &bucket_attrs, null_yield, dpp);
+      if (ret2 < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
+            "bucket info: " << cpp_strerror(ret2) << dendl;
+        ret = ret2;
+        break;
+      }
+
+      // check that we're still in the reshard state we started in
+      if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
+        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+            "reshard cancel" << dendl;
+        return -ECANCELED; // whatever canceled us already did the cleanup
+      }
+      if (bucket_info.layout.current_index != prev.current_index ||
+          bucket_info.layout.target_index != prev.target_index) {
+        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+            "another reshard" << dendl;
+        return -ECANCELED; // whatever canceled us already did the cleanup
+      }
+
+      prev = bucket_info.layout; // update the copy
+    }
+    ++tries;
+  } while (ret == -ECANCELED && tries < max_retries);
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to commit "
+        "target index layout: " << cpp_strerror(ret) << dendl;
+
+    bucket_info.layout = std::move(prev); // restore in-memory layout
+
+    // unblock writes to the current index shard objects
+    int ret2 = set_resharding_status(dpp, store, bucket_info,
+                                     cls_rgw_reshard_status::NOT_RESHARDING);
+    if (ret2 < 0) {
+      ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
+          "writes to current index objects: " << cpp_strerror(ret2) << dendl;
+      // non-fatal error
+    }
+    return ret;
+  }
+
+  if (store->svc()->zone->need_to_log_data() && !prev.logs.empty() &&
+      prev.current_index.layout.type == rgw::BucketIndexType::Normal) {
+    // write a datalog entry for each shard of the previous index. triggering
+    // sync on the old shards will force them to detect the end-of-log for that
+    // generation, and eventually transition to the next
+    // TODO: use a log layout to support types other than BucketLogType::InIndex
+    for (uint32_t shard_id = 0; shard_id < prev.current_index.layout.normal.num_shards; ++shard_id) {
+      ret = store->svc()->datalog_rados->add_entry(dpp, bucket_info, prev.logs.back(), shard_id);
+      if (ret < 0) {
+        ldpp_dout(dpp, 1) << "WARNING: failed writing data log (bucket_info.bucket="
+        << bucket_info.bucket << ", shard_id=" << shard_id << "of generation="
+        << prev.logs.back().gen << ")" << dendl;
+      } // datalog error is not fatal
+    }
+  }
+
+  // check whether the old index objects are still needed for bilogs
+  const auto& logs = bucket_info.layout.logs;
+  auto log = std::find_if(logs.begin(), logs.end(),
+      [&prev] (const rgw::bucket_log_layout_generation& log) {
+        return log.layout.type == rgw::BucketLogType::InIndex
+            && log.layout.in_index.gen == prev.current_index.gen;
+      });
+  if (log == logs.end()) {
+    // delete the index objects (ignore errors)
+    store->svc()->bi->clean_index(dpp, bucket_info, prev.current_index);
+  }
+  return 0;
+} // commit_reshard
+
+// Roll back an in-progress reshard on the bucket, restoring writes to the
+// current index. Equivalent to cancel_reshard() with no fault injection.
+int RGWBucketReshard::clear_resharding(rgw::sal::RadosStore* store,
+                                       RGWBucketInfo& bucket_info,
+                                      std::map<std::string, bufferlist>& bucket_attrs,
+                                       const DoutPrefixProvider* dpp)
+{
+  ReshardFaultInjector no_fault; // injects nothing; cancel runs to completion
+  return cancel_reshard(store, bucket_info, bucket_attrs, no_fault, dpp);
+}
+
+// Cancel a reshard that is currently marked InProgress on this bucket.
+// Takes the per-bucket reshard lock for the duration; returns -EINVAL if
+// no reshard is in progress.
+int RGWBucketReshard::cancel(const DoutPrefixProvider* dpp)
+{
+  int ret = reshard_lock.lock(dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
+    ldpp_dout(dpp, -1) << "ERROR: bucket is not resharding" << dendl;
+    ret = -EINVAL;
+  } else {
+    // restore the in-memory/stored layout and unblock writes
+    ret = clear_resharding(store, bucket_info, bucket_attrs, dpp);
+  }
+
+  reshard_lock.unlock();
+  return ret;
+}
+
+// Construct a reshard lock over the given rados object. The lock duration
+// (and thus the renewal cadence) is taken from configuration; a random
+// cookie distinguishes this process's lock instance from other lockers.
+RGWBucketReshardLock::RGWBucketReshardLock(rgw::sal::RadosStore* _store,
+                                          const std::string& reshard_lock_oid,
+                                          bool _ephemeral) :
+  store(_store),
+  lock_oid(reshard_lock_oid),
+  ephemeral(_ephemeral),
+  internal_lock(reshard_lock_name)
+{
+  const int lock_dur_secs = store->ctx()->_conf.get_val<uint64_t>(
+    "rgw_reshard_bucket_lock_duration");
+  duration = std::chrono::seconds(lock_dur_secs);
+
+#define COOKIE_LEN 16
+  char cookie_buf[COOKIE_LEN + 1];
+  // fill COOKIE_LEN random chars, then NUL-terminate explicitly
+  gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
+  cookie_buf[COOKIE_LEN] = '\0';
+
+  internal_lock.set_cookie(cookie_buf);
+  internal_lock.set_duration(duration);
+}
+
+// Acquire the exclusive reshard lock (ephemeral variant deletes the object
+// on unlock). -EBUSY (another RGW holds it) is logged at INFO and passed
+// through so callers can skip; other errors are logged as errors.
+int RGWBucketReshardLock::lock(const DoutPrefixProvider *dpp) {
+  internal_lock.set_must_renew(false); // this is an initial acquire, not a renewal
+
+  int ret;
+  if (ephemeral) {
+    ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
+                                                lock_oid);
+  } else {
+    ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
+  }
+
+  if (ret == -EBUSY) {
+    ldout(store->ctx(), 0) << "INFO: RGWReshardLock::" << __func__ <<
+      " found lock on " << lock_oid <<
+      " to be held by another RGW process; skipping for now" << dendl;
+    return ret;
+  } else if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: RGWReshardLock::" << __func__ <<
+      " failed to acquire lock on " << lock_oid << ": " <<
+      cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  // start the renewal clock from the moment we acquired the lock
+  reset_time(Clock::now());
+
+  return 0;
+}
+
+// Drop the reshard lock; failure is only logged since the lock will
+// expire on its own after 'duration' anyway.
+void RGWBucketReshardLock::unlock() {
+  int ret = internal_lock.unlock(&store->getRados()->reshard_pool_ctx, lock_oid);
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ <<
+      " failed to drop lock on " << lock_oid << " ret=" << ret << dendl;
+  }
+}
+
+// Renew the held lock before it expires; resets the renewal clock to 'now'
+// on success. Fails if the lock expired or was taken by another process
+// (must_renew makes the cls op require that we still hold it).
+int RGWBucketReshardLock::renew(const Clock::time_point& now) {
+  internal_lock.set_must_renew(true);
+  int ret;
+  if (ephemeral) {
+    ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
+                                                lock_oid);
+  } else {
+    ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
+  }
+  if (ret < 0) { /* expired or already locked by another processor */
+    std::stringstream error_s;
+    if (-ENOENT == ret) {
+      error_s << "ENOENT (lock expired or never initially locked)";
+    } else {
+      error_s << ret << " (" << cpp_strerror(-ret) << ")";
+    }
+    ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
+      lock_oid << " with error " << error_s.str() << dendl;
+    return ret;
+  }
+  internal_lock.set_must_renew(false); // back to normal-acquire mode
+
+  reset_time(now);
+  ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " <<
+    lock_oid << dendl;
+
+  return 0;
+}
+
+
+// Copy every bucket-index entry from the 'current' index generation into
+// the 'target' generation, redistributing entries across the new shard
+// count. Listing is done per source shard in batches of max_entries; the
+// reshard lock (and optional outer lock) is renewed while copying.
+// Verbose JSON output per entry goes to 'out'/'formatter' when requested.
+int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& current,
+                                 const rgw::bucket_index_layout_generation& target,
+                                 int max_entries,
+                                bool verbose,
+                                ostream *out,
+                                Formatter *formatter,
+                                 const DoutPrefixProvider *dpp)
+{
+  if (out) {
+    (*out) << "tenant: " << bucket_info.bucket.tenant << std::endl;
+    (*out) << "bucket name: " << bucket_info.bucket.name << std::endl;
+  }
+
+  /* update bucket info -- in progress*/
+  list<rgw_cls_bi_entry> entries;
+
+  if (max_entries < 0) {
+    ldpp_dout(dpp, 0) << __func__ <<
+      ": can't reshard, negative max_entries" << dendl;
+    return -EINVAL;
+  }
+
+  // buffers/flushes entries destined for each target shard
+  BucketReshardManager target_shards_mgr(dpp, store, bucket_info, target);
+
+  bool verbose_json_out = verbose && (formatter != nullptr) && (out != nullptr);
+
+  if (verbose_json_out) {
+    formatter->open_array_section("entries");
+  }
+
+  uint64_t total_entries = 0;
+
+  if (!verbose_json_out && out) {
+    (*out) << "total entries:";
+  }
+
+  const int num_source_shards = current.layout.normal.num_shards;
+  string marker;
+  for (int i = 0; i < num_source_shards; ++i) {
+    bool is_truncated = true;
+    marker.clear();
+    const std::string null_object_filter; // empty string since we're not filtering by object
+    while (is_truncated) {
+      entries.clear();
+      int ret = store->getRados()->bi_list(dpp, bucket_info, i, null_object_filter, marker, max_entries, &entries, &is_truncated);
+      if (ret < 0 && ret != -ENOENT) {
+        derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl;
+        return ret;
+      }
+
+      for (auto iter = entries.begin(); iter != entries.end(); ++iter) {
+       rgw_cls_bi_entry& entry = *iter;
+       if (verbose_json_out) {
+         formatter->open_object_section("entry");
+
+         encode_json("shard_id", i, formatter);
+         encode_json("num_entry", total_entries, formatter);
+         encode_json("entry", entry, formatter);
+       }
+       total_entries++;
+
+       // advance the listing marker past this entry
+       marker = entry.idx;
+
+       int target_shard_id;
+       cls_rgw_obj_key cls_key;
+       RGWObjCategory category;
+       rgw_bucket_category_stats stats;
+       bool account = entry.get_info(&cls_key, &category, &stats);
+       rgw_obj_key key(cls_key);
+       if (entry.type == BIIndexType::OLH && key.empty()) {
+         // bogus entry created by https://tracker.ceph.com/issues/46456
+         // to fix, skip so it doesn't get include in the new bucket instance
+         total_entries--;
+         ldpp_dout(dpp, 10) << "Dropping entry with empty name, idx=" << marker << dendl;
+         continue;
+       }
+       rgw_obj obj(bucket_info.bucket, key);
+       RGWMPObj mp;
+       if (key.ns == RGW_OBJ_NS_MULTIPART && mp.from_meta(key.name)) {
+         // place the multipart .meta object on the same shard as its head object
+         obj.index_hash_source = mp.get_key();
+       }
+       ret = store->getRados()->get_target_shard_id(bucket_info.layout.target_index->layout.normal,
+                                                    obj.get_hash_object(), &target_shard_id);
+       if (ret < 0) {
+         ldpp_dout(dpp, -1) << "ERROR: get_target_shard_id() returned ret=" << ret << dendl;
+         return ret;
+       }
+
+       int shard_index = (target_shard_id > 0 ? target_shard_id : 0);
+
+       ret = target_shards_mgr.add_entry(shard_index, entry, account,
+                                         category, stats);
+       if (ret < 0) {
+         return ret;
+       }
+
+       Clock::time_point now = Clock::now();
+       if (reshard_lock.should_renew(now)) {
+         // assume outer locks have timespans at least the size of ours, so
+         // can call inside conditional
+         if (outer_reshard_lock) {
+           ret = outer_reshard_lock->renew(now);
+           if (ret < 0) {
+             return ret;
+           }
+         }
+         ret = reshard_lock.renew(now);
+         if (ret < 0) {
+           ldpp_dout(dpp, -1) << "Error renewing bucket lock: " << ret << dendl;
+           return ret;
+         }
+       }
+       if (verbose_json_out) {
+         formatter->close_section();
+         formatter->flush(*out);
+       } else if (out && !(total_entries % 1000)) {
+         (*out) << " " << total_entries;
+       }
+      } // entries loop
+    }
+  }
+
+  if (verbose_json_out) {
+    formatter->close_section();
+    formatter->flush(*out);
+  } else if (out) {
+    (*out) << " " << total_entries << std::endl;
+  }
+
+  // flush any still-buffered entries to their target shards
+  int ret = target_shards_mgr.finish();
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to reshard" << dendl;
+    return -EIO;
+  }
+  return 0;
+} // RGWBucketReshard::do_reshard
+
+// Report per-shard reshard status for the bucket (delegates to the
+// bucket-index service).
+int RGWBucketReshard::get_status(const DoutPrefixProvider *dpp, list<cls_rgw_bucket_instance_entry> *status)
+{
+  return store->svc()->bi_rados->get_reshard_status(dpp, bucket_info, status);
+}
+
+// Drive a full reshard: lock the bucket, init the target index layout,
+// copy entries (do_reshard), then commit. On any failure the reshard is
+// cancelled and the target index rolled back. 'fault' allows tests to
+// inject failures at named checkpoints.
+int RGWBucketReshard::execute(int num_shards,
+                              ReshardFaultInjector& fault,
+                              int max_op_entries,
+                              const DoutPrefixProvider *dpp,
+                              bool verbose, ostream *out,
+                              Formatter *formatter,
+                              RGWReshard* reshard_log)
+{
+  // take a reshard lock on the bucket
+  int ret = reshard_lock.lock(dpp);
+  if (ret < 0) {
+    return ret;
+  }
+  // unlock when scope exits
+  auto unlock = make_scope_guard([this] { reshard_lock.unlock(); });
+
+  if (reshard_log) {
+    // refresh this bucket's entry in the reshard queue
+    ret = reshard_log->update(dpp, bucket_info);
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
+  // prepare the target index and add its layout the bucket info
+  ret = init_reshard(store, bucket_info, bucket_attrs, fault, num_shards, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (ret = fault.check("do_reshard");
+      ret == 0) { // no fault injected, do the reshard
+    ret = do_reshard(bucket_info.layout.current_index,
+                     *bucket_info.layout.target_index,
+                     max_op_entries, verbose, out, formatter, dpp);
+  }
+
+  if (ret < 0) {
+    // roll back the partially-built target index; original error wins
+    cancel_reshard(store, bucket_info, bucket_attrs, fault, dpp);
+
+    ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
+        << bucket_info.bucket.name << "\" canceled due to errors" << dendl;
+    return ret;
+  }
+
+  ret = commit_reshard(store, bucket_info, bucket_attrs, fault, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
+      << bucket_info.bucket.name << "\" completed successfully" << dendl;
+  return 0;
+} // execute
+
+// A bucket is eligible for resharding unless multisite data logging is
+// enabled and its bilog history has already reached the allowed maximum
+// (peer zones must trim old log generations first).
+bool RGWBucketReshard::can_reshard(const RGWBucketInfo& bucket,
+                                   const RGWSI_Zone* zone_svc)
+{
+  if (zone_svc->need_to_log_data()) {
+    return bucket.layout.logs.size() < max_bilog_history;
+  }
+  return true;
+}
+
+
+// Construct the reshard queue manager; the number of logshards (queue
+// objects) comes from configuration.
+RGWReshard::RGWReshard(rgw::sal::RadosStore* _store, bool _verbose, ostream *_out,
+                       Formatter *_formatter) :
+  store(_store), instance_lock(bucket_instance_lock_name),
+  verbose(_verbose), out(_out), formatter(_formatter)
+{
+  num_logshards = store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_num_logs");
+}
+
+// Build the "<tenant>:<bucket>" key that is hashed to select a logshard.
+string RGWReshard::get_logshard_key(const string& tenant,
+                                   const string& bucket_name)
+{
+  string key = tenant;
+  key.append(":");
+  key.append(bucket_name);
+  return key;
+}
+
+// prime modulus spreads bucket keys evenly before reducing to num_logshards
+#define MAX_RESHARD_LOGSHARDS_PRIME 7877
+
+// Map a (tenant, bucket) pair to the oid of the reshard-queue logshard
+// that holds its entry. Deterministic, so add/get/remove agree.
+void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid)
+{
+  string key = get_logshard_key(tenant, bucket_name);
+
+  uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size());
+  // mix low bits into the high byte before the double modulus
+  uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
+  sid = sid2 % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards;
+
+  get_logshard_oid(int(sid), oid);
+}
+
+// Queue a bucket for resharding by writing its entry into the appropriate
+// logshard object. A no-op (success) when resharding is disabled for the
+// zone.
+int RGWReshard::add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
+{
+  if (!store->svc()->zone->can_reshard()) {
+    ldpp_dout(dpp, 20) << __func__ << " Resharding is disabled"  << dendl;
+    return 0;
+  }
+
+  string logshard_oid;
+
+  get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
+
+  librados::ObjectWriteOperation op;
+  cls_rgw_reshard_add(op, entry);
+
+  int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to add entry to reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+// Refresh the queue entry for a bucket: read the existing entry (keyed by
+// tenant/name/bucket_id), then rewrite it via add().
+int RGWReshard::update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info)
+{
+  cls_rgw_reshard_entry entry;
+  entry.bucket_name = bucket_info.bucket.name;
+  entry.bucket_id = bucket_info.bucket.bucket_id;
+  entry.tenant = bucket_info.owner.tenant;
+
+  int ret = get(dpp, entry);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = add(dpp, entry);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << __func__ << ":Error in updating entry bucket " << entry.bucket_name << ": " <<
+      cpp_strerror(-ret) << dendl;
+  }
+
+  return ret;
+}
+
+
+// List up to 'max' reshard-queue entries from the given logshard, starting
+// after 'marker'. ENOENT (shard object not yet created) is treated as an
+// empty, non-truncated listing.
+int RGWReshard::list(const DoutPrefixProvider *dpp, int logshard_num, string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated)
+{
+  string logshard_oid;
+
+  get_logshard_oid(logshard_num, &logshard_oid);
+
+  int ret = cls_rgw_reshard_list(store->getRados()->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated);
+
+  if (ret == -ENOENT) {
+    // these shard objects aren't created until we actually write something to
+    // them, so treat ENOENT as a successful empty listing
+    *is_truncated = false;
+    ret = 0;
+  } else if (ret == -EACCES) {
+    ldpp_dout(dpp, -1) << "ERROR: access denied to pool " << store->svc()->zone->get_zone_params().reshard_pool
+                      << ". Fix the pool access permissions of your client" << dendl;
+  } else if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to list reshard log entries, oid="
+        << logshard_oid << " marker=" << marker << " " << cpp_strerror(ret) << dendl;
+  }
+
+  return ret;
+}
+
+// Look up a bucket's reshard-queue entry (entry.tenant/bucket_name select
+// the logshard; the full entry is filled in on success). ENOENT is
+// returned silently -- the bucket simply isn't queued.
+int RGWReshard::get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
+{
+  string logshard_oid;
+
+  get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
+
+  int ret = cls_rgw_reshard_get(store->getRados()->reshard_pool_ctx, logshard_oid, entry);
+  if (ret < 0) {
+    if (ret != -ENOENT) {
+      ldpp_dout(dpp, -1) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant <<
+       " bucket=" << entry.bucket_name << dendl;
+    }
+    return ret;
+  }
+
+  return 0;
+}
+
+// Remove a bucket's entry from its reshard-queue logshard.
+int RGWReshard::remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry)
+{
+  string logshard_oid;
+
+  get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
+
+  librados::ObjectWriteOperation op;
+  cls_rgw_reshard_remove(op, entry);
+
+  int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to remove entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
+    return ret;
+  }
+
+  return ret;
+}
+
+// Clear the resharding flag on a bucket-instance object via the cls call.
+// NOTE(review): the 'entry' parameter is currently unused here -- confirm
+// whether it is kept for interface symmetry with add/get/remove.
+int RGWReshard::clear_bucket_resharding(const DoutPrefixProvider *dpp, const string& bucket_instance_oid, cls_rgw_reshard_entry& entry)
+{
+  int ret = cls_rgw_clear_bucket_resharding(store->getRados()->reshard_pool_ctx, bucket_instance_oid);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to clear bucket resharding, bucket_instance_oid=" << bucket_instance_oid << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Block the caller for 'duration' (or until stop()). With an optional
+// yield context the wait is asynchronous via a steady-clock timer; the
+// waiter is registered so stop() can cancel it. Returns -ECANCELED when
+// shutting down.
+int RGWReshardWait::wait(optional_yield y)
+{
+  std::unique_lock lock(mutex);
+
+  if (going_down) {
+    return -ECANCELED;
+  }
+
+  if (y) {
+    auto& context = y.get_io_context();
+    auto& yield = y.get_yield_context();
+
+    // register the waiter while holding the mutex so stop() can see it
+    Waiter waiter(context);
+    waiters.push_back(waiter);
+    lock.unlock();
+
+    waiter.timer.expires_after(duration);
+
+    // ec is ECANCELED if stop() cancelled the timer, 0 on normal expiry
+    boost::system::error_code ec;
+    waiter.timer.async_wait(yield[ec]);
+
+    lock.lock();
+    waiters.erase(waiters.iterator_to(waiter));
+    return -ec.value();
+  }
+
+  // synchronous path: plain condition-variable wait with timeout
+  cond.wait_for(lock, duration);
+
+  if (going_down) {
+    return -ECANCELED;
+  }
+
+  return 0;
+}
+
+// Begin shutdown: wake synchronous waiters via the condvar and cancel the
+// timers of async waiters so their waits complete with ECANCELED.
+void RGWReshardWait::stop()
+{
+  std::scoped_lock lock(mutex);
+  going_down = true;
+  cond.notify_all();
+  for (auto& waiter : waiters) {
+    // unblock any waiters with ECANCELED
+    waiter.timer.cancel();
+  }
+}
+
+// Process one reshard-queue entry: load the bucket, run the reshard, and
+// remove the entry on success. Stale entries (bucket gone, or bucket_id
+// changed because someone already resharded it) are cleaned up and
+// treated as success so the queue keeps draining.
+int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
+                              int max_entries, const DoutPrefixProvider *dpp)
+{
+  ldpp_dout(dpp, 20) << __func__ << " resharding " <<
+      entry.bucket_name  << dendl;
+
+  rgw_bucket bucket;
+  RGWBucketInfo bucket_info;
+  std::map<std::string, bufferlist> bucket_attrs;
+
+  int ret = store->getRados()->get_bucket_info(store->svc(),
+                                               entry.tenant,
+                                              entry.bucket_name,
+                                               bucket_info, nullptr,
+                                               null_yield, dpp,
+                                              &bucket_attrs);
+  if (ret < 0 || bucket_info.bucket.bucket_id != entry.bucket_id) {
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) <<  __func__ <<
+          ": Error in get_bucket_info for bucket " << entry.bucket_name <<
+          ": " << cpp_strerror(-ret) << dendl;
+      if (ret != -ENOENT) {
+        // any error other than ENOENT will abort
+        return ret;
+      }
+    } else {
+      ldpp_dout(dpp, 0) << __func__ <<
+          ": Bucket: " << entry.bucket_name <<
+          " already resharded by someone, skipping " << dendl;
+    }
+
+    // we've encountered a reshard queue entry for an apparently
+    // non-existent bucket; let's try to recover by cleaning up
+    ldpp_dout(dpp, 0) <<  __func__ <<
+        ": removing reshard queue entry for a resharded or non-existent bucket" <<
+        entry.bucket_name << dendl;
+
+    ret = remove(dpp, entry);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << __func__ <<
+          ": Error removing non-existent bucket " <<
+          entry.bucket_name << " from resharding queue: " <<
+          cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    // we cleaned up, move on to the next entry
+    return 0;
+  }
+
+  if (!RGWBucketReshard::can_reshard(bucket_info, store->svc()->zone)) {
+    // bilog history too long for multisite; drop the entry for now
+    ldpp_dout(dpp, 1) << "Bucket " << bucket_info.bucket << " is not "
+        "eligible for resharding until peer zones finish syncing one "
+        "or more of its old log generations" << dendl;
+    return remove(dpp, entry);
+  }
+
+  // no outer reshard lock; we only hold the logshard lock here
+  RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr);
+
+  ReshardFaultInjector f; // no fault injected
+  ret = br.execute(entry.new_num_shards, f, max_entries, dpp,
+                   false, nullptr, nullptr, this);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) <<  __func__ <<
+        ": Error during resharding bucket " << entry.bucket_name << ":" <<
+        cpp_strerror(-ret)<< dendl;
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20) << __func__ <<
+      " removing reshard queue entry for bucket " << entry.bucket_name <<
+      dendl;
+
+  ret = remove(dpp, entry);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << __func__ << ": Error removing bucket " <<
+        entry.bucket_name << " from resharding queue: " <<
+        cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+int RGWReshard::process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp)
+{
+  string marker;
+  bool truncated = true;
+
+  constexpr uint32_t max_entries = 1000;
+
+  string logshard_oid;
+  get_logshard_oid(logshard_num, &logshard_oid);
+
+  RGWBucketReshardLock logshard_lock(store, logshard_oid, false);
+
+  int ret = logshard_lock.lock(dpp);
+  if (ret < 0) { 
+    ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " <<
+      logshard_oid << ", ret = " << ret <<dendl;
+    return ret;
+  }
+  
+  do {
+    std::list<cls_rgw_reshard_entry> entries;
+    ret = list(dpp, logshard_num, marker, max_entries, entries, &truncated);
+    if (ret < 0) {
+      ldpp_dout(dpp, 10) << "cannot list all reshards in logshard oid=" <<
+       logshard_oid << dendl;
+      continue;
+    }
+
+    for(auto& entry: entries) { // logshard entries
+      process_entry(entry, max_entries, dpp);
+      if (ret < 0) {
+        return ret;
+      }
+
+      Clock::time_point now = Clock::now();
+      if (logshard_lock.should_renew(now)) {
+        ret = logshard_lock.renew(now);
+        if (ret < 0) {
+          return ret;
+        }
+      }
+
+      entry.get_key(&marker);
+    } // entry for loop
+  } while (truncated);
+
+  logshard_lock.unlock();
+  return 0;
+}
+
+
+// Compose a logshard oid: the reshard oid prefix followed by the shard
+// number as a zero-padded, 10-digit decimal suffix.
+void RGWReshard::get_logshard_oid(int shard_num, string *logshard)
+{
+  char suffix[32];
+  snprintf(suffix, sizeof(suffix), "%010u", (unsigned)shard_num);
+
+  *logshard = string(reshard_oid_prefix) + suffix;
+}
+
+// Iterate all logshards, draining each one in turn.
+// NOTE(review): per-shard errors are logged but not propagated (always
+// returns 0), so one failing logshard doesn't block the rest -- confirm
+// this is intentional for the worker-thread caller.
+int RGWReshard::process_all_logshards(const DoutPrefixProvider *dpp)
+{
+  int ret = 0;
+
+  for (int i = 0; i < num_logshards; i++) {
+    string logshard;
+    get_logshard_oid(i, &logshard);
+
+    ldpp_dout(dpp, 20) << "processing logshard = " << logshard << dendl;
+
+    ret = process_single_logshard(i, dpp);
+
+    ldpp_dout(dpp, 20) << "finish processing logshard = " << logshard << " , ret = " << ret << dendl;
+  }
+
+  return 0;
+}
+
+// True once stop_processor() has initiated shutdown.
+bool RGWReshard::going_down()
+{
+  return down_flag;
+}
+
+// Spawn the background reshard worker thread.
+void RGWReshard::start_processor()
+{
+  worker = new ReshardWorker(store->ctx(), this);
+  worker->create("rgw_reshard");
+}
+
+// Signal the worker to stop, wait for it to exit, and free it. Safe to
+// call when start_processor() was never invoked (worker == nullptr).
+void RGWReshard::stop_processor()
+{
+  down_flag = true;
+  if (worker) {
+    worker->stop();
+    worker->join();
+  }
+  delete worker;
+  worker = nullptr;
+}
+
+// Worker thread main loop: process all logshards, then sleep for the
+// remainder of the configured interval (re-read each round, so config
+// changes take effect) until shutdown is requested.
+void *RGWReshard::ReshardWorker::entry() {
+  do {
+    utime_t start = ceph_clock_now();
+    reshard->process_all_logshards(this);
+
+    if (reshard->going_down())
+      break;
+
+    // sleep only for whatever is left of the interval after the work
+    utime_t end = ceph_clock_now();
+    end -= start;
+    int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
+
+    if (secs <= end.sec())
+      continue; // next round
+
+    secs -= end.sec();
+
+    // interruptible sleep; stop() wakes us early via the condvar
+    std::unique_lock locker{lock};
+    cond.wait_for(locker, std::chrono::seconds(secs));
+  } while (!reshard->going_down());
+
+  return NULL;
+}
+
+// Wake the worker out of its sleep so it can observe going_down().
+// NOTE(review): down_flag is set by the caller outside this lock; a missed
+// wakeup is bounded by the wait_for timeout -- confirm acceptable.
+void RGWReshard::ReshardWorker::stop()
+{
+  std::lock_guard l{lock};
+  cond.notify_all();
+}
+
+// DoutPrefixProvider boilerplate for the worker's log output.
+CephContext *RGWReshard::ReshardWorker::get_cct() const
+{
+  return cct;
+}
+
+unsigned RGWReshard::ReshardWorker::get_subsys() const
+{
+  return dout_subsys;
+}
+
+std::ostream& RGWReshard::ReshardWorker::gen_prefix(std::ostream& out) const
+{
+  return out << "rgw reshard worker thread: ";
+}
diff --git a/src/rgw/driver/rados/rgw_reshard.h b/src/rgw/driver/rados/rgw_reshard.h
new file mode 100644 (file)
index 0000000..59819f3
--- /dev/null
@@ -0,0 +1,274 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <vector>
+#include <initializer_list>
+#include <functional>
+#include <iterator>
+#include <algorithm>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/asio/basic_waitable_timer.hpp>
+
+#include "include/common_fwd.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_time.h"
+#include "common/async/yield_context.h"
+#include "cls/rgw/cls_rgw_types.h"
+#include "cls/lock/cls_lock_client.h"
+
+#include "rgw_common.h"
+#include "common/fault_injector.h"
+
+
+class RGWReshard;
+namespace rgw { namespace sal {
+  class RadosStore;
+} }
+
+using ReshardFaultInjector = FaultInjector<std::string_view>;
+
+// Cooperative cls_lock wrapper guarding either a bucket reshard or a
+// reshard-queue logshard. The lock is held for a configured duration and
+// must be renewed (renew()) before the halfway threshold to stay held;
+// the ephemeral variant deletes the lock object on unlock.
+class RGWBucketReshardLock {
+  using Clock = ceph::coarse_mono_clock;
+
+  rgw::sal::RadosStore* store;
+  const std::string lock_oid;       // rados object the lock lives on
+  const bool ephemeral;             // delete lock object on unlock?
+  rados::cls::lock::Lock internal_lock;
+  std::chrono::seconds duration;    // from rgw_reshard_bucket_lock_duration
+
+  Clock::time_point start_time;     // when (re-)acquired
+  Clock::time_point renew_thresh;   // renew before this point
+
+  // restart the renewal clock after an acquire/renew
+  void reset_time(const Clock::time_point& now) {
+    start_time = now;
+    renew_thresh = start_time + duration / 2;
+  }
+
+public:
+  RGWBucketReshardLock(rgw::sal::RadosStore* _store,
+                      const std::string& reshard_lock_oid,
+                      bool _ephemeral);
+  // convenience: lock oid derived from the bucket's key
+  RGWBucketReshardLock(rgw::sal::RadosStore* _store,
+                      const RGWBucketInfo& bucket_info,
+                      bool _ephemeral) :
+    RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral)
+  {}
+
+  int lock(const DoutPrefixProvider *dpp);
+  void unlock();
+  int renew(const Clock::time_point&);
+
+  // true once we're past the renewal threshold (half the duration)
+  bool should_renew(const Clock::time_point& now) const {
+    return now >= renew_thresh;
+  }
+}; // class RGWBucketReshardLock
+
+// Drives the reshard of a single bucket: builds a target index layout,
+// copies entries from the current index, and commits or rolls back. Also
+// hosts the static helpers that pick prime shard counts.
+class RGWBucketReshard {
+ public:
+  using Clock = ceph::coarse_mono_clock;
+
+ private:
+  rgw::sal::RadosStore *store;
+  RGWBucketInfo bucket_info;
+  std::map<std::string, bufferlist> bucket_attrs;
+
+  RGWBucketReshardLock reshard_lock;        // per-bucket lock held during reshard
+  RGWBucketReshardLock* outer_reshard_lock; // optional logshard lock to keep renewed
+
+  // using an initializer_list as an array in contiguous memory
+  // allocated in at once
+  static const std::initializer_list<uint16_t> reshard_primes;
+
+  // copy all entries from 'current' into 'target' (see .cc for details)
+  int do_reshard(const rgw::bucket_index_layout_generation& current,
+                 const rgw::bucket_index_layout_generation& target,
+                 int max_entries,
+                 bool verbose,
+                 std::ostream *os,
+                Formatter *formatter,
+                 const DoutPrefixProvider *dpp);
+public:
+
+  // pass nullptr for the final parameter if no outer reshard lock to
+  // manage
+  RGWBucketReshard(rgw::sal::RadosStore* _store,
+                  const RGWBucketInfo& _bucket_info,
+                  const std::map<std::string, bufferlist>& _bucket_attrs,
+                  RGWBucketReshardLock* _outer_reshard_lock);
+  // run the full reshard to num_shards; 'f' injects test faults
+  int execute(int num_shards, ReshardFaultInjector& f,
+              int max_op_entries, const DoutPrefixProvider *dpp,
+              bool verbose = false, std::ostream *out = nullptr,
+              ceph::Formatter *formatter = nullptr,
+             RGWReshard *reshard_log = nullptr);
+  int get_status(const DoutPrefixProvider *dpp, std::list<cls_rgw_bucket_instance_entry> *status);
+  int cancel(const DoutPrefixProvider* dpp);
+
+  // abort an in-progress reshard and unblock writes (no fault injection)
+  static int clear_resharding(rgw::sal::RadosStore* store,
+                             RGWBucketInfo& bucket_info,
+                             std::map<std::string, bufferlist>& bucket_attrs,
+                              const DoutPrefixProvider* dpp);
+
+  // largest prime in reshard_primes (list is sorted ascending)
+  static uint32_t get_max_prime_shards() {
+    return *std::crbegin(reshard_primes);
+  }
+
+  // returns the prime in our list less than or equal to the
+  // parameter; the lowest value that can be returned is 1
+  static uint32_t get_prime_shards_less_or_equal(uint32_t requested_shards) {
+    auto it = std::upper_bound(reshard_primes.begin(), reshard_primes.end(),
+                              requested_shards);
+    if (it == reshard_primes.begin()) {
+      return 1;
+    } else {
+      return *(--it);
+    }
+  }
+
+  // returns the prime in our list greater than or equal to the
+  // parameter; if we do not have such a prime, 0 is returned
+  static uint32_t get_prime_shards_greater_or_equal(
+    uint32_t requested_shards)
+  {
+    auto it = std::lower_bound(reshard_primes.begin(), reshard_primes.end(),
+                              requested_shards);
+    if (it == reshard_primes.end()) {
+      return 0;
+    } else {
+      return *it;
+    }
+  }
+
+  // returns a preferred number of shards given a calculated number of
+  // shards based on max_dynamic_shards and the list of prime values
+  static uint32_t get_preferred_shards(uint32_t suggested_shards,
+                                      uint32_t max_dynamic_shards) {
+
+    // use a prime if max is within our prime range, otherwise use
+    // specified max
+    const uint32_t absolute_max =
+      max_dynamic_shards >= get_max_prime_shards() ?
+      max_dynamic_shards :
+      get_prime_shards_less_or_equal(max_dynamic_shards);
+
+    // if we can use a prime number, use it, otherwise use suggested;
+    // note get_prime_shards_greater_or_equal will return 0 if no prime in
+    // prime range
+    const uint32_t prime_ish_num_shards =
+      std::max(get_prime_shards_greater_or_equal(suggested_shards),
+              suggested_shards);
+
+    // dynamic sharding cannot reshard more than defined maximum
+    const uint32_t final_num_shards =
+      std::min(prime_ish_num_shards, absolute_max);
+
+    return final_num_shards;
+  }
+
+  const std::map<std::string, bufferlist>& get_bucket_attrs() const {
+    return bucket_attrs;
+  }
+
+  // for multisite, the RGWBucketInfo keeps a history of old log generations
+  // until all peers are done with them. prevent this log history from growing
+  // too large by refusing to reshard the bucket until the old logs get trimmed
+  static constexpr size_t max_bilog_history = 4;
+
+  static bool can_reshard(const RGWBucketInfo& bucket,
+                          const RGWSI_Zone* zone_svc);
+}; // RGWBucketReshard
+
+
+// Manages the cluster-wide reshard queue: a set of logshard objects that
+// hold per-bucket reshard requests, plus the background worker thread
+// that drains them.
+class RGWReshard {
+public:
+    using Clock = ceph::coarse_mono_clock;
+
+private:
+    rgw::sal::RadosStore* store;
+    std::string lock_name;
+    rados::cls::lock::Lock instance_lock;
+    int num_logshards;  // from rgw_reshard_num_logs
+
+    // formatting state for admin-initiated runs
+    bool verbose;
+    std::ostream *out;
+    Formatter *formatter;
+
+    void get_logshard_oid(int shard_num, std::string *shard);
+protected:
+  // background thread that periodically processes all logshards
+  class ReshardWorker : public Thread, public DoutPrefixProvider {
+    CephContext *cct;
+    RGWReshard *reshard;
+    ceph::mutex lock = ceph::make_mutex("ReshardWorker");
+    ceph::condition_variable cond;
+
+  public:
+    ReshardWorker(CephContext * const _cct,
+                 RGWReshard * const _reshard)
+      : cct(_cct),
+        reshard(_reshard) {}
+
+    void *entry() override;
+    void stop();
+
+    CephContext *get_cct() const override;
+    unsigned get_subsys() const override;
+    std::ostream& gen_prefix(std::ostream& out) const override;
+  };
+
+  ReshardWorker *worker = nullptr;
+  std::atomic<bool> down_flag = { false };  // set by stop_processor()
+
+  // "<tenant>:<bucket>" key and its mapping to a logshard oid
+  std::string get_logshard_key(const std::string& tenant, const std::string& bucket_name);
+  void get_bucket_logshard_oid(const std::string& tenant, const std::string& bucket_name, std::string *oid);
+
+public:
+  RGWReshard(rgw::sal::RadosStore* _store, bool _verbose = false, std::ostream *_out = nullptr, Formatter *_formatter = nullptr);
+  // queue management: add/update/get/remove entries and list a logshard
+  int add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
+  int update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info);
+  int get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
+  int remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry);
+  int list(const DoutPrefixProvider *dpp, int logshard_num, std::string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated);
+  int clear_bucket_resharding(const DoutPrefixProvider *dpp, const std::string& bucket_instance_oid, cls_rgw_reshard_entry& entry);
+
+  /* reshard thread */
+  int process_entry(const cls_rgw_reshard_entry& entry, int max_entries,
+                    const DoutPrefixProvider *dpp);
+  int process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp);
+  int process_all_logshards(const DoutPrefixProvider *dpp);
+  bool going_down();
+  void start_processor();
+  void stop_processor();
+};
+
+// Lets request threads/coroutines pause while a bucket is resharding,
+// retrying after 'duration'. Synchronous callers block on a condvar;
+// coroutine callers use a per-waiter asio timer that stop() can cancel.
+class RGWReshardWait {
+ public:
+  // the blocking wait uses std::condition_variable::wait_for(), which uses the
+  // std::chrono::steady_clock. use that for the async waits as well
+  using Clock = std::chrono::steady_clock;
+ private:
+  const ceph::timespan duration;
+  ceph::mutex mutex = ceph::make_mutex("RGWReshardWait::lock");
+  ceph::condition_variable cond;
+
+  // intrusive node so stop() can cancel every in-flight async wait
+  struct Waiter : boost::intrusive::list_base_hook<> {
+    using Executor = boost::asio::io_context::executor_type;
+    using Timer = boost::asio::basic_waitable_timer<Clock,
+          boost::asio::wait_traits<Clock>, Executor>;
+    Timer timer;
+    explicit Waiter(boost::asio::io_context& ioc) : timer(ioc) {}
+  };
+  boost::intrusive::list<Waiter> waiters;  // protected by 'mutex'
+
+  bool going_down{false};
+
+public:
+  RGWReshardWait(ceph::timespan duration = std::chrono::seconds(5))
+    : duration(duration) {}
+  ~RGWReshardWait() {
+    // callers must stop() before destruction so no waiter outlives us
+    ceph_assert(going_down);
+  }
+  int wait(optional_yield y);
+  // unblock any threads waiting on reshard
+  void stop();
+};
diff --git a/src/rgw/driver/rados/rgw_rest_bucket.cc b/src/rgw/driver/rados/rgw_rest_bucket.cc
new file mode 100644 (file)
index 0000000..ebe4e42
--- /dev/null
@@ -0,0 +1,413 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_op.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_rest_bucket.h"
+#include "rgw_sal.h"
+
+#include "include/str_list.h"
+
+#include "services/svc_sys_obj.h"
+#include "services/svc_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Admin op: fetch bucket metadata (optionally with stats).
+// Requires the "buckets" read admin capability.
+class RGWOp_Bucket_Info : public RGWRESTOp {
+
+public:
+  RGWOp_Bucket_Info() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_READ);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_bucket_info"; }
+};
+
+// Read "uid", "bucket" and "stats" request args and delegate to
+// RGWBucketAdminOp::info(), which streams the result through `flusher`.
+void RGWOp_Bucket_Info::execute(optional_yield y)
+{
+  std::string uid_str;
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  std::string bucket_name;
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+
+  bool fetch_stats = false;
+  RESTArgs::get_bool(s, "stats", false, &fetch_stats);
+
+  RGWBucketAdminOpState op_state;
+  op_state.set_user_id(uid);
+  op_state.set_bucket_name(bucket_name);
+  op_state.set_fetch_stats(fetch_stats);
+
+  op_ret = RGWBucketAdminOp::info(driver, op_state, flusher, y, this);
+}
+
+// Admin op: fetch the access policy of a bucket or of one object in it.
+// Requires the "buckets" read admin capability.
+class RGWOp_Get_Policy : public RGWRESTOp {
+
+public:
+  RGWOp_Get_Policy() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_READ);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_policy"; }
+};
+
+// Fetch the policy for "bucket" (or for "object" inside it when given) and
+// stream it through `flusher`.
+void RGWOp_Get_Policy::execute(optional_yield y)
+{
+  std::string bucket_name;
+  std::string object_name;
+
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  RESTArgs::get_string(s, "object", object_name, &object_name);
+
+  RGWBucketAdminOpState op_state;
+  op_state.set_bucket_name(bucket_name);
+  op_state.set_object(object_name);
+
+  op_ret = RGWBucketAdminOp::get_policy(driver, op_state, flusher, this);
+}
+
+// Admin op: check a bucket index, optionally repairing it and verifying
+// the objects it references. Requires the "buckets" write admin capability.
+class RGWOp_Check_Bucket_Index : public RGWRESTOp {
+
+public:
+  RGWOp_Check_Bucket_Index() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "check_bucket_index"; }
+};
+
+// Run an index check on "bucket"; ?fix repairs inconsistencies and
+// ?check-objects also validates the objects the index refers to.
+void RGWOp_Check_Bucket_Index::execute(optional_yield y)
+{
+  std::string bucket_name;
+  bool fix_index = false;
+  bool check_objects = false;
+
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  RESTArgs::get_bool(s, "fix", false, &fix_index);
+  RESTArgs::get_bool(s, "check-objects", false, &check_objects);
+
+  RGWBucketAdminOpState op_state;
+  op_state.set_bucket_name(bucket_name);
+  op_state.set_fix_index(fix_index);
+  op_state.set_check_objects(check_objects);
+
+  op_ret = RGWBucketAdminOp::check_index(driver, op_state, flusher, s->yield, s);
+}
+
+// Admin op: link a bucket to a user (optionally renaming it or pinning a
+// bucket id). Requires the "buckets" write admin capability.
+class RGWOp_Bucket_Link : public RGWRESTOp {
+
+public:
+  RGWOp_Bucket_Link() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "link_bucket"; }
+};
+
+void RGWOp_Bucket_Link::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string bucket;
+  std::string bucket_id;
+  std::string new_bucket_name;
+
+  RGWBucketAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "bucket", bucket, &bucket);
+  RESTArgs::get_string(s, "bucket-id", bucket_id, &bucket_id);
+  RESTArgs::get_string(s, "new-bucket-name", new_bucket_name, &new_bucket_name);
+
+  rgw_user uid(uid_str);
+  op_state.set_user_id(uid);
+  op_state.set_bucket_name(bucket);
+  op_state.set_bucket_id(bucket_id);
+  op_state.set_new_bucket_name(new_bucket_name);
+
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWBucketAdminOp::link(driver, op_state, s);
+}
+
+// Admin op: unlink a bucket from its owning user.
+// Requires the "buckets" write admin capability.
+class RGWOp_Bucket_Unlink : public RGWRESTOp {
+
+public:
+  RGWOp_Bucket_Unlink() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "unlink_bucket"; }
+};
+
+void RGWOp_Bucket_Unlink::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string bucket;
+
+  RGWBucketAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "bucket", bucket, &bucket);
+
+  op_state.set_user_id(uid);
+  op_state.set_bucket_name(bucket);
+
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWBucketAdminOp::unlink(driver, op_state, s);
+}
+
+// Admin op: delete a bucket, optionally purging its objects first.
+// Requires the "buckets" write admin capability.
+class RGWOp_Bucket_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_Bucket_Remove() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_bucket"; }
+};
+
+// Delete "bucket"; when ?purge-objects is set its contents are removed
+// first (delete_children).
+void RGWOp_Bucket_Remove::execute(optional_yield y)
+{
+  std::string bucket_name;
+  bool delete_children;
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  RESTArgs::get_bool(s, "purge-objects", false, &delete_children);
+
+  /* FIXME We're abusing the owner of the bucket to pass the user, so that it can be forwarded to
+   * the master.  This user is actually the OP caller, not the bucket owner. */
+  op_ret = driver->get_bucket(s, s->user.get(), string(), bucket_name, &bucket, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "get_bucket returned ret=" << op_ret << dendl;
+    // surface the S3-visible NoSuchBucket error instead of raw -ENOENT
+    if (op_ret == -ENOENT) {
+      op_ret = -ERR_NO_SUCH_BUCKET;
+    }
+    return;
+  }
+
+  op_ret = bucket->remove_bucket(s, delete_children, true, &s->info, s->yield);
+}
+
+// Admin op: set the quota on a bucket.
+// Requires the "buckets" write admin capability.
+class RGWOp_Set_Bucket_Quota : public RGWRESTOp {
+
+public:
+  RGWOp_Set_Bucket_Quota() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "set_bucket_quota"; }
+};
+
+// Maximum JSON payload size accepted for the quota document.
+#define QUOTA_INPUT_MAX_LEN 1024
+
+// Set a bucket quota. "uid" and "bucket" query args are mandatory. Quota
+// values come from a JSON request body when one is present; otherwise they
+// come from HTTP query params, with omitted params keeping the bucket's
+// current quota settings.
+void RGWOp_Set_Bucket_Quota::execute(optional_yield y)
+{
+  bool uid_arg_existed = false;
+  std::string uid_str;
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str, &uid_arg_existed);
+  if (! uid_arg_existed) {
+    op_ret = -EINVAL;
+    return;
+  }
+  rgw_user uid(uid_str);
+  bool bucket_arg_existed = false;
+  std::string bucket_name;
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name, &bucket_arg_existed);
+  if (! bucket_arg_existed) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  bool use_http_params;
+
+  if (s->content_length > 0) {
+    use_http_params = false;
+  } else {
+    // no declared body: use query params unless the body is chunked
+    const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
+    use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
+  }
+  RGWQuotaInfo quota;
+  if (!use_http_params) {
+    bool empty;
+    op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty);
+    if (op_ret < 0) {
+      if (!empty)
+        return;
+      /* was probably chunked input, but no content provided, configure via http params */
+      use_http_params = true;
+    }
+  }
+  if (use_http_params) {
+    // seed defaults from the bucket's existing quota so that any param the
+    // caller omits keeps its current value
+    std::unique_ptr<rgw::sal::Bucket> bucket;
+    op_ret = driver->get_bucket(s, nullptr, uid.tenant, bucket_name, &bucket, s->yield);
+    if (op_ret < 0) {
+      return;
+    }
+    RGWQuotaInfo *old_quota = &bucket->get_info().quota;
+    int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size);
+    int64_t max_size_kb;
+    bool has_max_size_kb = false;
+    RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects);
+    RESTArgs::get_int64(s, "max-size", old_quota->max_size, &quota.max_size);
+    RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb, &has_max_size_kb);
+    // an explicit max-size-kb overrides max-size
+    if (has_max_size_kb)
+      quota.max_size = max_size_kb * 1024;
+    RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled);
+  }
+
+  RGWBucketAdminOpState op_state;
+  op_state.set_user_id(uid);
+  op_state.set_bucket_name(bucket_name);
+  op_state.set_quota(quota);
+
+  op_ret = RGWBucketAdminOp::set_quota(driver, op_state, s);
+}
+
+// Admin op: enable or disable multisite sync for a bucket.
+// Requires the "buckets" write admin capability.
+class RGWOp_Sync_Bucket : public RGWRESTOp {
+
+public:
+  RGWOp_Sync_Bucket() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "sync_bucket"; }
+};
+
+// Toggle bucket sync: "bucket"/"tenant" select the bucket, "sync"
+// (default true) selects the desired state.
+void RGWOp_Sync_Bucket::execute(optional_yield y)
+{
+  std::string bucket_name;
+  std::string tenant_name;
+  bool sync_enabled = true;
+
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  RESTArgs::get_string(s, "tenant", tenant_name, &tenant_name);
+  RESTArgs::get_bool(s, "sync", true, &sync_enabled);
+
+  RGWBucketAdminOpState op_state;
+  op_state.set_bucket_name(bucket_name);
+  op_state.set_tenant(tenant_name);
+  op_state.set_sync_bucket(sync_enabled);
+
+  op_ret = RGWBucketAdminOp::sync_bucket(driver, op_state, s);
+}
+
+// Admin op: remove a single object from a bucket.
+// Requires the "buckets" write admin capability.
+class RGWOp_Object_Remove: public RGWRESTOp {
+
+public:
+  RGWOp_Object_Remove() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_object"; }
+};
+
+// Remove object "object" from bucket "bucket" via the admin bucket API.
+void RGWOp_Object_Remove::execute(optional_yield y)
+{
+  std::string bucket_name;
+  std::string object_name;
+
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  RESTArgs::get_string(s, "object", object_name, &object_name);
+
+  RGWBucketAdminOpState op_state;
+  op_state.set_bucket_name(bucket_name);
+  op_state.set_object(object_name);
+
+  op_ret = RGWBucketAdminOp::remove_object(driver, op_state, s);
+}
+
+
+// GET dispatcher: ?policy → policy fetch, ?index → index check, otherwise
+// plain bucket info. The order fixes precedence when several sub-resources
+// appear in one request.
+RGWOp *RGWHandler_Bucket::op_get()
+{
+  if (s->info.args.sub_resource_exists("policy")) {
+    return new RGWOp_Get_Policy;
+  }
+  if (s->info.args.sub_resource_exists("index")) {
+    return new RGWOp_Check_Bucket_Index;
+  }
+  return new RGWOp_Bucket_Info;
+}
+
+// PUT dispatcher: ?quota → quota update, ?sync → sync toggle, otherwise
+// bucket link.
+RGWOp *RGWHandler_Bucket::op_put()
+{
+  if (s->info.args.sub_resource_exists("quota")) {
+    return new RGWOp_Set_Bucket_Quota;
+  }
+  if (s->info.args.sub_resource_exists("sync")) {
+    return new RGWOp_Sync_Bucket;
+  }
+  return new RGWOp_Bucket_Link;
+}
+
+// POST on /admin/bucket always unlinks a bucket from its owner.
+RGWOp *RGWHandler_Bucket::op_post()
+{
+  return new RGWOp_Bucket_Unlink;
+}
+
+// DELETE dispatcher: ?object → remove one object, otherwise remove the
+// bucket itself.
+RGWOp *RGWHandler_Bucket::op_delete()
+{
+  if (s->info.args.sub_resource_exists("object")) {
+    return new RGWOp_Object_Remove;
+  }
+  return new RGWOp_Bucket_Remove;
+}
diff --git a/src/rgw/driver/rados/rgw_rest_bucket.h b/src/rgw/driver/rados/rgw_rest_bucket.h
new file mode 100644 (file)
index 0000000..00f0b64
--- /dev/null
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
+// REST handler for the admin bucket endpoint. Authentication comes from
+// RGWHandler_Auth_S3; each op performs its own capability check.
+class RGWHandler_Bucket : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_put() override;
+  RGWOp *op_post() override;
+  RGWOp *op_delete() override;
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_Bucket() override = default;
+
+  // Admin ops authorize via check_caps(), so generic permission reading is
+  // a no-op here.
+  int read_permissions(RGWOp*, optional_yield y) override {
+    return 0;
+  }
+};
+
+class RGWRESTMgr_Bucket : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Bucket() = default;
+  ~RGWRESTMgr_Bucket() override = default;
+
+  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+                              req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override {
+    return new RGWHandler_Bucket(auth_registry);
+  }
+};
diff --git a/src/rgw/driver/rados/rgw_rest_log.cc b/src/rgw/driver/rados/rgw_rest_log.cc
new file mode 100644 (file)
index 0000000..3563cf0
--- /dev/null
@@ -0,0 +1,1267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/ceph_json.h"
+#include "common/strtol.h"
+#include "rgw_rest.h"
+#include "rgw_op.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_log.h"
+#include "rgw_client_io.h"
+#include "rgw_sync.h"
+#include "rgw_data_sync.h"
+#include "rgw_common.h"
+#include "rgw_zone.h"
+#include "rgw_mdlog.h"
+#include "rgw_datalog_notify.h"
+#include "rgw_trim_bilog.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_bilog_rados.h"
+
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define LOG_CLASS_LIST_MAX_ENTRIES (1000)
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// List metadata-log entries for one mdlog shard. Query args: "id" (shard,
+// required), "period" (defaults to the current period), "marker" (resume
+// point) and "max-entries" (capped at LOG_CLASS_LIST_MAX_ENTRIES).
+void RGWOp_MDLog_List::execute(optional_yield y) {
+  string   period = s->info.args.get("period");
+  string   shard = s->info.args.get("id");
+  string   max_entries_str = s->info.args.get("max-entries");
+  string   marker = s->info.args.get("marker"),
+           err;
+  void    *handle;
+  unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+
+  if (s->info.args.exists("start-time") ||
+      s->info.args.exists("end-time")) {
+    ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+  if (!err.empty()) {
+    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  if (!max_entries_str.empty()) {
+    max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+    if (!err.empty()) {
+      ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    // clamp the caller's request to the server-side ceiling
+    if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
+      max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+    }
+  }
+
+  if (period.empty()) {
+    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+    period = driver->get_zone()->get_current_period_id();
+    if (period.empty()) {
+      ldpp_dout(this, 5) << "Missing period id" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+
+  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+
+  // iterate from `marker`; results land in the `entries` member for
+  // send_response(), with last_marker/truncated recording resume state
+  meta_log.init_list_entries(shard_id, {}, {}, marker, &handle);
+
+  op_ret = meta_log.list_entries(this, handle, max_entries, entries,
+                                   &last_marker, &truncated);
+
+  meta_log.complete_list_entries(handle);
+}
+
+void RGWOp_MDLog_List::send_response() {
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  if (op_ret < 0)
+    return;
+
+  s->formatter->open_object_section("log_entries");
+  s->formatter->dump_string("marker", last_marker);
+  s->formatter->dump_bool("truncated", truncated);
+  {
+    s->formatter->open_array_section("entries");
+    for (list<cls_log_entry>::iterator iter = entries.begin();
+        iter != entries.end(); ++iter) {
+      cls_log_entry& entry = *iter;
+      static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->dump_log_entry(entry, s->formatter);
+      flusher.flush();
+    }
+    s->formatter->close_section();
+  }
+  s->formatter->close_section();
+  flusher.flush();
+}
+
+// Report the configured mdlog shard count and the oldest log period.
+void RGWOp_MDLog_Info::execute(optional_yield y) {
+  num_objects = s->cct->_conf->rgw_md_log_max_shards;
+  period = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->read_oldest_log_period(y, s);
+  op_ret = period.get_error();
+}
+
+// Emit the mdlog summary: shard count plus, when available, the oldest
+// period id and its realm epoch.
+void RGWOp_MDLog_Info::send_response() {
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  s->formatter->open_object_section("mdlog");
+  s->formatter->dump_unsigned("num_objects", num_objects);
+  // period fields only appear if read_oldest_log_period() produced one
+  if (period) {
+    s->formatter->dump_string("period", period.get_period().get_id());
+    s->formatter->dump_unsigned("realm_epoch", period.get_epoch());
+  }
+  s->formatter->close_section();
+  flusher.flush();
+}
+
+// Fetch info for one mdlog shard ("id") of a period (defaulting to the
+// current period); the result is stored in `info` for send_response().
+void RGWOp_MDLog_ShardInfo::execute(optional_yield y) {
+  string period = s->info.args.get("period");
+  string shard = s->info.args.get("id");
+  string err;
+
+  unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+  if (!err.empty()) {
+    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  if (period.empty()) {
+    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+    period = driver->get_zone()->get_current_period_id();
+
+    if (period.empty()) {
+      ldpp_dout(this, 5) << "Missing period id" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+
+  op_ret = meta_log.get_info(this, shard_id, &info);
+}
+
+// Serialize the shard info gathered by execute() as JSON.
+void RGWOp_MDLog_ShardInfo::send_response() {
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  encode_json("info", info, s->formatter);
+  flusher.flush();
+}
+
+void RGWOp_MDLog_Delete::execute(optional_yield y) {
+  string   marker = s->info.args.get("marker"),
+           period = s->info.args.get("period"),
+           shard = s->info.args.get("id"),
+           err;
+  unsigned shard_id;
+
+
+  if (s->info.args.exists("start-time") ||
+      s->info.args.exists("end-time")) {
+    ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+    op_ret = -EINVAL;
+  }
+
+  if (s->info.args.exists("start-marker")) {
+    ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
+    op_ret = -EINVAL;
+  }
+
+  if (s->info.args.exists("end-marker")) {
+    if (!s->info.args.exists("marker")) {
+      marker = s->info.args.get("end-marker");
+    } else {
+      ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
+      op_ret = -EINVAL;
+    }
+  }
+
+  op_ret = 0;
+
+  shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+  if (!err.empty()) {
+    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  if (marker.empty()) { /* bounding end */
+    op_ret = -EINVAL;
+    return;
+  }
+
+  if (period.empty()) {
+    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+    period = driver->get_zone()->get_current_period_id();
+
+    if (period.empty()) {
+      ldpp_dout(this, 5) << "Missing period id" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+
+  op_ret = meta_log.trim(this, shard_id, {}, {}, {}, marker);
+}
+
+// Take an exclusive lease on one mdlog shard for "length" seconds, keyed by
+// "locker-id"/"zone-id"; returns ERR_LOCKED when another locker holds it.
+void RGWOp_MDLog_Lock::execute(optional_yield y) {
+  string period, shard_id_str, duration_str, locker_id, zone_id;
+  unsigned shard_id;
+
+  op_ret = 0;
+
+  period       = s->info.args.get("period");
+  shard_id_str = s->info.args.get("id");
+  duration_str = s->info.args.get("length");
+  locker_id    = s->info.args.get("locker-id");
+  zone_id      = s->info.args.get("zone-id");
+
+  if (period.empty()) {
+    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+    period = driver->get_zone()->get_current_period_id();
+  }
+
+  // every parameter is mandatory
+  if (period.empty() ||
+      shard_id_str.empty() ||
+      (duration_str.empty()) ||
+      locker_id.empty() ||
+      zone_id.empty()) {
+    ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  string err;
+  shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
+  if (!err.empty()) {
+    ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+  unsigned dur;
+  dur = (unsigned)strict_strtol(duration_str.c_str(), 10, &err);
+  // NB: dur is unsigned, so `dur <= 0` only rejects a zero-length lease
+  if (!err.empty() || dur <= 0) {
+    ldpp_dout(this, 5) << "invalid length param " << duration_str << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+  op_ret = meta_log.lock_exclusive(s, shard_id, make_timespan(dur), zone_id,
+                                    locker_id);
+  // translate the rados "busy" result into the REST-visible locked error
+  if (op_ret == -EBUSY)
+    op_ret = -ERR_LOCKED;
+}
+
+// Release a lease previously taken via RGWOp_MDLog_Lock; the same
+// locker-id/zone-id pair must be supplied.
+void RGWOp_MDLog_Unlock::execute(optional_yield y) {
+  string period, shard_id_str, locker_id, zone_id;
+  unsigned shard_id;
+
+  op_ret = 0;
+
+  period       = s->info.args.get("period");
+  shard_id_str = s->info.args.get("id");
+  locker_id    = s->info.args.get("locker-id");
+  zone_id      = s->info.args.get("zone-id");
+
+  if (period.empty()) {
+    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+    period = driver->get_zone()->get_current_period_id();
+  }
+
+  // every parameter is mandatory
+  if (period.empty() ||
+      shard_id_str.empty() ||
+      locker_id.empty() ||
+      zone_id.empty()) {
+    ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  string err;
+  shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
+  if (!err.empty()) {
+    ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+  op_ret = meta_log.unlock(s, shard_id, zone_id, locker_id);
+}
+
+// Receive a peer's notification that mdlog shards changed: parse the JSON
+// set of shard ids from the request body and wake the matching metadata
+// sync shards.
+void RGWOp_MDLog_Notify::execute(optional_yield y) {
+#define LARGE_ENOUGH_BUF (128 * 1024)
+
+  int r = 0;
+  bufferlist data;
+  std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF);
+  if (r < 0) {
+    op_ret = r;
+    return;
+  }
+
+  char* buf = data.c_str();
+  ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
+
+  JSONParser p;
+  r = p.parse(buf, data.length());
+  if (r < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
+    op_ret = r;
+    return;
+  }
+
+  set<int> updated_shards;
+  try {
+    decode_json_obj(updated_shards, &p);
+  } catch (JSONDecoder::err& err) {
+    ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // log the shard ids only when debug level 20 is actually enabled
+  if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+    for (set<int>::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
+      ldpp_dout(this, 20) << __func__ << "(): updated shard=" << *iter << dendl;
+    }
+  }
+
+  driver->wakeup_meta_sync_shards(updated_shards);
+
+  op_ret = 0;
+}
+
+// List bucket-index-log entries. The bucket is selected by "bucket" (plus
+// "tenant") or by "bucket-instance"; "generation" pins a specific log
+// layout, otherwise the latest is used. Listing streams batches directly
+// through send_response() until max-entries is reached or the log is
+// exhausted.
+void RGWOp_BILog_List::execute(optional_yield y) {
+  bool gen_specified = false;
+  string tenant_name = s->info.args.get("tenant"),
+         bucket_name = s->info.args.get("bucket"),
+         marker = s->info.args.get("marker"),
+         max_entries_str = s->info.args.get("max-entries"),
+         bucket_instance = s->info.args.get("bucket-instance"),
+         gen_str = s->info.args.get("generation", &gen_specified),
+         format_version_str = s->info.args.get("format-ver");
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
+
+  unsigned max_entries;
+
+  if (bucket_name.empty() && bucket_instance.empty()) {
+    ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  string err;
+  std::optional<uint64_t> gen;
+  if (gen_specified) {
+    gen = strict_strtoll(gen_str.c_str(), 10, &err);
+    if (!err.empty()) {
+      ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+
+  if (!format_version_str.empty()) {
+    format_ver = strict_strtoll(format_version_str.c_str(), 10, &err);
+    if (!err.empty()) {
+      ldpp_dout(s, 5) << "Failed to parse format-ver param: " << format_ver << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+
+  // a bucket-instance string may embed its own shard id and bucket name
+  int shard_id;
+  string bn;
+  op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!bucket_instance.empty()) {
+    b.name = bn;
+    b.bucket_id = bucket_instance;
+  }
+  op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
+    return;
+  }
+
+  const auto& logs = bucket->get_info().layout.logs;
+  if (logs.empty()) {
+    ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
+    op_ret = -ENOENT;
+    return;
+  }
+
+  // default to the latest log generation unless the caller pinned one
+  auto log = std::prev(logs.end());
+  if (gen) {
+    log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen));
+    if (log == logs.end()) {
+      ldpp_dout(s, 5) << "ERROR: no log layout with gen=" << *gen << dendl;
+      op_ret = -ENOENT;
+      return;
+    }
+  }
+  if (auto next = std::next(log); next != logs.end()) {
+    next_log_layout = *next;   // get the next log after the current latest
+  }
+  auto& log_layout = *log; // current log layout for log listing
+
+  unsigned count = 0;
+
+
+  max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+  if (!err.empty())
+    max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+
+  // the header goes out first; entry batches are streamed as they arrive
+  send_response();
+  do {
+    list<rgw_bi_log_entry> entries;
+    int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->bilog_rados->log_list(s, bucket->get_info(), log_layout, shard_id,
+                                               marker, max_entries - count,
+                                               entries, &truncated);
+    if (ret < 0) {
+      ldpp_dout(this, 5) << "ERROR: list_bi_log_entries()" << dendl;
+      return;
+    }
+
+    count += entries.size();
+
+    send_response(entries, marker);
+  } while (truncated && count < max_entries);
+
+  send_response_end();
+}
+
+// Emit the HTTP status and open the JSON payload exactly once; entry
+// batches are streamed afterwards by the two-argument overload and the
+// document is closed by send_response_end().
+void RGWOp_BILog_List::send_response() {
+  if (sent_header)
+    return;
+
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  sent_header = true;
+
+  if (op_ret < 0)
+    return;
+
+  // format-ver >= 2 wraps the entry array in a "result" object that also
+  // carries the truncated flag and next-log info
+  if (format_ver >= 2) {
+    s->formatter->open_object_section("result");
+  }
+
+  s->formatter->open_array_section("entries");
+}
+
+// Stream one batch of entries, advancing `marker` to the id of the last
+// entry emitted so the caller can resume the listing from there.
+void RGWOp_BILog_List::send_response(list<rgw_bi_log_entry>& entries, string& marker)
+{
+  for (rgw_bi_log_entry& entry : entries) {
+    encode_json("entry", entry, s->formatter);
+    marker = entry.id;
+    flusher.flush();
+  }
+}
+
+// Close the JSON document opened by send_response(); for format-ver >= 2
+// also emit the truncated flag and, when present, the next log generation.
+void RGWOp_BILog_List::send_response_end() {
+  s->formatter->close_section();
+
+  if (format_ver >= 2) {
+    encode_json("truncated", truncated, s->formatter);
+
+    if (next_log_layout) {
+      s->formatter->open_object_section("next_log");
+      encode_json("generation", next_log_layout->gen, s->formatter);
+      encode_json("num_shards", next_log_layout->layout.in_index.layout.num_shards, s->formatter);
+      s->formatter->close_section(); // next_log
+    }
+
+    s->formatter->close_section(); // result
+  }
+
+  flusher.flush();
+}
+
+// Gather bucket-index-log info for a bucket: version markers and stats from
+// the latest index layout, plus the range of log generations available.
+void RGWOp_BILog_Info::execute(optional_yield y) {
+  string tenant_name = s->info.args.get("tenant"),
+         bucket_name = s->info.args.get("bucket"),
+         bucket_instance = s->info.args.get("bucket-instance");
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
+
+  if (bucket_name.empty() && bucket_instance.empty()) {
+    ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // a bucket-instance string may embed its own shard id and bucket name
+  int shard_id;
+  string bn;
+  op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!bucket_instance.empty()) {
+    b.name = bn;
+    b.bucket_id = bucket_instance;
+  }
+  op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
+    return;
+  }
+
+  const auto& logs = bucket->get_info().layout.logs;
+  if (logs.empty()) {
+    ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
+    op_ret = -ENOENT;
+    return;
+  }
+
+  map<RGWObjCategory, RGWStorageStats> stats;
+  const auto& index = log_to_index_layout(logs.back());
+
+  // a missing stats object is tolerated; any other error aborts
+  int ret =  bucket->read_stats(s, index, shard_id, &bucket_ver, &master_ver, stats, &max_marker, &syncstopped);
+  if (ret < 0 && ret != -ENOENT) {
+    op_ret = ret;
+    return;
+  }
+
+  oldest_gen = logs.front().gen;
+  latest_gen = logs.back().gen;
+
+  // advertise every generation with its shard count so sync peers can plan
+  for (auto& log : logs) {
+      uint32_t num_shards = log.layout.in_index.layout.num_shards;
+      generations.push_back({log.gen, num_shards});
+  }
+}
+
+// Serialize the info gathered by execute() as a single JSON object.
+void RGWOp_BILog_Info::send_response() {
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  if (op_ret < 0)
+    return;
+
+  s->formatter->open_object_section("info");
+  encode_json("bucket_ver", bucket_ver, s->formatter);
+  encode_json("master_ver", master_ver, s->formatter);
+  encode_json("max_marker", max_marker, s->formatter);
+  encode_json("syncstopped", syncstopped, s->formatter);
+  encode_json("oldest_gen", oldest_gen, s->formatter);
+  encode_json("latest_gen", latest_gen, s->formatter);
+  encode_json("generations", generations, s->formatter);
+  s->formatter->close_section();
+
+  flusher.flush();
+}
+
+// Trim bucket-index-log entries between "start-marker" and the mandatory
+// "end-marker" for a bucket (or bucket instance), optionally pinned to one
+// log "generation".
+void RGWOp_BILog_Delete::execute(optional_yield y) {
+  bool gen_specified = false;
+  string tenant_name = s->info.args.get("tenant"),
+         bucket_name = s->info.args.get("bucket"),
+         start_marker = s->info.args.get("start-marker"),
+         end_marker = s->info.args.get("end-marker"),
+         bucket_instance = s->info.args.get("bucket-instance"),
+        gen_str = s->info.args.get("generation", &gen_specified);
+
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
+
+  op_ret = 0;
+  if ((bucket_name.empty() && bucket_instance.empty()) ||
+      end_marker.empty()) {
+    ldpp_dout(this, 5) << "ERROR: one of bucket or bucket instance, and also end-marker is mandatory" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  string err;
+  uint64_t gen = 0;
+  if (gen_specified) {
+    gen = strict_strtoll(gen_str.c_str(), 10, &err);
+    if (!err.empty()) {
+      ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+
+  // a bucket-instance string may embed its own shard id and bucket name
+  int shard_id;
+  string bn;
+  op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!bucket_instance.empty()) {
+    b.name = bn;
+    b.bucket_id = bucket_instance;
+  }
+  op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
+    return;
+  }
+
+  op_ret = bilog_trim(this, static_cast<rgw::sal::RadosStore*>(driver),
+                     bucket->get_info(), gen, shard_id,
+                     start_marker, end_marker);
+  if (op_ret < 0) {
+    ldpp_dout(s, 5) << "bilog_trim failed with op_ret=" << op_ret << dendl;
+  }
+
+  return;
+}
+
+// List entries from one data changes log shard.
+// Query args: 'id' (shard id, required), 'max-entries' (capped at
+// LOG_CLASS_LIST_MAX_ENTRIES), 'marker', 'extra-info'.
+void RGWOp_DATALog_List::execute(optional_yield y) {
+  string   shard = s->info.args.get("id");
+
+  string   max_entries_str = s->info.args.get("max-entries"),
+           marker = s->info.args.get("marker"),
+           err;
+  unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+
+  if (s->info.args.exists("start-time") ||
+      s->info.args.exists("end-time")) {
+    ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+    op_ret = -EINVAL;
+    // NOTE(review): no early return here, so op_ret may be overwritten by
+    // list_entries() below -- confirm this fall-through is intended.
+  }
+
+  s->info.args.get_bool("extra-info", &extra_info, false);
+
+  shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+  if (!err.empty()) {
+    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  if (!max_entries_str.empty()) {
+    max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+    if (!err.empty()) {
+      ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
+      max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+    }
+  }
+
+  // Note that last_marker is updated to be the marker of the last
+  // entry listed
+  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->list_entries(this, shard_id,
+                                                    max_entries, entries,
+                                                    marker, &last_marker,
+                                                    &truncated);
+}
+
+// Emit the listed datalog entries as JSON: marker, truncated flag, and an
+// "entries" array (full records when extra-info was requested, otherwise
+// just the entry payload). Flushes after every entry to stream large lists.
+void RGWOp_DATALog_List::send_response() {
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  if (op_ret < 0)
+    return;
+
+  s->formatter->open_object_section("log_entries");
+  s->formatter->dump_string("marker", last_marker);
+  s->formatter->dump_bool("truncated", truncated);
+  {
+    s->formatter->open_array_section("entries");
+    for (const auto& entry : entries) {
+      if (!extra_info) {
+        encode_json("entry", entry.entry, s->formatter);
+      } else {
+        encode_json("entry", entry, s->formatter);
+      }
+      flusher.flush();
+    }
+    s->formatter->close_section();
+  }
+  s->formatter->close_section();
+  flusher.flush();
+}
+
+
+// Report the number of data changes log shards from configuration.
+void RGWOp_DATALog_Info::execute(optional_yield y) {
+  num_objects = s->cct->_conf->rgw_data_log_num_shards;
+  op_ret = 0;
+}
+
+// Emit {"num_objects": N} -- the configured datalog shard count.
+void RGWOp_DATALog_Info::send_response() {
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  s->formatter->open_object_section("num_objects");
+  s->formatter->dump_unsigned("num_objects", num_objects);
+  s->formatter->close_section();
+  flusher.flush();
+}
+
+// Fetch info for a single datalog shard. Query arg: 'id' (shard id).
+void RGWOp_DATALog_ShardInfo::execute(optional_yield y) {
+  string shard = s->info.args.get("id");
+  string err;
+
+  unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+  if (!err.empty()) {
+    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->get_info(this, shard_id, &info);
+}
+
+// Emit the shard info gathered by execute() as JSON under "info".
+void RGWOp_DATALog_ShardInfo::send_response() {
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  encode_json("info", info, s->formatter);
+  flusher.flush();
+}
+
+// Handle a v1 datalog notify POST from a peer zone: read the JSON request
+// body (bounded by LARGE_ENOUGH_BUF), decode it into shard -> entry sets
+// via the v1 decoder, and wake the matching data sync shards.
+void RGWOp_DATALog_Notify::execute(optional_yield y) {
+  string  source_zone = s->info.args.get("source-zone");
+#define LARGE_ENOUGH_BUF (128 * 1024)
+
+  int r = 0;
+  bufferlist data;
+  std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF)
+  if (r < 0) {
+    op_ret = r;
+    return;
+  }
+
+  char* buf = data.c_str();
+  ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
+
+  JSONParser p;
+  r = p.parse(buf, data.length());
+  if (r < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
+    op_ret = r;
+    return;
+  }
+
+  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>> updated_shards;
+  try {
+    // legacy (v1) wire format: keys only, no generation numbers
+    auto decoder = rgw_data_notify_v1_decoder{updated_shards};
+    decode_json_obj(decoder, &p);
+  } catch (JSONDecoder::err& err) {
+    ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // dump the decoded shard map, but only if debug level 20 is enabled
+  if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+    for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
+      ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
+      bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
+      for (const auto& [key, gen] : entries) {
+        ldpp_dout(this, 20) << __func__ << "(): modified key=" << key
+        << " of gen=" << gen << dendl;
+      }
+    }
+  }
+
+  driver->wakeup_data_sync_shards(this, source_zone, updated_shards);
+
+  op_ret = 0;
+}
+
+void RGWOp_DATALog_Notify2::execute(optional_yield y) {
+  string  source_zone = s->info.args.get("source-zone");
+#define LARGE_ENOUGH_BUF (128 * 1024)
+
+  int r = 0;
+  bufferlist data;
+  std::tie(r, data) = rgw_rest_read_all_input(s, LARGE_ENOUGH_BUF);
+  if (r < 0) {
+    op_ret = r;
+    return;
+  }
+
+  char* buf = data.c_str();
+  ldout(s->cct, 20) << __func__ << "(): read data: " << buf << dendl;
+
+  JSONParser p;
+  r = p.parse(buf, data.length());
+  if (r < 0) {
+    ldout(s->cct, 0) << "ERROR: failed to parse JSON" << dendl;
+    op_ret = r;
+    return;
+  }
+
+  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> > updated_shards;
+  try {
+    decode_json_obj(updated_shards, &p);
+  } catch (JSONDecoder::err& err) {
+    ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+    for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter =
+        updated_shards.begin(); iter != updated_shards.end(); ++iter) {
+      ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
+      bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
+      for (const auto& [key, gen] : entries) {
+        ldpp_dout(this, 20) << __func__ << "(): modified key=" << key <<
+        " of generation=" << gen << dendl;
+      }
+    }
+  }
+
+  driver->wakeup_data_sync_shards(this, source_zone, updated_shards);
+
+  op_ret = 0;
+}
+
+// Trim one data changes log shard up to 'marker'.
+// Query args: 'id' (shard, required) and 'marker' (required; 'end-marker'
+// accepted as an alias when 'marker' is absent).
+void RGWOp_DATALog_Delete::execute(optional_yield y) {
+  string   marker = s->info.args.get("marker"),
+           shard = s->info.args.get("id"),
+           err;
+  unsigned shard_id;
+
+  op_ret = 0;
+
+  if (s->info.args.exists("start-time") ||
+      s->info.args.exists("end-time")) {
+    ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+    op_ret = -EINVAL;
+    // NOTE(review): rejected-arg branches set op_ret without returning, so a
+    // later success can overwrite -EINVAL -- confirm this is intended.
+  }
+
+  if (s->info.args.exists("start-marker")) {
+    ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
+    op_ret = -EINVAL;
+  }
+
+  if (s->info.args.exists("end-marker")) {
+    if (!s->info.args.exists("marker")) {
+      marker = s->info.args.get("end-marker");
+    } else {
+      ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
+      op_ret = -EINVAL;
+    }
+  }
+
+  shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+  if (!err.empty()) {
+    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+  if (marker.empty()) { /* bounding end */
+    op_ret = -EINVAL;
+    return;
+  }
+
+  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->trim_entries(this, shard_id, marker);
+}
+
+// not in header to avoid pulling in rgw_sync.h
+// Admin op: report the metadata sync status of this zone.
+// Requires the 'mdlog' read capability.
+class RGWOp_MDLog_Status : public RGWRESTOp {
+  rgw_meta_sync_status status;  // filled by execute(), dumped by send_response()
+public:
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override { return "get_metadata_log_status"; }
+};
+
+// Read the metadata sync status from the RADOS store's meta sync manager.
+// Fails with -ENOENT if no sync manager is configured.
+void RGWOp_MDLog_Status::execute(optional_yield y)
+{
+  auto sync = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_meta_sync_manager();
+  if (sync == nullptr) {
+    ldpp_dout(this, 1) << "no sync manager" << dendl;
+    op_ret = -ENOENT;
+    return;
+  }
+  op_ret = sync->read_sync_status(this, &status);
+}
+
+// Emit the metadata sync status as JSON under "status" on success.
+void RGWOp_MDLog_Status::send_response()
+{
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  if (op_ret >= 0) {
+    encode_json("status", status, s->formatter);
+  }
+  flusher.flush();
+}
+
+// not in header to avoid pulling in rgw_data_sync.h
+// Admin op: report bucket index log (bucket sync) status.
+// Requires the 'bilog' read capability; 'version' selects the response
+// format (v1: incremental status only, v2: full status).
+class RGWOp_BILog_Status : public RGWRESTOp {
+  bilog_status_v2 status;
+  int version = 1;
+public:
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("bilog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override { return "get_bucket_index_log_status"; }
+};
+
+// Compute bucket sync status for a bucket.
+// Query args: 'bucket' (or 'source-bucket' as fallback), 'source-zone',
+// 'source-bucket', 'options' ("merge" aggregates status across all pipes
+// from the source into this zone), 'version'.
+// Without "merge": reads full + incremental sync status for the single
+// pipe source-zone/source-bucket -> local-zone/bucket.
+// With "merge": iterates every destination pipe in the local zone and
+// keeps, per shard, the smallest incremental marker seen.
+void RGWOp_BILog_Status::execute(optional_yield y)
+{
+  const auto options = s->info.args.get("options");
+  bool merge = (options == "merge");
+  const auto source_zone = s->info.args.get("source-zone");
+  const auto source_key = s->info.args.get("source-bucket");
+  auto key = s->info.args.get("bucket");
+  op_ret = s->info.args.get_int("version", &version, 1);
+
+  if (key.empty()) {
+    key = source_key;
+  }
+  if (key.empty()) {
+    ldpp_dout(this, 4) << "no 'bucket' provided" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  rgw_bucket b;
+  int shard_id{-1}; // unused
+  op_ret = rgw_bucket_parse_bucket_key(s->cct, key, &b, &shard_id);
+  if (op_ret < 0) {
+    ldpp_dout(this, 4) << "invalid 'bucket' provided" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // read the bucket instance info for num_shards
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 4) << "failed to read bucket info: " << cpp_strerror(op_ret) << dendl;
+    return;
+  }
+
+  rgw_bucket source_bucket;
+
+  if (source_key.empty() ||
+      source_key == key) {
+    source_bucket = bucket->get_key();
+  } else {
+    op_ret = rgw_bucket_parse_bucket_key(s->cct, source_key, &source_bucket, nullptr);
+    if (op_ret < 0) {
+      ldpp_dout(this, 4) << "invalid 'source-bucket' provided (key=" << source_key << ")" << dendl;
+      return;
+    }
+  }
+
+  const auto& local_zone_id = driver->get_zone()->get_id();
+
+  if (!merge) {
+    // single-pipe case: status for exactly one source->dest pipe
+    rgw_sync_bucket_pipe pipe;
+    pipe.source.zone = source_zone;
+    pipe.source.bucket = source_bucket;
+    pipe.dest.zone = local_zone_id;
+    pipe.dest.bucket = bucket->get_key();
+
+    ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;
+
+    op_ret = rgw_read_bucket_full_sync_status(
+      this,
+      static_cast<rgw::sal::RadosStore*>(driver),
+      pipe,
+      &status.sync_status,
+      s->yield);
+    if (op_ret < 0) {
+      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+      return;
+    }
+    status.inc_status.resize(status.sync_status.shards_done_with_gen.size());
+
+    op_ret = rgw_read_bucket_inc_sync_status(
+      this,
+      static_cast<rgw::sal::RadosStore*>(driver),
+      pipe,
+      status.sync_status.incremental_gen,
+      &status.inc_status);
+    if (op_ret < 0) {
+      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+    }
+    return;
+  }
+
+  // merge case: aggregate across all pipes from source_bucket into this zone
+  rgw_zone_id source_zone_id(source_zone);
+
+  RGWBucketSyncPolicyHandlerRef source_handler;
+  op_ret = driver->get_sync_policy_handler(s, source_zone_id, source_bucket, &source_handler, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "could not get bucket sync policy handler (r=" << op_ret << ")" << dendl;
+    return;
+  }
+
+  auto local_dests = source_handler->get_all_dests_in_zone(local_zone_id);
+
+  std::vector<rgw_bucket_shard_sync_info> current_status;
+  for (auto& entry : local_dests) {
+    auto pipe = entry.second;
+
+    ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;
+
+    RGWBucketInfo *pinfo = &bucket->get_info();
+    std::optional<RGWBucketInfo> opt_dest_info;
+
+    if (!pipe.dest.bucket) {
+      /* Uh oh, something went wrong */
+      ldpp_dout(this, 20) << "ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl;
+      op_ret = -EIO;
+      return;
+    }
+
+    if (*pipe.dest.bucket != pinfo->bucket) {
+      // pipe targets a different bucket; fetch that bucket's info instead
+      opt_dest_info.emplace();
+      std::unique_ptr<rgw::sal::Bucket> dest_bucket;
+      op_ret = driver->get_bucket(s, nullptr, *pipe.dest.bucket, &dest_bucket, y);
+      if (op_ret < 0) {
+        ldpp_dout(this, 4) << "failed to read target bucket info (bucket=: " << cpp_strerror(op_ret) << dendl;
+        return;
+      }
+
+      *opt_dest_info = dest_bucket->get_info();
+      pinfo = &(*opt_dest_info);
+      pipe.dest.bucket = pinfo->bucket;
+    }
+
+    op_ret = rgw_read_bucket_full_sync_status(
+      this,
+      static_cast<rgw::sal::RadosStore*>(driver),
+      pipe,
+      &status.sync_status,
+      s->yield);
+    if (op_ret < 0) {
+      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+      return;
+    }
+
+    current_status.resize(status.sync_status.shards_done_with_gen.size());
+    int r = rgw_read_bucket_inc_sync_status(this, static_cast<rgw::sal::RadosStore*>(driver),
+                                           pipe, status.sync_status.incremental_gen, &current_status);
+    if (r < 0) {
+      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << r << dendl;
+      op_ret = r;
+      return;
+    }
+
+    if (status.inc_status.empty()) {
+      status.inc_status = std::move(current_status);
+    } else {
+      // all pipes from the same source must report the same shard count
+      if (current_status.size() != status.inc_status.size()) {
+        op_ret = -EINVAL;
+        ldpp_dout(this, -1) << "ERROR: different number of shards for sync status of buckets "
+         "syncing from the same source: status.size()= "
+                           << status.inc_status.size()
+                           << " current_status.size()="
+                           << current_status.size() << dendl;
+       return;
+      }
+      auto m = status.inc_status.begin();
+      for (auto& cur_shard_status : current_status) {
+        auto& result_shard_status = *m++;
+        // always take the first marker, or any later marker that's smaller
+        if (cur_shard_status.inc_marker.position < result_shard_status.inc_marker.position) {
+          result_shard_status = std::move(cur_shard_status);
+        }
+      }
+    }
+  }
+}
+
+// Emit sync status as JSON: v1 clients get only the incremental per-shard
+// status, v2+ clients get the full bilog_status_v2 structure.
+void RGWOp_BILog_Status::send_response()
+{
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  if (op_ret >= 0) {
+    if (version < 2) {
+      encode_json("status", status.inc_status, s->formatter);
+    } else {
+      encode_json("status", status, s->formatter);
+    }
+  }
+  flusher.flush();
+}
+
+// not in header to avoid pulling in rgw_data_sync.h
+// Admin op: report the data sync status for a given source zone.
+// Requires the 'datalog' read capability.
+class RGWOp_DATALog_Status : public RGWRESTOp {
+  rgw_data_sync_status status;  // filled by execute(), dumped by send_response()
+public:
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  // was 'override ;' -- stray space removed for consistency with the
+  // sibling declarations in this file
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override { return "get_data_changes_log_status"; }
+};
+
+// Read the data sync status for 'source-zone'. Fails with -ENOENT when no
+// sync manager exists for that zone.
+void RGWOp_DATALog_Status::execute(optional_yield y)
+{
+  const auto source_zone = s->info.args.get("source-zone");
+  auto sync = driver->get_data_sync_manager(source_zone);
+  if (sync == nullptr) {
+    ldpp_dout(this, 1) << "no sync manager for source-zone " << source_zone << dendl;
+    op_ret = -ENOENT;
+    return;
+  }
+  op_ret = sync->read_sync_status(this, &status);
+}
+
+// Emit the data sync status as JSON under "status" on success.
+void RGWOp_DATALog_Status::send_response()
+{
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  if (op_ret >= 0) {
+    encode_json("status", status, s->formatter);
+  }
+  flusher.flush();
+}
+
+
+// Route GET requests by the 'type' query arg (metadata / bucket-index /
+// data) and sub-args (id, info, status) to the matching op.
+// Returns NULL (405 upstream) when 'type' is missing or unrecognized.
+RGWOp *RGWHandler_Log::op_get() {
+  bool exists;
+  string type = s->info.args.get("type", &exists);
+
+  if (!exists) {
+    return NULL;
+  }
+
+  if (type.compare("metadata") == 0) {
+    if (s->info.args.exists("id")) {
+      if (s->info.args.exists("info")) {
+        return new RGWOp_MDLog_ShardInfo;
+      } else {
+        return new RGWOp_MDLog_List;
+      }
+    } else if (s->info.args.exists("status")) {
+      return new RGWOp_MDLog_Status;
+    } else {
+      return new RGWOp_MDLog_Info;
+    }
+  } else if (type.compare("bucket-index") == 0) {
+    if (s->info.args.exists("info")) {
+      return new RGWOp_BILog_Info;
+    } else if (s->info.args.exists("status")) {
+      return new RGWOp_BILog_Status;
+    } else {
+      return new RGWOp_BILog_List;
+    }
+  } else if (type.compare("data") == 0) {
+    if (s->info.args.exists("id")) {
+      if (s->info.args.exists("info")) {
+        return new RGWOp_DATALog_ShardInfo;
+      } else {
+        return new RGWOp_DATALog_List;
+      }
+    } else if (s->info.args.exists("status")) {
+      return new RGWOp_DATALog_Status;
+    } else {
+      return new RGWOp_DATALog_Info;
+    }
+  }
+  return NULL;
+}
+
+// Route DELETE requests by 'type' to the matching trim op, or NULL if the
+// type is missing/unknown.
+RGWOp *RGWHandler_Log::op_delete() {
+  bool exists;
+  string type = s->info.args.get("type", &exists);
+
+  if (!exists) {
+    return NULL;
+  }
+
+  if (type.compare("metadata") == 0)
+    return new RGWOp_MDLog_Delete;
+  else if (type.compare("bucket-index") == 0) 
+    return new RGWOp_BILog_Delete;
+  else if (type.compare("data") == 0)
+    return new RGWOp_DATALog_Delete;
+  return NULL;
+}
+
+// Route POST requests: mdlog lock/unlock/notify and datalog notify/notify2.
+// Returns NULL when 'type' is missing or no sub-arg matches.
+RGWOp *RGWHandler_Log::op_post() {
+  bool exists;
+  string type = s->info.args.get("type", &exists);
+
+  if (!exists) {
+    return NULL;
+  }
+
+  if (type.compare("metadata") == 0) {
+    if (s->info.args.exists("lock"))
+      return new RGWOp_MDLog_Lock;
+    else if (s->info.args.exists("unlock"))
+      return new RGWOp_MDLog_Unlock;
+    else if (s->info.args.exists("notify"))
+      return new RGWOp_MDLog_Notify;
+  } else if (type.compare("data") == 0) {
+    if (s->info.args.exists("notify")) {
+      return new RGWOp_DATALog_Notify;
+    } else if (s->info.args.exists("notify2")) {
+      return new RGWOp_DATALog_Notify2;
+    }
+  }
+  return NULL;
+}
+
diff --git a/src/rgw/driver/rados/rgw_rest_log.h b/src/rgw/driver/rados/rgw_rest_log.h
new file mode 100644 (file)
index 0000000..c8a0c4d
--- /dev/null
@@ -0,0 +1,337 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_datalog.h"
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_metadata.h"
+#include "rgw_mdlog.h"
+#include "rgw_data_sync.h"
+
+// REST op: list bucket index log entries (GET ?type=bucket-index).
+// Requires the 'bilog' read capability. Response is streamed: the
+// list-variant send_response() emits batches, send_response_end() closes.
+class RGWOp_BILog_List : public RGWRESTOp {
+  bool sent_header;            // header already written by first batch
+  uint32_t format_ver{0};
+  bool truncated{false};
+  std::optional<rgw::bucket_log_layout_generation> next_log_layout;
+
+public:
+  RGWOp_BILog_List() : sent_header(false) {}
+  ~RGWOp_BILog_List() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("bilog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void send_response() override;
+  virtual void send_response(std::list<rgw_bi_log_entry>& entries, std::string& marker);
+  virtual void send_response_end();
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "list_bucket_index_log";
+  }
+};
+
+// REST op: report bucket index log info (versions, max marker, generation
+// range, per-generation shard layout). Requires the 'bilog' read cap.
+class RGWOp_BILog_Info : public RGWRESTOp {
+  std::string bucket_ver;
+  std::string master_ver;
+  std::string max_marker;
+  bool syncstopped;
+  uint64_t oldest_gen = 0;
+  uint64_t latest_gen = 0;
+  std::vector<store_gen_shards> generations;
+
+public:
+  RGWOp_BILog_Info() : bucket_ver(), master_ver(), syncstopped(false) {}
+  ~RGWOp_BILog_Info() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("bilog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void send_response() override;
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "bucket_index_log_info";
+  }
+};
+
+// REST op: trim bucket index log entries (DELETE ?type=bucket-index).
+// Requires the 'bilog' write capability.
+// NOTE(review): no verify_permission override here (unlike the read ops);
+// presumably the RGWRESTOp base routes through check_caps -- confirm.
+class RGWOp_BILog_Delete : public RGWRESTOp {
+public:
+  RGWOp_BILog_Delete() {}
+  ~RGWOp_BILog_Delete() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("bilog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "trim_bucket_index_log";
+  }
+};
+
+// REST op: list metadata log entries for one shard (GET ?type=metadata&id=N).
+// Requires the 'mdlog' read capability.
+class RGWOp_MDLog_List : public RGWRESTOp {
+  std::list<cls_log_entry> entries;
+  std::string last_marker;   // marker of the last entry listed
+  bool truncated;
+public:
+  RGWOp_MDLog_List() : truncated(false) {}
+  ~RGWOp_MDLog_List() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "list_metadata_log";
+  }
+};
+
+// REST op: report metadata log info (shard count and current period).
+// Requires the 'mdlog' read capability.
+class RGWOp_MDLog_Info : public RGWRESTOp {
+  unsigned num_objects;
+  RGWPeriodHistory::Cursor period;
+public:
+  RGWOp_MDLog_Info() : num_objects(0) {}
+  ~RGWOp_MDLog_Info() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "get_metadata_log_info";
+  }
+};
+
+// REST op: report info for one metadata log shard (GET ?type=metadata&id=N&info).
+// Requires the 'mdlog' read capability.
+class RGWOp_MDLog_ShardInfo : public RGWRESTOp {
+  RGWMetadataLogInfo info;
+public:
+  RGWOp_MDLog_ShardInfo() {}
+  ~RGWOp_MDLog_ShardInfo() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "get_metadata_log_shard_info";
+  }
+};
+
+// REST op: take a lease on an mdlog shard object (POST ?type=metadata&lock).
+// Requires the 'mdlog' write capability.
+class RGWOp_MDLog_Lock : public RGWRESTOp {
+public:
+  RGWOp_MDLog_Lock() {}
+  ~RGWOp_MDLog_Lock() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "lock_mdlog_object";
+  }
+};
+
+// REST op: release a lease on an mdlog shard object (POST ?type=metadata&unlock).
+// Requires the 'mdlog' write capability.
+class RGWOp_MDLog_Unlock : public RGWRESTOp {
+public:
+  RGWOp_MDLog_Unlock() {}
+  ~RGWOp_MDLog_Unlock() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "unlock_mdlog_object";
+  }
+};
+
+// REST op: metadata log change notification from a peer zone
+// (POST ?type=metadata&notify). Requires the 'mdlog' write capability.
+class RGWOp_MDLog_Notify : public RGWRESTOp {
+public:
+  RGWOp_MDLog_Notify() {}
+  ~RGWOp_MDLog_Notify() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "mdlog_notify";
+  }
+  RGWOpType get_type() override { return RGW_OP_SYNC_MDLOG_NOTIFY; }
+};
+
+// REST op: trim metadata log entries (DELETE ?type=metadata).
+// Requires the 'mdlog' write capability.
+class RGWOp_MDLog_Delete : public RGWRESTOp {
+public:
+  RGWOp_MDLog_Delete() {}
+  ~RGWOp_MDLog_Delete() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "trim_metadata_log";
+  }
+};
+
+// REST op: list entries from one data changes log shard
+// (GET ?type=data&id=N). Requires the 'datalog' read capability.
+class RGWOp_DATALog_List : public RGWRESTOp {
+  std::vector<rgw_data_change_log_entry> entries;
+  std::string last_marker;   // marker of the last entry listed
+  bool truncated;
+  bool extra_info;           // 'extra-info' arg: dump full records
+public:
+  RGWOp_DATALog_List() : truncated(false), extra_info(false) {}
+  ~RGWOp_DATALog_List() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "list_data_changes_log";
+  }
+};
+
+// REST op: report the configured number of data changes log shards.
+// Requires the 'datalog' read capability.
+class RGWOp_DATALog_Info : public RGWRESTOp {
+  unsigned num_objects;
+public:
+  RGWOp_DATALog_Info() : num_objects(0) {}
+  ~RGWOp_DATALog_Info() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "get_data_changes_log_info";
+  }
+};
+
+// REST op: report info for one data changes log shard
+// (GET ?type=data&id=N&info). Requires the 'datalog' read capability.
+class RGWOp_DATALog_ShardInfo : public RGWRESTOp {
+  RGWDataChangesLogInfo info;
+public:
+  RGWOp_DATALog_ShardInfo() {}
+  ~RGWOp_DATALog_ShardInfo() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "get_data_changes_log_shard_info";
+  }
+};
+
+// REST op: v1 data changes notification from a peer zone
+// (POST ?type=data&notify). Requires the 'datalog' write capability.
+class RGWOp_DATALog_Notify : public RGWRESTOp {
+public:
+  RGWOp_DATALog_Notify() {}
+  ~RGWOp_DATALog_Notify() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "datalog_notify";
+  }
+  RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY; }
+};
+
+// REST op: v2 data changes notification carrying per-entry generation
+// numbers (POST ?type=data&notify2). Requires the 'datalog' write capability.
+class RGWOp_DATALog_Notify2 : public RGWRESTOp {
+  rgw_data_notify_entry data_notify;
+public:
+  RGWOp_DATALog_Notify2() {}
+  ~RGWOp_DATALog_Notify2() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "datalog_notify2";
+  }
+  RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY2; }
+};
+
+// REST op: trim data changes log entries (DELETE ?type=data).
+// Requires the 'datalog' write capability.
+class RGWOp_DATALog_Delete : public RGWRESTOp {
+public:
+  RGWOp_DATALog_Delete() {}
+  ~RGWOp_DATALog_Delete() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "trim_data_changes_log";
+  }
+};
+
+// Request handler for the log admin resource: dispatches GET/DELETE/POST
+// to the ops above. Per-op authorization happens via each op's
+// check_caps/verify_permission, so read_permissions() is a no-op here.
+class RGWHandler_Log : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_delete() override;
+  RGWOp *op_post() override;
+
+  int read_permissions(RGWOp*, optional_yield) override {
+    return 0;
+  }
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_Log() override = default;
+};
+
+class RGWRESTMgr_Log : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Log() = default;
+  ~RGWRESTMgr_Log() override = default;
+
+  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+                              req_state* const,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string& frontend_prefixs) override {
+    return new RGWHandler_Log(auth_registry);
+  }
+};
index dc4991388a981674e52b73949966be9fb7ce812b..4c0b8d8421f76132c6bd0c53a5b688085228e5c2 100644 (file)
@@ -1,9 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_SERVICE_H
-#define CEPH_RGW_SERVICE_H
-
+#pragma once
 
 #include <string>
 #include <vector>
@@ -215,5 +213,3 @@ struct RGWCtl {
 
   int init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp);
 };
-
-#endif
index 8c4e511ae3eeb5c450609b235e159ba65cc06a13..e6c255cc60145eabac23342094d7f15f90225503 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_SYNC_H
-#define CEPH_RGW_SYNC_H
+#pragma once
 
 #include <atomic>
 
@@ -546,4 +545,3 @@ RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env,
                                                 uint32_t max_entries,
                                                 rgw_mdlog_shard_data *result);
 
-#endif
index 6d974c39a274a62bc3e9216ad9907f325a30b971..494e88608c11ea06e04d963a1f0dd1c0f942948a 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_SYNC_MODULE_H
-#define CEPH_RGW_SYNC_MODULE_H
+#pragma once
 
 #include "rgw_common.h"
 #include "rgw_coroutine.h"
@@ -198,5 +197,3 @@ public:
 };
 
 void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager);
-
-#endif
index 48f0145fdf92ee0dcfd3f19fa04aea2be65b906a..92532ff00e7faa39351d5a41825bbdbeff4e9096 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_SYNC_MODULE_AWS_H
-#define RGW_SYNC_MODULE_AWS_H
+#pragma once
 
 #include "rgw_sync_module.h"
 
@@ -107,5 +106,3 @@ class RGWAWSSyncModule : public RGWSyncModule {
   bool supports_data_export() override { return false;}
   int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
 };
-
-#endif /* RGW_SYNC_MODULE_AWS_H */
index 6c0c422c39ccb5841c1fb20698a574b9e99ca7b4..c8c9fcc439c5500d73af9501e7f353ed864046e0 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_SYNC_MODULE_ES_H
-#define CEPH_RGW_SYNC_MODULE_ES_H
+#pragma once
 
 #include "rgw_sync_module.h"
 
@@ -58,5 +57,3 @@ public:
     return true;
   }
 };
-
-#endif
index ecf3bb78911eff9a9c21b2e8b3b062631c65750d..ab475959da3743eba0b8779070bfe1279e48d397 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_SYNC_MODULE_LOG_H
-#define CEPH_RGW_SYNC_MODULE_LOG_H
+#pragma once
 
 #include "rgw_sync_module.h"
 
@@ -14,5 +13,3 @@ public:
   }
   int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
 };
-
-#endif
index 9617dac70dbc9a1ca9453555914b5e022f7cc374..1fcc8bed83012f0bf0c5d06bfa2564718af542e7 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_SYNC_LOG_H
-#define CEPH_RGW_SYNC_LOG_H
+#pragma once
 
 #include <atomic>
 
@@ -140,6 +139,3 @@ public:
           bufferlist& out) override;
   std::string get_active_names();
 };
-
-
-#endif
index 6aeb9b8910058177e3e852e8a33f5c8d38ed4c89..d96912cb866bea9b27c57b74882b101bb4c79f27 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_TOOLS_H
-#define CEPH_RGW_TOOLS_H
+#pragma once
 
 #include <string>
 
@@ -274,4 +273,3 @@ void rgw_complete_aio_completion(librados::AioCompletion* c, int r);
 // (Currently providing nullptr will wipe all attributes.)
 
 std::map<std::string, ceph::buffer::list>* no_change_attrs();
-#endif
index 5b9c4cdd7ec1b210b0e8abbd1634d80638205c23..6a11d24766be2085911916b37c0c3141c164b53b 100644 (file)
@@ -14,8 +14,7 @@
  * Foundation.  See file COPYING.
  */
 
-#ifndef RGW_SYNC_LOG_TRIM_H
-#define RGW_SYNC_LOG_TRIM_H
+#pragma once
 
 #include <memory>
 #include <string_view>
@@ -120,5 +119,3 @@ WRITE_CLASS_ENCODER(rgw::BucketTrimStatus);
 int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store,
               RGWBucketInfo& bucket_info, uint64_t gen, int shard_id,
               std::string_view start_marker, std::string_view end_marker);
-
-#endif // RGW_SYNC_LOG_TRIM_H
index 110124cdbc78df0cdaa2b01dbe1c2428c3589fe6..83e3720f71bd29805bc2b466cdae1bc3600d6a6a 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_USER_H
-#define CEPH_RGW_USER_H
+#pragma once
 
 #include <string>
 #include <boost/algorithm/string.hpp>
@@ -882,6 +881,3 @@ class RGWUserMetaHandlerAllocator {
 public:
   static RGWMetadataHandler *alloc(RGWSI_User *user_svc);
 };
-
-
-#endif
index bede6c7a44812b9961af834ee7202fa359b51b9e..2d69d5f1c7230ecf41b0ca734d764394635d42f2 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_ZONE_H
-#define CEPH_RGW_ZONE_H
+#pragma once
 
 #include <ostream>
 #include "rgw_zone_types.h"
@@ -942,5 +941,3 @@ int delete_zone(const DoutPrefixProvider* dpp, optional_yield y,
                 sal::ZoneWriter& writer);
 
 } // namespace rgw
-
-#endif
index aa346a63f13f6721efeb87f65cad963bba2ca174..c520501583b81623ea2ab406f3b0189c311a2f44 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_ACL_H
-#define CEPH_RGW_ACL_H
+#pragma once
 
 #include <map>
 #include <string>
@@ -413,5 +412,3 @@ public:
   friend bool operator!=(const RGWAccessControlPolicy& lhs, const RGWAccessControlPolicy& rhs);
 };
 WRITE_CLASS_ENCODER(RGWAccessControlPolicy)
-
-#endif
index 9521b9f47371c92c87939865c268299dfdec49c3..c234d722b997f3a8a4e08e34ca2e1c31616ddbfd 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_ACL_S3_H
-#define CEPH_RGW_ACL_S3_H
+#pragma once
 
 #include <map>
 #include <string>
@@ -114,5 +113,3 @@ class RGWACLXMLParser_S3 : public RGWXMLParser
 public:
   explicit RGWACLXMLParser_S3(CephContext *_cct) : cct(_cct) {}
 };
-
-#endif
index 8d263e854d260ee3b561921873c7b7a30f1b3c9c..4cb1e4b8f8f57b909b9a365a118daeb7d1549f5c 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_ACL_SWIFT_H
-#define CEPH_RGW_ACL_SWIFT_H
+#pragma once
 
 #include <map>
 #include <vector>
@@ -57,4 +56,3 @@ public:
               const std::string& acl_str);
   boost::optional<std::string> to_str() const;
 };
-#endif
index a595b0351997ba452472c2bf7dd5f9ead1beebcf..f3e92b7e51c11a20bdc5da058c3f1a310b76b12b 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_ASIO_CLIENT_H
-#define RGW_ASIO_CLIENT_H
+#pragma once
 
 #include <boost/asio/ip/tcp.hpp>
 #include <boost/beast/core.hpp>
@@ -58,5 +57,3 @@ class ClientIO : public io::RestfulClient,
 
 } // namespace asio
 } // namespace rgw
-
-#endif // RGW_ASIO_CLIENT_H
index 940b717b5f2b3a10407b4c42a8c0a5b6f78650ac..2de6f337a9fb34f2162abd6d42d39862bcbc1e69 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_ASIO_FRONTEND_H
-#define RGW_ASIO_FRONTEND_H
+#pragma once
 
 #include <memory>
 #include "rgw_frontend.h"
@@ -24,5 +23,3 @@ public:
   void pause_for_new_config() override;
   void unpause_with_new_config() override;
 };
-
-#endif // RGW_ASIO_FRONTEND_H
index b0beb185bac1223ab54d5ce4efed4e751414b1e4..82e0d0c9755a4a5e7277b625bcf57c8fd7517651 100644 (file)
@@ -1,9 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-
-#ifndef CEPH_RGW_AUTH_H
-#define CEPH_RGW_AUTH_H
+#pragma once
 
 #include <functional>
 #include <optional>
@@ -791,5 +789,3 @@ uint32_t rgw_perms_from_aclspec_default_strategy(
   const rgw_user& uid,
   const rgw::auth::Identity::aclspec_t& aclspec,
   const DoutPrefixProvider *dpp);
-
-#endif /* CEPH_RGW_AUTH_H */
index 08f6d659c90d058cc7417cafd153c5bda41a7dff..9e3818bef071d5b3761e0211bb26eadd0199316b 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_AUTH_FILTERS_H
-#define CEPH_RGW_AUTH_FILTERS_H
+#pragma once
 
 #include <type_traits>
 
@@ -301,5 +300,3 @@ SysReqApplier<T> add_sysreq(CephContext* const cct,
 
 } /* namespace auth */
 } /* namespace rgw */
-
-#endif /* CEPH_RGW_AUTH_FILTERS_H */
index 31a4388080a970bc9676af18740c4cb8a1a1a840..f3c9604370b9ac27d66a8aef4f0ebada8158a5a2 100644 (file)
@@ -1,9 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-
-#ifndef CEPH_RGW_AUTH_KEYSTONE_H
-#define CEPH_RGW_AUTH_KEYSTONE_H
+#pragma once
 
 #include <string_view>
 #include <utility>
@@ -196,5 +194,3 @@ public:
 }; /* namespace keystone */
 }; /* namespace auth */
 }; /* namespace rgw */
-
-#endif /* CEPH_RGW_AUTH_KEYSTONE_H */
index 992ee46e81c736faef6471ac4cbfaee00895e607..b9d239aecbd16c4d3919825c274f62c481b10ec5 100644 (file)
@@ -1,9 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-
-#ifndef CEPH_RGW_AUTH_REGISTRY_H
-#define CEPH_RGW_AUTH_REGISTRY_H
+#pragma once
 
 #include <functional>
 #include <memory>
@@ -97,5 +95,3 @@ public:
 
 using rgw_auth_registry_t = rgw::auth::StrategyRegistry;
 using rgw_auth_registry_ptr_t = std::unique_ptr<rgw_auth_registry_t>;
-
-#endif /* CEPH_RGW_AUTH_REGISTRY_H */
index 2984c051662ddc47e08e81e411d2a042fa27499a..a4471467b860edd6e9b52c0286226eb7ec871804 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_AUTH_S3_H
-#define CEPH_RGW_AUTH_S3_H
+#pragma once
 
 #include <array>
 #include <memory>
@@ -645,5 +644,3 @@ get_v2_signature(CephContext*,
 } /* namespace s3 */
 } /* namespace auth */
 } /* namespace rgw */
-
-#endif
index a1699ef61bcc85db31810832158dd7a58db79857..2948f6f315867f1fc93063a69b8539210c3ce929 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_B64_H
-#define RGW_B64_H
+#pragma once
 
 #include <boost/archive/iterators/base64_from_binary.hpp>
 #include <boost/archive/iterators/binary_from_base64.hpp>
@@ -83,5 +82,3 @@ namespace rgw {
     return outstr;
   }
 } /* namespace */
-
-#endif /* RGW_B64_H */
index 168e8bc63b8168c12fcd30d9dd1aec99281bd827..1ccd160ba7ea927aeed80477d9f665265a996002 100644 (file)
@@ -18,8 +18,7 @@
  * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
  */
 
-#ifndef CEPH_RGW_BASIC_TYPES_H
-#define CEPH_RGW_BASIC_TYPES_H
+#pragma once
 
 #include <string>
 #include <fmt/format.h>
@@ -283,5 +282,3 @@ struct RGWUploadPartInfo {
   static void generate_test_instances(std::list<RGWUploadPartInfo*>& o);
 };
 WRITE_CLASS_ENCODER(RGWUploadPartInfo)
-
-#endif /* CEPH_RGW_BASIC_TYPES_H */
index cffa6573d63e897c1cb03bd60f89c2cc8daf2a07..e70beb0644620357740b7ce346698ca1261a35dd 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGWCACHE_H
-#define CEPH_RGWCACHE_H
+#pragma once
 
 #include <string>
 #include <map>
@@ -221,5 +220,3 @@ public:
   void unchain_cache(RGWChainedCache *cache);
   void invalidate_all();
 };
-
-#endif
index 5e47aee0627a4db59fffdd4f4f684bf750898d59..aedfe4500b8d1df48b5725fc437706758b13a0e0 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_CLIENT_IO_H
-#define CEPH_RGW_CLIENT_IO_H
+#pragma once
 
 #include <exception>
 #include <string>
@@ -434,5 +433,3 @@ public:
       std::istream(static_cast<RGWClientIOStreamBuf *>(this)) {
   }
 };
-
-#endif /* CEPH_RGW_CLIENT_IO_H */
index 538d7f16723d64581c1e9618d161912bab5a05d1..55d405e1bb234088e30fa4c03bf4970f89231a06 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_CLIENT_IO_DECOIMPL_H
-#define CEPH_RGW_CLIENT_IO_DECOIMPL_H
+#pragma once
 
 #include <type_traits>
 
@@ -453,4 +452,3 @@ ReorderingFilter<T> add_reordering(T&& t) {
 
 } /* namespace io */
 } /* namespace rgw */
-#endif /* CEPH_RGW_CLIENT_IO_DECOIMPL_H */
index e558f3bbd8ef625872a5c3699aeeeaa79674805e..84250bfe43ccbd26c6c3abc99c244ac1b199a563 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_COMPRESSION_H
-#define CEPH_RGW_COMPRESSION_H
+#pragma once
 
 #include <vector>
 
@@ -61,5 +60,3 @@ public:
   std::optional<int32_t> get_compressor_message() { return compressor_message; }
 
 }; /* RGWPutObj_Compress */
-
-#endif /* CEPH_RGW_COMPRESSION_H */
index d43d0afbf87b6c48fc1d04f620c960c3572a720c..eb3216640c6410255946650ffc7d799e015da7ee 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_COROUTINE_H
-#define CEPH_RGW_COROUTINE_H
+#pragma once
 
 #ifdef _ASSERT_H
 #define NEED_ASSERT_H
@@ -721,5 +720,3 @@ public:
   virtual int finish() { return 0; }
   virtual void request_cleanup() {}
 };
-
-#endif
index 1620795e8b4801e884724cf751af4aab5d0b0ce8..17eaeeb82301c561c1364fd7c9cabe6e56d59ef0 100644 (file)
@@ -13,8 +13,7 @@
  *
  */
 
-#ifndef CEPH_RGW_CORS_H
-#define CEPH_RGW_CORS_H
+#pragma once
 
 #include <map>
 #include <string>
@@ -133,4 +132,3 @@ static inline int validate_name_string(std::string_view o) {
     return -1;
   return 0;
 }
-#endif /*CEPH_RGW_CORS_H*/
index f4ec8a1f95bff5ad0bd3d9e71d48d5b384202145..8d92a3c5fd34bc0466963a50bfc45276d1fcef27 100644 (file)
@@ -13,8 +13,7 @@
  *
  */
 
-#ifndef CEPH_RGW_CORS_S3_H
-#define CEPH_RGW_CORS_S3_H
+#pragma once
 
 #include <map>
 #include <string>
@@ -57,4 +56,3 @@ class RGWCORSXMLParser_S3 : public RGWXMLParser
 public:
   explicit RGWCORSXMLParser_S3(const DoutPrefixProvider *_dpp, CephContext *_cct) : dpp(_dpp), cct(_cct) {}
 };
-#endif /*CEPH_RGW_CORS_S3_H*/
index 3eff9bea32be5a1e15dad32ced58399d932a69f1..f5a1b14a0919ce1489a12b6b3ba0eab6513c9d69 100644 (file)
@@ -13,8 +13,7 @@
  *
  */
 
-#ifndef CEPH_RGW_CORS_SWIFT3_H
-#define CEPH_RGW_CORS_SWIFT3_H
+#pragma once
 
 #include <map>
 #include <string>
@@ -82,4 +81,3 @@ class RGWCORSConfiguration_SWIFT : public RGWCORSConfiguration
       return 0;
     }
 };
-#endif /*CEPH_RGW_CORS_SWIFT3_H*/
index 33d6bc3bbc38c49324b998fc5903186b7aa396ff..6008dd05eaea61afd0e6f7df8b3f686d5c5a83de 100644 (file)
@@ -5,8 +5,7 @@
  * Crypto filters for Put/Post/Get operations.
  */
 
-#ifndef CEPH_RGW_CRYPT_H
-#define CEPH_RGW_CRYPT_H
+#pragma once
 
 #include <string_view>
 
@@ -171,5 +170,3 @@ static inline std::string get_str_attribute(std::map<std::string, bufferlist>& a
 }
 
 int rgw_remove_sse_s3_bucket_key(req_state *s);
-
-#endif
index 1f862089cbdf1f297209ecc52af5234e95caaac0..aa0261fc25188dfcc08ebc560f4a287b9ccd3bf6 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_RGW_CRYPT_SANITIZE_H_
-#define RGW_RGW_CRYPT_SANITIZE_H_
+#pragma once
 
 #include <string_view>
 #include "rgw_common.h"
@@ -67,4 +66,3 @@ std::ostream& operator<<(std::ostream& out, const auth& x);
 std::ostream& operator<<(std::ostream& out, const log_content& x);
 }
 }
-#endif /* RGW_RGW_CRYPT_SANITIZE_H_ */
index ad93a689f9c68afc6295b1a20962f75f0b86a01e..eac8c7610eebaa0cd4b8e17fa76a28ec5ae2e36b 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_CACHEREQUEST_H
-#define RGW_CACHEREQUEST_H
+#pragma once
 
 #include <fcntl.h>
 #include <stdlib.h>
@@ -144,5 +143,3 @@ struct D3nL1CacheRequest {
   }
 
 };
-
-#endif
index 6805c005000350d58100134a57df6e78c6622c45..6fad9cc1895ee3acffe2d6f930a84bf9a552bef8 100644 (file)
@@ -14,8 +14,8 @@
  *
  */
 
-#ifndef RGW_DMCLOCK_H
-#define RGW_DMCLOCK_H
+#pragma once
+
 #include "dmclock/src/dmclock_server.h"
 
 namespace rgw::dmclock {
@@ -50,5 +50,3 @@ inline scheduler_t get_scheduler_t(CephContext* const cct)
 }
 
 } // namespace rgw::dmclock
-
-#endif /* RGW_DMCLOCK_H */
index ccac09be7b0591036091cc5c10d2403e715a7c7f..7bde75870a5a2d8711d5f21439bfd4623b4650e0 100644 (file)
@@ -12,8 +12,7 @@
  *
  */
 
-#ifndef RGW_DMCLOCK_ASYNC_SCHEDULER_H
-#define RGW_DMCLOCK_ASYNC_SCHEDULER_H
+#pragma once
 
 #include "common/async/completion.h"
 
@@ -216,4 +215,3 @@ private:
 };
 
 } // namespace rgw::dmclock
-#endif /* RGW_DMCLOCK_ASYNC_SCHEDULER_H */
index f3dc229db384aa247c04b9cfe21130f76af082da..655e12bef352f7a895179ae5e2c68a73ef1722e2 100644 (file)
@@ -12,8 +12,7 @@
  *
  */
 
-#ifndef RGW_DMCLOCK_SCHEDULER_H
-#define RGW_DMCLOCK_SCHEDULER_H
+#pragma once
 
 #include "common/ceph_time.h"
 #include "common/ceph_context.h"
@@ -85,5 +84,3 @@ private:
 };
 
 } // namespace rgw::dmclock
-
-#endif // RGW_DMCLOCK_SCHEDULER_H
index be3b2cc279415da14c70f4ff6852bf05008e7f8f..f27b81c266e55f642239ee7b0a20e25b3f72b079 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_DMCLOCK_SCHEDULER_CTX_H
-#define RGW_DMCLOCK_SCHEDULER_CTX_H
+#pragma once
 
 #include "common/perf_counters.h"
 #include "common/ceph_context.h"
@@ -118,5 +117,3 @@ private:
 };
 
 } // namespace rgw::dmclock
-
-#endif /* RGW_DMCLOCK_SCHEDULER_CTX_H */
index 8b3edc3a6138317817156a0324785e87adaa9d00..740234965e93972eea0169414c365d23a60dcf91 100644 (file)
@@ -12,8 +12,7 @@
  *
  */
 
-#ifndef RGW_DMCLOCK_SYNC_SCHEDULER_H
-#define RGW_DMCLOCK_SYNC_SCHEDULER_H
+#pragma once
 
 #include "rgw_dmclock_scheduler.h"
 #include "rgw_dmclock_scheduler_ctx.h"
@@ -76,4 +75,3 @@ SyncScheduler::SyncScheduler(CephContext *cct, GetClientCounters&& counters,
 {}
 
 } // namespace rgw::dmclock
-#endif /* RGW_DMCLOCK_SYNC_SCHEDULER_H */
index 27cc36d75260e4297340e1e74f2763182682a7fd..f96e06f7505750c90deb1195058769c850d7b86c 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_ES_QUERY_H
-#define CEPH_RGW_ES_QUERY_H
+#pragma once
 
 #include "rgw_string.h"
 
@@ -163,6 +162,3 @@ public:
     return (restricted_fields && restricted_fields->find(f) != restricted_fields->end());
   }
 };
-
-
-#endif
index fbda86ba4cb39d1bf5e2256e67d38f7f34f4b461..fbbe782b73e0df3d3c2afc97afede53d2646e66c 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_FILE_H
-#define RGW_FILE_H
+#pragma once
 
 #include "include/rados/rgw_file.h"
 
@@ -2855,5 +2854,3 @@ public:
 
 
 } /* namespace rgw */
-
-#endif /* RGW_FILE_H */
index d7e47259d5839af018af57afca8f166325814cb6..12ad224a3dbbe6a3d30709003f8bba2131640487 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_FORMATS_H
-#define CEPH_RGW_FORMATS_H
+#pragma once
 
 #include "common/Formatter.h"
 
@@ -132,5 +131,3 @@ protected:
 public:
   RGWNullFlusher() : RGWFormatterFlusher(nullptr) {}
 };
-
-#endif
index 34d3b06586cbd57aa684124fa5532017a8388365..4876fb8f85378a1af5d8126417729cb989e2d650 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_FRONTEND_H
-#define RGW_FRONTEND_H
+#pragma once
 
 #include <map>
 #include <string>
@@ -210,5 +209,3 @@ class RGWFrontendPauser : public RGWRealmReloader::Pauser {
       pauser->resume(driver);
   }
 };
-
-#endif /* RGW_FRONTEND_H */
index d20b28cf4f9403e6d3907aec7e6c46cef577b805..dbd705a1880d4dfd9db148f78f196a83cbd4339e 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_HTTP_CLIENT_H
-#define CEPH_RGW_HTTP_CLIENT_H
+#pragma once
 
 #include "common/async/yield_context.h"
 #include "common/Cond.h"
@@ -347,4 +346,3 @@ public:
   static int send(RGWHTTPClient *req);
   static int process(RGWHTTPClient *req, optional_yield y);
 };
-#endif
index 2a49a2c36c4da900793dcf602ee4bdcc9c801784..a28826b0d839967fe938f4eed107f15ba3ae0b46 100644 (file)
@@ -13,8 +13,7 @@
  *
  */
 
-#ifndef RGW_HTTP_CLIENT_CURL_H
-#define RGW_HTTP_CLIENT_CURL_H
+#pragma once
 
 #include <map>
 #include <boost/optional.hpp>
@@ -28,5 +27,3 @@ void setup_curl(boost::optional<const fe_map_t&> m);
 void cleanup_curl();
 }
 }
-
-#endif
index d8674552ab6a3291c802ddf0325dd2edecf7c24b..5e052819e052e82f902c8922c7e5e313ce180027 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_HTTP_ERRORS_H_
-#define RGW_HTTP_ERRORS_H_
+#pragma once
 
 #include "rgw_common.h"
 
@@ -43,6 +42,3 @@ static inline int rgw_http_error_to_errno(int http_err)
 
   return 0; /* unreachable */
 }
-
-
-#endif
index 564ddd530c0b9ccdc3e9efe3d5c8d4ba6a79370b..ff6061c7550ea5621f9feae3de2ecb0da25439e9 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_IAM_POLICY_H
-#define CEPH_RGW_IAM_POLICY_H
+#pragma once
 
 #include <bitset>
 #include <chrono>
@@ -580,5 +579,3 @@ bool is_public(const Policy& p);
 
 }
 }
-
-#endif
index 1c94dfe17103956c95c0acd7c43880da0cc239f8..8130ace456c60336fd698e2d99a494f9a593dc72 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_POLICY_S3V2_KEYWORDS_H
-#define CEPH_RGW_POLICY_S3V2_KEYWORDS_H
+#pragma once
 
 namespace rgw {
 namespace IAM {
@@ -138,5 +137,3 @@ enum class Type {
 };
 }
 }
-
-#endif // CEPH_RGW_POLICY_S3V2_KEYWORDS_H
index 84961e4f9d29aa105e9a286ba23d7f3e3d9283b7..0ba88278268dd5edc0ed77379a47058d2086e615 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_KEYSTONE_H
-#define CEPH_RGW_KEYSTONE_H
+#pragma once
 
 #include <atomic>
 #include <string_view>
@@ -332,5 +331,3 @@ public:
 
 }; /* namespace keystone */
 }; /* namespace rgw */
-
-#endif
index efc7db325b90e8652f79cb4968c5bf67d91b5cc4..2992921136e5ddbeda81815c5b59ede5ac50706b 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_KMIP_CLIENT_H
-#define CEPH_RGW_KMIP_CLIENT_H
+#pragma once
 
 class RGWKMIPManager;
 
@@ -64,4 +63,3 @@ public:
 
 void rgw_kmip_client_init(RGWKMIPManager &);
 void rgw_kmip_client_cleanup();
-#endif
index 841df87f4c336a2dcbb704daaa381d6cfe4bbc0a..d36903a4b79592574c579d1910e7aca64a273642 100644 (file)
@@ -1,8 +1,8 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_KMIP_CLIENT_IMPL_H
-#define CEPH_RGW_KMIP_CLIENT_IMPL_H
+#pragma once
+
 struct RGWKmipWorker;
 class RGWKMIPManagerImpl: public RGWKMIPManager {
 protected:
@@ -25,5 +25,3 @@ public:
   void stop();
   friend RGWKmipWorker;
 };
-#endif
-
index ba9b436139ed76be67eaaa1904975b371d47279f..f8e8655f261c9a1154181c5de6583088f5caec87 100644 (file)
@@ -5,8 +5,7 @@
  * Server-side encryption integrations with Key Management Systems (SSE-KMS)
  */
 
-#ifndef CEPH_RGW_KMS_H
-#define CEPH_RGW_KMS_H
+#pragma once
 
 #include <string>
 
@@ -63,4 +62,3 @@ public:
   virtual int get_key(const DoutPrefixProvider *dpp, std::string_view key_id, std::string& actual_key) = 0;
   virtual ~SecretEngine(){};
 };
-#endif
index e74b67fca6d052b0f67e659984e3cc07e09f321b..bd8efd9b6d03ec165924bb242451ec68846b6bed 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_LC_H
-#define CEPH_RGW_LC_H
+#pragma once
 
 #include <map>
 #include <array>
@@ -639,5 +638,3 @@ bool s3_multipart_abort_header(
   std::string& rule_id);
 
 } // namespace rgw::lc
-
-#endif
index 84ffdc6c8ad83eca53f7c80d5a732277922fce82..5486aef3580571aa676d46b39144fd9f0e59ae00 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_LC_S3_H
-#define CEPH_RGW_LC_S3_H
+#pragma once
 
 #include <map>
 #include <string>
@@ -99,5 +98,3 @@ public:
   int rebuild(RGWLifecycleConfiguration& dest);
   void dump_xml(Formatter *f) const;
 };
-
-#endif
index 06986e5f59ac71ab87e18f46186a9fc2dc966663..05a48ce190005c147467f42e1bfd46f040d4bc0e 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_LDAP_H
-#define RGW_LDAP_H
+#pragma once
 
 #include "acconfig.h"
 
@@ -137,5 +136,3 @@ namespace rgw {
 #include "include/ceph_assert.h"
 
 std::string parse_rgw_ldap_bindpw(CephContext* ctx);
-
-#endif /* RGW_LDAP_H */
index 02317ea8e0846481bde61cf5612537e7b4b8eb62..1ad54b49b4857ca9c05c4c220a785d205157a539 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_LIB_H
-#define RGW_LIB_H
+#pragma once
 
 #include <mutex>
 #include "rgw_common.h"
@@ -208,5 +207,3 @@ namespace rgw {
   }; /* RGWLibContinuedReq */
 
 } /* namespace rgw */
-
-#endif /* RGW_LIB_H */
index 57e58c522e59606d5c68ed1b33fe86099b788190..1772724d218ed709307026ea4bcec02831a8cffb 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_LIB_FRONTEND_H
-#define RGW_LIB_FRONTEND_H
+#pragma once
 
 #include <boost/container/flat_map.hpp>
 
@@ -112,5 +111,3 @@ namespace rgw {
   }; /* RGWLibFrontend */
 
 } /* namespace rgw */
-
-#endif /* RGW_LIB_FRONTEND_H */
index 5a0abca57f79c96cb48f0a6f475693501f1a5423..7f3f847c2b1021b746a1b4a29c688072bbaaa1b6 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_LOADGEN_H
-#define CEPH_RGW_LOADGEN_H
+#pragma once
 
 #include <map>
 #include <string>
@@ -71,5 +70,3 @@ public:
 
   size_t complete_request() override;
 };
-
-#endif
index 0c97b2f8a64bbdc41e21949cf9bbe3889c605ca4..1dd79273e6a19514bf7856ded3d9a7485a927293 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_LOG_H
-#define CEPH_RGW_LOG_H
+#pragma once
 
 #include <boost/container/flat_map.hpp>
 #include "rgw_common.h"
@@ -288,6 +287,3 @@ void rgw_log_usage_init(CephContext* cct, rgw::sal::Driver* driver);
 void rgw_log_usage_finalize();
 void rgw_format_ops_log_entry(struct rgw_log_entry& entry,
                              ceph::Formatter *formatter);
-
-#endif /* CEPH_RGW_LOG_H */
-
index 6272b471db01b7d984b52a87f7e8a359fc4805ce..f8a2ae3ee7ba3b94b6bdef143cc9973756ca86ea 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_META_SYNC_STATUS_H
-#define RGW_META_SYNC_STATUS_H
+#pragma once
 
 #include <string>
 
@@ -120,5 +119,3 @@ struct rgw_meta_sync_status {
   static void generate_test_instances(std::list<rgw_meta_sync_status*>& ls);
 };
 WRITE_CLASS_ENCODER(rgw_meta_sync_status)
-
-#endif
index bb371e71c5e5ad6330629ee77bb7c04f6c554691..f57c90e7489569662b8c8c5cc91e37bd044cb634 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_MULTI_H
-#define CEPH_RGW_MULTI_H
+#pragma once
 
 #include <map>
 #include "rgw_xml.h"
@@ -61,5 +60,3 @@ public:
 };
 
 extern bool is_v2_upload_id(const std::string& upload_id);
-
-#endif
index 6187aae37e8d33889732c518a1fa899fbb5aa260..b060decf420ad632aed2f9efa6debfd0f630cf0d 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_MULTI_DELETE_H_
-#define RGW_MULTI_DELETE_H_
+#pragma once
 
 #include <vector>
 #include "rgw_xml.h"
@@ -61,6 +60,3 @@ public:
   RGWMultiDelXMLParser() {}
   ~RGWMultiDelXMLParser() override {}
 };
-
-
-#endif
index 997c660e4cd11a3e0c29676a5fbe586a2fce0b27..27c73feaec92bc9ffa90e1611b16d439353af8c2 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_OBJECT_LOCK_H
-#define CEPH_RGW_OBJECT_LOCK_H
+#pragma once
 
 #include <string>
 #include "common/ceph_time.h"
@@ -221,4 +220,3 @@ public:
   bool is_enabled() const;
 };
 WRITE_CLASS_ENCODER(RGWObjectLegalHold)
-#endif //CEPH_RGW_OBJECT_LOCK_H
index c3b794df0d62a2e444bb329a0f377d67b8b5d548..581ee879a6422bacee5212a40988937860dfdf37 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_OIDC_PROVIDER_H
-#define CEPH_RGW_OIDC_PROVIDER_H
+#pragma once
 
 #include <string>
 
@@ -120,5 +119,3 @@ public:
 WRITE_CLASS_ENCODER(RGWOIDCProvider)
 
 } } // namespace rgw::sal
-#endif /* CEPH_RGW_OIDC_PROVIDER_H */
-
index d0ff70b3132dd34646a8c62d01b8fa8cfe219273..a0e8b273ce0742b755dc9d490349eaad9c7bf776 100644 (file)
@@ -10,8 +10,7 @@
  * to provide additional virtual methods such as send_response or get_params.
  */
 
-#ifndef CEPH_RGW_OP_H
-#define CEPH_RGW_OP_H
+#pragma once
 
 #include <limits.h>
 
@@ -2669,5 +2668,3 @@ int rgw_policy_from_attrset(const DoutPrefixProvider *dpp,
                             CephContext *cct,
                             std::map<std::string, bufferlist>& attrset,
                             RGWAccessControlPolicy *policy);
-
-#endif /* CEPH_RGW_OP_H */
index 38824c58aeb2f9cc44535667ba3fce07767efe5d..6fd3b21bdcf00a19286d82935d83736372a5ccd4 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_OPA_H
-#define RGW_OPA_H
+#pragma once
 
 #include "rgw_common.h"
 #include "rgw_op.h"
@@ -10,5 +9,3 @@
 /* authorize request using OPA */
 int rgw_opa_authorize(RGWOp*& op,
                       req_state* s);
-
-#endif /* RGW_OPA_H */
index b65c7c22accb4d564f80b5a6b9756f92255da164..65df0a7266594813b58b20528b85d5fba60fadf5 100644 (file)
@@ -1,12 +1,9 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_OS_LIB_H
-#define RGW_OS_LIB_H
+#pragma once
 
 #include <functional>
 #include "rgw_common.h"
 #include "rgw_lib.h"
 
-
-#endif /* RGW_OS_LIB_H */
index 0d412c76a3bed84bfca022b93153e4fa40bca846..3d18fbf9e22710330967cba47941ad3973c6a01f 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_PERIOD_HISTORY_H
-#define RGW_PERIOD_HISTORY_H
+#pragma once
 
 #include <deque>
 #include <mutex>
@@ -113,5 +112,3 @@ class RGWPeriodHistory final {
   /// the current_history
   Cursor lookup(epoch_t realm_epoch);
 };
-
-#endif // RGW_PERIOD_HISTORY_H
index 654029dd1c4efacb6ae1f8059abdce66d6275ae8..88138d36b8ca4a00b5cb635e67ad98ddf4bceb15 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_PERIOD_PULLER_H
-#define CEPH_RGW_PERIOD_PULLER_H
+#pragma once
 
 #include "rgw_period_history.h"
 #include "include/common_fwd.h"
@@ -23,5 +22,3 @@ class RGWPeriodPuller : public RGWPeriodHistory::Puller {
 
   int pull(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period, optional_yield y) override;
 };
-
-#endif // CEPH_RGW_PERIOD_PULLER_H
index ae267a11e78e5dd15d2bf746824c717af7aa1f6a..3ea7bd7ddebe09b18b85b3895d1189780aa6f16a 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_PERIOD_PUSHER_H
-#define RGW_PERIOD_PUSHER_H
+#pragma once
 
 #include <memory>
 #include <mutex>
@@ -53,5 +52,3 @@ class RGWPeriodPusher final : public RGWRealmWatcher::Watcher,
   class CRThread; //< contains thread, coroutine manager, http manager
   std::unique_ptr<CRThread> cr_thread; //< thread to run the push coroutines
 };
-
-#endif // RGW_PERIOD_PUSHER_H
index 14ad6c4e3af19018a5bf4f4be8b25ed78336272b..2a8a7ab096fd2decc2db01ebeeab40976ceb4227 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_POLICY_H
-#define CEPH_RGW_POLICY_H
+#pragma once
 
 #include <limits.h>
 
@@ -56,4 +55,3 @@ public:
   int check(RGWPolicyEnv *env, std::string& err_msg);
   int from_json(bufferlist& bl, std::string& err_msg);
 };
-#endif
index 9d45362248e16a5fc4f1a21b4ef75e814515508b..67ebb710a4c29212789f25dec50817a58cd53045 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_PROCESS_H
-#define RGW_PROCESS_H
+#pragma once
 
 #include "rgw_common.h"
 #include "rgw_acl.h"
@@ -158,5 +157,3 @@ extern int rgw_process_authenticated(RGWHandler_REST* handler,
                                      bool skip_retarget = false);
 
 #undef dout_context
-
-#endif /* RGW_PROCESS_H */
diff --git a/src/rgw/rgw_pubsub.cc b/src/rgw/rgw_pubsub.cc
deleted file mode 100644 (file)
index b9aa54b..0000000
+++ /dev/null
@@ -1,723 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "services/svc_zone.h"
-#include "rgw_b64.h"
-#include "rgw_sal.h"
-#include "rgw_sal_rados.h"
-#include "rgw_pubsub.h"
-#include "rgw_tools.h"
-#include "rgw_xml.h"
-#include "rgw_arn.h"
-#include "rgw_pubsub_push.h"
-#include <regex>
-#include <algorithm>
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-void set_event_id(std::string& id, const std::string& hash, const utime_t& ts) {
-  char buf[64];
-  const auto len = snprintf(buf, sizeof(buf), "%010ld.%06ld.%s", (long)ts.sec(), (long)ts.usec(), hash.c_str());
-  if (len > 0) {
-    id.assign(buf, len);
-  }
-}
-
-bool rgw_s3_key_filter::decode_xml(XMLObj* obj) {
-  XMLObjIter iter = obj->find("FilterRule");
-  XMLObj *o;
-
-  const auto throw_if_missing = true;
-  auto prefix_not_set = true;
-  auto suffix_not_set = true;
-  auto regex_not_set = true;
-  std::string name;
-
-  while ((o = iter.get_next())) {
-    RGWXMLDecoder::decode_xml("Name", name, o, throw_if_missing);
-    if (name == "prefix" && prefix_not_set) {
-        prefix_not_set = false;
-        RGWXMLDecoder::decode_xml("Value", prefix_rule, o, throw_if_missing);
-    } else if (name == "suffix" && suffix_not_set) {
-        suffix_not_set = false;
-        RGWXMLDecoder::decode_xml("Value", suffix_rule, o, throw_if_missing);
-    } else if (name == "regex" && regex_not_set) {
-        regex_not_set = false;
-        RGWXMLDecoder::decode_xml("Value", regex_rule, o, throw_if_missing);
-    } else {
-        throw RGWXMLDecoder::err("invalid/duplicate S3Key filter rule name: '" + name + "'");
-    }
-  }
-  return true;
-}
-
-void rgw_s3_key_filter::dump_xml(Formatter *f) const {
-  if (!prefix_rule.empty()) {
-    f->open_object_section("FilterRule");
-    ::encode_xml("Name", "prefix", f);
-    ::encode_xml("Value", prefix_rule, f);
-    f->close_section();
-  }
-  if (!suffix_rule.empty()) {
-    f->open_object_section("FilterRule");
-    ::encode_xml("Name", "suffix", f);
-    ::encode_xml("Value", suffix_rule, f);
-    f->close_section();
-  }
-  if (!regex_rule.empty()) {
-    f->open_object_section("FilterRule");
-    ::encode_xml("Name", "regex", f);
-    ::encode_xml("Value", regex_rule, f);
-    f->close_section();
-  }
-}
-
-bool rgw_s3_key_filter::has_content() const {
-    return !(prefix_rule.empty() && suffix_rule.empty() && regex_rule.empty());
-}
-
-bool rgw_s3_key_value_filter::decode_xml(XMLObj* obj) {
-  kv.clear();
-  XMLObjIter iter = obj->find("FilterRule");
-  XMLObj *o;
-
-  const auto throw_if_missing = true;
-
-  std::string key;
-  std::string value;
-
-  while ((o = iter.get_next())) {
-    RGWXMLDecoder::decode_xml("Name", key, o, throw_if_missing);
-    RGWXMLDecoder::decode_xml("Value", value, o, throw_if_missing);
-    kv.emplace(key, value);
-  }
-  return true;
-}
-
-void rgw_s3_key_value_filter::dump_xml(Formatter *f) const {
-  for (const auto& key_value : kv) {
-    f->open_object_section("FilterRule");
-    ::encode_xml("Name", key_value.first, f);
-    ::encode_xml("Value", key_value.second, f);
-    f->close_section();
-  }
-}
-
-bool rgw_s3_key_value_filter::has_content() const {
-    return !kv.empty();
-}
-
-bool rgw_s3_filter::decode_xml(XMLObj* obj) {
-    RGWXMLDecoder::decode_xml("S3Key", key_filter, obj);
-    RGWXMLDecoder::decode_xml("S3Metadata", metadata_filter, obj);
-    RGWXMLDecoder::decode_xml("S3Tags", tag_filter, obj);
-  return true;
-}
-
-void rgw_s3_filter::dump_xml(Formatter *f) const {
-  if (key_filter.has_content()) {
-      ::encode_xml("S3Key", key_filter, f);
-  }
-  if (metadata_filter.has_content()) {
-      ::encode_xml("S3Metadata", metadata_filter, f);
-  }
-  if (tag_filter.has_content()) {
-      ::encode_xml("S3Tags", tag_filter, f);
-  }
-}
-
-bool rgw_s3_filter::has_content() const {
-    return key_filter.has_content()  ||
-           metadata_filter.has_content() ||
-           tag_filter.has_content();
-}
-
-bool match(const rgw_s3_key_filter& filter, const std::string& key) {
-  const auto key_size = key.size();
-  const auto prefix_size = filter.prefix_rule.size();
-  if (prefix_size != 0) {
-    // prefix rule exists
-    if (prefix_size > key_size) {
-      // if prefix is longer than key, we fail
-      return false;
-    }
-    if (!std::equal(filter.prefix_rule.begin(), filter.prefix_rule.end(), key.begin())) {
-        return false;
-    }
-  }
-  const auto suffix_size = filter.suffix_rule.size();
-  if (suffix_size != 0) {
-    // suffix rule exists
-    if (suffix_size > key_size) {
-      // if suffix is longer than key, we fail
-      return false;
-    }
-    if (!std::equal(filter.suffix_rule.begin(), filter.suffix_rule.end(), (key.end() - suffix_size))) {
-        return false;
-    }
-  }
-  if (!filter.regex_rule.empty()) {
-    // TODO add regex caching in the filter
-    const std::regex base_regex(filter.regex_rule);
-    if (!std::regex_match(key, base_regex)) {
-      return false;
-    }
-  }
-  return true;
-}
-
-bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv) {
-  // all filter pairs must exist with the same value in the object's metadata/tags
-  // object metadata/tags may include items not in the filter
-  return std::includes(kv.begin(), kv.end(), filter.kv.begin(), filter.kv.end());
-}
-
-bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv) {
-  // all filter pairs must exist with the same value in the object's metadata/tags
-  // object metadata/tags may include items not in the filter
-  for (auto& filter : filter.kv) {
-    auto result = kv.equal_range(filter.first);
-    if (std::any_of(result.first, result.second, [&filter](const pair<string,string>& p) { return p.second == filter.second;}))
-      continue;
-    else
-      return false;
-  }
-  return true;
-}
-
-bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event) {
-  // if event list exists, and none of the events in the list matches the event type, filter the message
-  if (!events.empty() && std::find(events.begin(), events.end(), event) == events.end()) {
-    return false;
-  }
-  return true;
-}
-
-void do_decode_xml_obj(rgw::notify::EventTypeList& l, const string& name, XMLObj *obj) {
-  l.clear();
-
-  XMLObjIter iter = obj->find(name);
-  XMLObj *o;
-
-  while ((o = iter.get_next())) {
-    std::string val;
-    decode_xml_obj(val, o);
-    l.push_back(rgw::notify::from_string(val));
-  }
-}
-
-bool rgw_pubsub_s3_notification::decode_xml(XMLObj *obj) {
-  const auto throw_if_missing = true;
-  RGWXMLDecoder::decode_xml("Id", id, obj, throw_if_missing);
-  
-  RGWXMLDecoder::decode_xml("Topic", topic_arn, obj, throw_if_missing);
-  
-  RGWXMLDecoder::decode_xml("Filter", filter, obj);
-
-  do_decode_xml_obj(events, "Event", obj);
-  if (events.empty()) {
-    // if no events are provided, we assume all events
-    events.push_back(rgw::notify::ObjectCreated);
-    events.push_back(rgw::notify::ObjectRemoved);
-  }
-  return true;
-}
-
-void rgw_pubsub_s3_notification::dump_xml(Formatter *f) const {
-  ::encode_xml("Id", id, f);
-  ::encode_xml("Topic", topic_arn.c_str(), f);
-  if (filter.has_content()) {
-      ::encode_xml("Filter", filter, f);
-  }
-  for (const auto& event : events) {
-    ::encode_xml("Event", rgw::notify::to_string(event), f);
-  }
-}
-
-bool rgw_pubsub_s3_notifications::decode_xml(XMLObj *obj) {
-  do_decode_xml_obj(list, "TopicConfiguration", obj);
-  return true;
-}
-
-rgw_pubsub_s3_notification::rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter) :
-    id(topic_filter.s3_id), events(topic_filter.events), topic_arn(topic_filter.topic.arn), filter(topic_filter.s3_filter) {} 
-
-void rgw_pubsub_s3_notifications::dump_xml(Formatter *f) const {
-  do_encode_xml("NotificationConfiguration", list, "TopicConfiguration", f);
-}
-
-void rgw_pubsub_s3_event::dump(Formatter *f) const {
-  encode_json("eventVersion", eventVersion, f);
-  encode_json("eventSource", eventSource, f);
-  encode_json("awsRegion", awsRegion, f);
-  utime_t ut(eventTime);
-  encode_json("eventTime", ut, f);
-  encode_json("eventName", eventName, f);
-  {
-    Formatter::ObjectSection s(*f, "userIdentity");
-    encode_json("principalId", userIdentity, f);
-  }
-  {
-    Formatter::ObjectSection s(*f, "requestParameters");
-    encode_json("sourceIPAddress", sourceIPAddress, f);
-  }
-  {
-    Formatter::ObjectSection s(*f, "responseElements");
-    encode_json("x-amz-request-id", x_amz_request_id, f);
-    encode_json("x-amz-id-2", x_amz_id_2, f);
-  }
-  {
-    Formatter::ObjectSection s(*f, "s3");
-    encode_json("s3SchemaVersion", s3SchemaVersion, f);
-    encode_json("configurationId", configurationId, f);
-    {
-        Formatter::ObjectSection sub_s(*f, "bucket");
-        encode_json("name", bucket_name, f);
-        {
-            Formatter::ObjectSection sub_sub_s(*f, "ownerIdentity");
-            encode_json("principalId", bucket_ownerIdentity, f);
-        }
-        encode_json("arn", bucket_arn, f);
-        encode_json("id", bucket_id, f);
-    }
-    {
-        Formatter::ObjectSection sub_s(*f, "object");
-        encode_json("key", object_key, f);
-        encode_json("size", object_size, f);
-        encode_json("eTag", object_etag, f);
-        encode_json("versionId", object_versionId, f);
-        encode_json("sequencer", object_sequencer, f);
-        encode_json("metadata", x_meta_map, f);
-        encode_json("tags", tags, f);
-    }
-  }
-  encode_json("eventId", id, f);
-  encode_json("opaqueData", opaque_data, f);
-}
-
-void rgw_pubsub_topic::dump(Formatter *f) const
-{
-  encode_json("user", user, f);
-  encode_json("name", name, f);
-  encode_json("dest", dest, f);
-  encode_json("arn", arn, f);
-  encode_json("opaqueData", opaque_data, f);
-}
-
-void rgw_pubsub_topic::dump_xml(Formatter *f) const
-{
-  encode_xml("User", user, f);
-  encode_xml("Name", name, f);
-  encode_xml("EndPoint", dest, f);
-  encode_xml("TopicArn", arn, f);
-  encode_xml("OpaqueData", opaque_data, f);
-}
-
-void encode_xml_key_value_entry(const std::string& key, const std::string& value, Formatter *f) {
-  f->open_object_section("entry");
-  encode_xml("key", key, f);
-  encode_xml("value", value, f);
-  f->close_section(); // entry
-}
-
-void rgw_pubsub_topic::dump_xml_as_attributes(Formatter *f) const
-{
-  f->open_array_section("Attributes");
-  std::string str_user;
-  user.to_str(str_user);
-  encode_xml_key_value_entry("User", str_user, f);
-  encode_xml_key_value_entry("Name", name, f);
-  encode_xml_key_value_entry("EndPoint", dest.to_json_str(), f);
-  encode_xml_key_value_entry("TopicArn", arn, f);
-  encode_xml_key_value_entry("OpaqueData", opaque_data, f);
-  f->close_section(); // Attributes
-}
-
-void encode_json(const char *name, const rgw::notify::EventTypeList& l, Formatter *f)
-{
-  f->open_array_section(name);
-  for (auto iter = l.cbegin(); iter != l.cend(); ++iter) {
-    f->dump_string("obj", rgw::notify::to_string(*iter));
-  }
-  f->close_section();
-}
-
-void rgw_pubsub_topic_filter::dump(Formatter *f) const
-{
-  encode_json("topic", topic, f);
-  encode_json("events", events, f);
-}
-
-void rgw_pubsub_topic_subs::dump(Formatter *f) const
-{
-  encode_json("topic", topic, f);
-  encode_json("subs", subs, f);
-}
-
-void rgw_pubsub_bucket_topics::dump(Formatter *f) const
-{
-  Formatter::ArraySection s(*f, "topics");
-  for (auto& t : topics) {
-    encode_json(t.first.c_str(), t.second, f);
-  }
-}
-
-void rgw_pubsub_topics::dump(Formatter *f) const
-{
-  Formatter::ArraySection s(*f, "topics");
-  for (auto& t : topics) {
-    encode_json(t.first.c_str(), t.second, f);
-  }
-}
-
-void rgw_pubsub_topics::dump_xml(Formatter *f) const
-{
-  for (auto& t : topics) {
-    encode_xml("member", t.second.topic, f);
-  }
-}
-
-void rgw_pubsub_sub_dest::dump(Formatter *f) const
-{
-  encode_json("bucket_name", bucket_name, f);
-  encode_json("oid_prefix", oid_prefix, f);
-  encode_json("push_endpoint", push_endpoint, f);
-  encode_json("push_endpoint_args", push_endpoint_args, f);
-  encode_json("push_endpoint_topic", arn_topic, f);
-  encode_json("stored_secret", stored_secret, f);
-  encode_json("persistent", persistent, f);
-}
-
-void rgw_pubsub_sub_dest::dump_xml(Formatter *f) const
-{
-  // first 2 members are omitted here since they
-  // don't apply to AWS-compliant topics
-  encode_xml("EndpointAddress", push_endpoint, f);
-  encode_xml("EndpointArgs", push_endpoint_args, f);
-  encode_xml("EndpointTopic", arn_topic, f);
-  encode_xml("HasStoredSecret", stored_secret, f);
-  encode_xml("Persistent", persistent, f);
-}
-
-std::string rgw_pubsub_sub_dest::to_json_str() const
-{
-  // first 2 members are omitted here since they
-  // don't apply to AWS-compliant topics
-  JSONFormatter f;
-  f.open_object_section("");
-  encode_json("EndpointAddress", push_endpoint, &f);
-  encode_json("EndpointArgs", push_endpoint_args, &f);
-  encode_json("EndpointTopic", arn_topic, &f);
-  encode_json("HasStoredSecret", stored_secret, &f);
-  encode_json("Persistent", persistent, &f);
-  f.close_section();
-  std::stringstream ss;
-  f.flush(ss);
-  return ss.str();
-}
-
-void rgw_pubsub_sub_config::dump(Formatter *f) const
-{
-  encode_json("user", user, f);
-  encode_json("name", name, f);
-  encode_json("topic", topic, f);
-  encode_json("dest", dest, f);
-  encode_json("s3_id", s3_id, f);
-}
-
-RGWPubSub::RGWPubSub(rgw::sal::RadosStore* _store, const std::string& _tenant)
-  : store(_store), tenant(_tenant), svc_sysobj(store->svc()->sysobj)
-{
-  get_meta_obj(&meta_obj);
-}
-
-int RGWPubSub::remove(const DoutPrefixProvider *dpp, 
-                          const rgw_raw_obj& obj,
-                         RGWObjVersionTracker *objv_tracker,
-                         optional_yield y)
-{
-  int ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, obj.pool, obj.oid, objv_tracker, y);
-  if (ret < 0) {
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWPubSub::read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker *objv_tracker)
-{
-  int ret = read(meta_obj, result, objv_tracker);
-  if (ret < 0) {
-    ldout(store->ctx(), 10) << "WARNING: failed to read topics info: ret=" << ret << dendl;
-    return ret;
-  }
-  return 0;
-}
-
-int RGWPubSub::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics,
-                                    RGWObjVersionTracker *objv_tracker, optional_yield y)
-{
-  int ret = write(dpp, meta_obj, topics, objv_tracker, y);
-  if (ret < 0 && ret != -ENOENT) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
-    return ret;
-  }
-  return 0;
-}
-
-int RGWPubSub::get_topics(rgw_pubsub_topics *result)
-{
-  return read_topics(result, nullptr);
-}
-
-int RGWPubSub::Bucket::read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker *objv_tracker)
-{
-  int ret = ps->read(bucket_meta_obj, result, objv_tracker);
-  if (ret < 0 && ret != -ENOENT) {
-    ldout(ps->store->ctx(), 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl;
-    return ret;
-  }
-  return 0;
-}
-
-int RGWPubSub::Bucket::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics,
-                                       RGWObjVersionTracker *objv_tracker,
-                                       optional_yield y)
-{
-  int ret = ps->write(dpp, bucket_meta_obj, topics, objv_tracker, y);
-  if (ret < 0) {
-    ldout(ps->store->ctx(), 1) << "ERROR: failed to write bucket topics info: ret=" << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWPubSub::Bucket::get_topics(rgw_pubsub_bucket_topics *result)
-{
-  return read_topics(result, nullptr);
-}
-
-int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic_subs *result)
-{
-  rgw_pubsub_topics topics;
-  int ret = get_topics(&topics);
-  if (ret < 0) {
-    ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
-    return ret;
-  }
-
-  auto iter = topics.topics.find(name);
-  if (iter == topics.topics.end()) {
-    ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl;
-    return -ENOENT;
-  }
-
-  *result = iter->second;
-  return 0;
-}
-
-int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic *result)
-{
-  rgw_pubsub_topics topics;
-  int ret = get_topics(&topics);
-  if (ret < 0) {
-    ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
-    return ret;
-  }
-
-  auto iter = topics.topics.find(name);
-  if (iter == topics.topics.end()) {
-    ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl;
-    return -ENOENT;
-  }
-
-  *result = iter->second.topic;
-  return 0;
-}
-
-int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y) {
-  return create_notification(dpp, topic_name, events, std::nullopt, "", y);
-}
-
-int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name,const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y) {
-  rgw_pubsub_topic_subs topic_info;
-
-  int ret = ps->get_topic(topic_name, &topic_info);
-  if (ret < 0) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to read topic '" << topic_name << "' info: ret=" << ret << dendl;
-    return ret;
-  }
-  ldpp_dout(dpp, 20) << "successfully read topic '" << topic_name << "' info" << dendl;
-
-  RGWObjVersionTracker objv_tracker;
-  rgw_pubsub_bucket_topics bucket_topics;
-
-  ret = read_topics(&bucket_topics, &objv_tracker);
-  if (ret < 0) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to read topics from bucket '" << 
-      bucket.name << "': ret=" << ret << dendl;
-    return ret;
-  }
-  ldpp_dout(dpp, 20) << "successfully read " << bucket_topics.topics.size() << " topics from bucket '" << 
-    bucket.name << "'" << dendl;
-
-  auto& topic_filter = bucket_topics.topics[topic_name];
-  topic_filter.topic = topic_info.topic;
-  topic_filter.events = events;
-  topic_filter.s3_id = notif_name;
-  if (s3_filter) {
-    topic_filter.s3_filter = *s3_filter;
-  }
-
-  ret = write_topics(dpp, bucket_topics, &objv_tracker, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to write topics to bucket '" << bucket.name << "': ret=" << ret << dendl;
-    return ret;
-  }
-    
-  ldpp_dout(dpp, 20) << "successfully wrote " << bucket_topics.topics.size() << " topics to bucket '" << bucket.name << "'" << dendl;
-
-  return 0;
-}
-
-int RGWPubSub::Bucket::remove_notification(const DoutPrefixProvider *dpp, const string& topic_name, optional_yield y)
-{
-  rgw_pubsub_topic_subs topic_info;
-
-  int ret = ps->get_topic(topic_name, &topic_info);
-  if (ret < 0) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to read topic info: ret=" << ret << dendl;
-    return ret;
-  }
-
-  RGWObjVersionTracker objv_tracker;
-  rgw_pubsub_bucket_topics bucket_topics;
-
-  ret = read_topics(&bucket_topics, &objv_tracker);
-  if (ret < 0) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl;
-    return ret;
-  }
-
-  bucket_topics.topics.erase(topic_name);
-
-  if (bucket_topics.topics.empty()) {
-    // no more topics - delete the notification object of the bucket
-    ret = ps->remove(dpp, bucket_meta_obj, &objv_tracker, y);
-    if (ret < 0 && ret != -ENOENT) {
-      ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl;
-      return ret;
-    }
-    return 0;
-  }
-
-  // write back the notifications without the deleted one
-  ret = write_topics(dpp, bucket_topics, &objv_tracker, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWPubSub::Bucket::remove_notifications(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  // get all topics on a bucket
-  rgw_pubsub_bucket_topics bucket_topics;
-  auto ret  = get_topics(&bucket_topics);
-  if (ret < 0 && ret != -ENOENT) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to get list of topics from bucket '" << bucket.name << "', ret=" << ret << dendl;
-    return ret ;
-  }
-
-  // remove all auto-generated topics
-  for (const auto& topic : bucket_topics.topics) {
-    const auto& topic_name = topic.first;
-    ret = ps->remove_topic(dpp, topic_name, y);
-    if (ret < 0 && ret != -ENOENT) {
-      ldpp_dout(dpp, 5) << "WARNING: failed to remove auto-generated topic '" << topic_name << "', ret=" << ret << dendl;
-    }
-  }
-
-  // delete the notification object of the bucket
-  ret = ps->remove(dpp, bucket_meta_obj, nullptr, y);
-  if (ret < 0 && ret != -ENOENT) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y) {
-  return create_topic(dpp, name, rgw_pubsub_sub_dest(), "", "", y);
-}
-
-int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y) {
-  RGWObjVersionTracker objv_tracker;
-  rgw_pubsub_topics topics;
-
-  int ret = read_topics(&topics, &objv_tracker);
-  if (ret < 0 && ret != -ENOENT) {
-    // it's not an error if no topics exist, we create one
-    ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
-    return ret;
-  }
-  rgw_pubsub_topic_subs& new_topic = topics.topics[name];
-  new_topic.topic.user = rgw_user("", tenant);
-  new_topic.topic.name = name;
-  new_topic.topic.dest = dest;
-  new_topic.topic.arn = arn;
-  new_topic.topic.opaque_data = opaque_data;
-
-  ret = write_topics(dpp, topics, &objv_tracker, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWPubSub::remove_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y)
-{
-  RGWObjVersionTracker objv_tracker;
-  rgw_pubsub_topics topics;
-
-  int ret = read_topics(&topics, &objv_tracker);
-  if (ret < 0 && ret != -ENOENT) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
-    return ret;
-  } else if (ret == -ENOENT) {
-      // it's not an error if no topics exist, just a no-op
-      ldpp_dout(dpp, 10) << "WARNING: failed to read topics info, deletion is a no-op: ret=" << ret << dendl;
-      return 0;
-  }
-
-  topics.topics.erase(name);
-
-  ret = write_topics(dpp, topics, &objv_tracker, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to remove topics info: ret=" << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-void RGWPubSub::get_meta_obj(rgw_raw_obj *obj) const {
-  *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, meta_oid());
-}
-
-void RGWPubSub::get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const {
-  *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, bucket_meta_oid(bucket));
-}
-
-void RGWPubSub::get_sub_meta_obj(const string& name, rgw_raw_obj *obj) const {
-  *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sub_meta_oid(name));
-}
-
diff --git a/src/rgw/rgw_pubsub.h b/src/rgw/rgw_pubsub.h
deleted file mode 100644 (file)
index c0e9e14..0000000
+++ /dev/null
@@ -1,716 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_PUBSUB_H
-#define CEPH_RGW_PUBSUB_H
-
-#include "services/svc_sys_obj.h"
-#include "rgw_tools.h"
-#include "rgw_zone.h"
-#include "rgw_notify_event_type.h"
-#include <boost/container/flat_map.hpp>
-
-namespace rgw::sal { class RadosStore; }
-
-class XMLObj;
-
-struct rgw_s3_key_filter {
-  std::string prefix_rule;
-  std::string suffix_rule;
-  std::string regex_rule;
-
-  bool has_content() const;
-
-  bool decode_xml(XMLObj *obj);
-  void dump_xml(Formatter *f) const;
-  
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(prefix_rule, bl);
-    encode(suffix_rule, bl);
-    encode(regex_rule, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(prefix_rule, bl);
-    decode(suffix_rule, bl);
-    decode(regex_rule, bl);
-    DECODE_FINISH(bl);
-  }
-};
-WRITE_CLASS_ENCODER(rgw_s3_key_filter)
-
-using KeyValueMap = boost::container::flat_map<std::string, std::string>;
-using KeyMultiValueMap = std::multimap<std::string, std::string>;
-
-struct rgw_s3_key_value_filter {
-  KeyValueMap kv;
-  
-  bool has_content() const;
-  
-  bool decode_xml(XMLObj *obj);
-  void dump_xml(Formatter *f) const;
-  
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(kv, bl);
-    ENCODE_FINISH(bl);
-  }
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(kv, bl);
-    DECODE_FINISH(bl);
-  }
-};
-WRITE_CLASS_ENCODER(rgw_s3_key_value_filter)
-
-struct rgw_s3_filter {
-  rgw_s3_key_filter key_filter;
-  rgw_s3_key_value_filter metadata_filter;
-  rgw_s3_key_value_filter tag_filter;
-
-  bool has_content() const;
-  
-  bool decode_xml(XMLObj *obj);
-  void dump_xml(Formatter *f) const;
-  
-  void encode(bufferlist& bl) const {
-    ENCODE_START(2, 1, bl);
-    encode(key_filter, bl);
-    encode(metadata_filter, bl);
-    encode(tag_filter, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(2, bl);
-    decode(key_filter, bl);
-    decode(metadata_filter, bl);
-    if (struct_v >= 2) {
-        decode(tag_filter, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-};
-WRITE_CLASS_ENCODER(rgw_s3_filter)
-
-using OptionalFilter = std::optional<rgw_s3_filter>;
-
-struct rgw_pubsub_topic_filter;
-/* S3 notification configuration
- * based on: https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTnotification.html
-<NotificationConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
-  <TopicConfiguration>
-    <Filter>
-      <S3Key>
-        <FilterRule>
-          <Name>suffix</Name>
-          <Value>jpg</Value>
-        </FilterRule>
-      </S3Key>
-      <S3Metadata>
-        <FilterRule>
-          <Name></Name>
-          <Value></Value>
-        </FilterRule>
-      </S3Metadata>
-      <S3Tags>
-        <FilterRule>
-          <Name></Name>
-          <Value></Value>
-        </FilterRule>
-      </S3Tags>
-    </Filter>
-    <Id>notification1</Id>
-    <Topic>arn:aws:sns:<region>:<account>:<topic></Topic>
-    <Event>s3:ObjectCreated:*</Event>
-    <Event>s3:ObjectRemoved:*</Event>
-  </TopicConfiguration>
-</NotificationConfiguration>
-*/
-struct rgw_pubsub_s3_notification {
-  // notification id
-  std::string id;
-  // types of events
-  rgw::notify::EventTypeList events;
-  // topic ARN
-  std::string topic_arn;
-  // filter rules
-  rgw_s3_filter filter;
-
-  bool decode_xml(XMLObj *obj);
-  void dump_xml(Formatter *f) const;
-
-  rgw_pubsub_s3_notification() = default;
-  // construct from rgw_pubsub_topic_filter (used by get/list notifications)
-  explicit rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter);
-};
-
-// return true if the key matches the prefix/suffix/regex rules of the key filter
-bool match(const rgw_s3_key_filter& filter, const std::string& key);
-
-// return true if the key matches the metadata rules of the metadata filter
-bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv);
-
-// return true if the key matches the tag rules of the tag filter
-bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv);
-
-// return true if the event type matches (equal or contained in) one of the events in the list
-bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event);
-
-struct rgw_pubsub_s3_notifications {
-  std::list<rgw_pubsub_s3_notification> list;
-  bool decode_xml(XMLObj *obj);
-  void dump_xml(Formatter *f) const;
-};
-
-/* S3 event records structure
- * based on: https://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
-{  
-"Records":[  
-  {
-    "eventVersion":""
-    "eventSource":"",
-    "awsRegion":"",
-    "eventTime":"",
-    "eventName":"",
-    "userIdentity":{  
-      "principalId":""
-    },
-    "requestParameters":{
-      "sourceIPAddress":""
-    },
-    "responseElements":{
-      "x-amz-request-id":"",
-      "x-amz-id-2":""
-    },
-    "s3":{
-      "s3SchemaVersion":"1.0",
-      "configurationId":"",
-      "bucket":{
-        "name":"",
-        "ownerIdentity":{
-          "principalId":""
-        },
-        "arn":""
-        "id": ""
-      },
-      "object":{
-        "key":"",
-        "size": ,
-        "eTag":"",
-        "versionId":"",
-        "sequencer": "",
-        "metadata": ""
-        "tags": ""
-      }
-    },
-    "eventId":"",
-  }
-]
-}*/
-
-struct rgw_pubsub_s3_event {
-  constexpr static const char* const json_type_plural = "Records";
-  std::string eventVersion = "2.2";
-  // aws:s3
-  std::string eventSource = "ceph:s3";
-  // zonegroup
-  std::string awsRegion;
-  // time of the request
-  ceph::real_time eventTime;
-  // type of the event
-  std::string eventName;
-  // user that sent the request
-  std::string userIdentity;
-  // IP address of source of the request (not implemented)
-  std::string sourceIPAddress;
-  // request ID (not implemented)
-  std::string x_amz_request_id;
-  // radosgw that received the request
-  std::string x_amz_id_2;
-  std::string s3SchemaVersion = "1.0";
-  // ID received in the notification request
-  std::string configurationId;
-  // bucket name
-  std::string bucket_name;
-  // bucket owner
-  std::string bucket_ownerIdentity;
-  // bucket ARN
-  std::string bucket_arn;
-  // object key
-  std::string object_key;
-  // object size
-  uint64_t object_size = 0;
-  // object etag
-  std::string object_etag;
-  // object version id, if the bucket is versioned
-  std::string object_versionId;
-  // hexadecimal value used to determine event order for specific key
-  std::string object_sequencer;
-  // this is an rgw extension (not S3 standard)
-  // used to store a globally unique identifier of the event
-  // that could be used for acking or any other identification of the event
-  std::string id;
-  // this is an rgw extension holding the internal bucket id
-  std::string bucket_id;
-  // meta data
-  KeyValueMap x_meta_map;
-  // tags
-  KeyMultiValueMap tags;
-  // opaque data received from the topic
-  // could be used to identify the gateway
-  std::string opaque_data;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(4, 1, bl);
-    encode(eventVersion, bl);
-    encode(eventSource, bl);
-    encode(awsRegion, bl);
-    encode(eventTime, bl);
-    encode(eventName, bl);
-    encode(userIdentity, bl);
-    encode(sourceIPAddress, bl);
-    encode(x_amz_request_id, bl);
-    encode(x_amz_id_2, bl);
-    encode(s3SchemaVersion, bl);
-    encode(configurationId, bl);
-    encode(bucket_name, bl);
-    encode(bucket_ownerIdentity, bl);
-    encode(bucket_arn, bl);
-    encode(object_key, bl);
-    encode(object_size, bl);
-    encode(object_etag, bl);
-    encode(object_versionId, bl);
-    encode(object_sequencer, bl);
-    encode(id, bl);
-    encode(bucket_id, bl);
-    encode(x_meta_map, bl);
-    encode(tags, bl);
-    encode(opaque_data, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(4, bl);
-    decode(eventVersion, bl);
-    decode(eventSource, bl);
-    decode(awsRegion, bl);
-    decode(eventTime, bl);
-    decode(eventName, bl);
-    decode(userIdentity, bl);
-    decode(sourceIPAddress, bl);
-    decode(x_amz_request_id, bl);
-    decode(x_amz_id_2, bl);
-    decode(s3SchemaVersion, bl);
-    decode(configurationId, bl);
-    decode(bucket_name, bl);
-    decode(bucket_ownerIdentity, bl);
-    decode(bucket_arn, bl);
-    decode(object_key, bl);
-    decode(object_size, bl);
-    decode(object_etag, bl);
-    decode(object_versionId, bl);
-    decode(object_sequencer, bl);
-    decode(id, bl);
-    if (struct_v >= 2) {
-      decode(bucket_id, bl);
-      decode(x_meta_map, bl);
-    }
-    if (struct_v >= 3) {
-      decode(tags, bl);
-    }
-    if (struct_v >= 4) {
-      decode(opaque_data, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_s3_event)
-
-// setting a unique ID for an event based on object hash and timestamp
-void set_event_id(std::string& id, const std::string& hash, const utime_t& ts);
-
-struct rgw_pubsub_sub_dest {
-  std::string bucket_name;
-  std::string oid_prefix;
-  std::string push_endpoint;
-  std::string push_endpoint_args;
-  std::string arn_topic;
-  bool stored_secret = false;
-  bool persistent = false;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(5, 1, bl);
-    encode(bucket_name, bl);
-    encode(oid_prefix, bl);
-    encode(push_endpoint, bl);
-    encode(push_endpoint_args, bl);
-    encode(arn_topic, bl);
-    encode(stored_secret, bl);
-    encode(persistent, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(5, bl);
-    decode(bucket_name, bl);
-    decode(oid_prefix, bl);
-    decode(push_endpoint, bl);
-    if (struct_v >= 2) {
-        decode(push_endpoint_args, bl);
-    }
-    if (struct_v >= 3) {
-        decode(arn_topic, bl);
-    }
-    if (struct_v >= 4) {
-        decode(stored_secret, bl);
-    }
-    if (struct_v >= 5) {
-        decode(persistent, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void dump_xml(Formatter *f) const;
-  std::string to_json_str() const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_sub_dest)
-
-struct rgw_pubsub_sub_config {
-  rgw_user user;
-  std::string name;
-  std::string topic;
-  rgw_pubsub_sub_dest dest;
-  std::string s3_id;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(2, 1, bl);
-    encode(user, bl);
-    encode(name, bl);
-    encode(topic, bl);
-    encode(dest, bl);
-    encode(s3_id, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(2, bl);
-    decode(user, bl);
-    decode(name, bl);
-    decode(topic, bl);
-    decode(dest, bl);
-    if (struct_v >= 2) {
-      decode(s3_id, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_sub_config)
-
-struct rgw_pubsub_topic {
-  rgw_user user;
-  std::string name;
-  rgw_pubsub_sub_dest dest;
-  std::string arn;
-  std::string opaque_data;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(3, 1, bl);
-    encode(user, bl);
-    encode(name, bl);
-    encode(dest, bl);
-    encode(arn, bl);
-    encode(opaque_data, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(3, bl);
-    decode(user, bl);
-    decode(name, bl);
-    if (struct_v >= 2) {
-      decode(dest, bl);
-      decode(arn, bl);
-    }
-    if (struct_v >= 3) {
-      decode(opaque_data, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-
-  std::string to_str() const {
-    return user.tenant + "/" + name;
-  }
-
-  void dump(Formatter *f) const;
-  void dump_xml(Formatter *f) const;
-  void dump_xml_as_attributes(Formatter *f) const;
-
-  bool operator<(const rgw_pubsub_topic& t) const {
-    return to_str().compare(t.to_str());
-  }
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_topic)
-
-struct rgw_pubsub_topic_subs {
-  rgw_pubsub_topic topic;
-  std::set<std::string> subs;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(topic, bl);
-    encode(subs, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(topic, bl);
-    decode(subs, bl);
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_topic_subs)
-
-struct rgw_pubsub_topic_filter {
-  rgw_pubsub_topic topic;
-  rgw::notify::EventTypeList events;
-  std::string s3_id;
-  rgw_s3_filter s3_filter;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(3, 1, bl);
-    encode(topic, bl);
-    // events are stored as a vector of std::strings
-    std::vector<std::string> tmp_events;
-    std::transform(events.begin(), events.end(), std::back_inserter(tmp_events), rgw::notify::to_string);
-    encode(tmp_events, bl);
-    encode(s3_id, bl);
-    encode(s3_filter, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(3, bl);
-    decode(topic, bl);
-    // events are stored as a vector of std::strings
-    events.clear();
-    std::vector<std::string> tmp_events;
-    decode(tmp_events, bl);
-    std::transform(tmp_events.begin(), tmp_events.end(), std::back_inserter(events), rgw::notify::from_string);
-    if (struct_v >= 2) {
-      decode(s3_id, bl);
-    }
-    if (struct_v >= 3) {
-      decode(s3_filter, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_topic_filter)
-
-struct rgw_pubsub_bucket_topics {
-  std::map<std::string, rgw_pubsub_topic_filter> topics;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(topics, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(topics, bl);
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_bucket_topics)
-
-struct rgw_pubsub_topics {
-  std::map<std::string, rgw_pubsub_topic_subs> topics;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(topics, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(topics, bl);
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void dump_xml(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_topics)
-
-static std::string pubsub_oid_prefix = "pubsub.";
-
-class RGWPubSub
-{
-  friend class Bucket;
-
-  rgw::sal::RadosStore* store;
-  const std::string tenant;
-  RGWSI_SysObj* svc_sysobj;
-
-  rgw_raw_obj meta_obj;
-
-  std::string meta_oid() const {
-    return pubsub_oid_prefix + tenant;
-  }
-
-  std::string bucket_meta_oid(const rgw_bucket& bucket) const {
-    return pubsub_oid_prefix + tenant + ".bucket." + bucket.name + "/" + bucket.marker;
-  }
-
-  std::string sub_meta_oid(const std::string& name) const {
-    return pubsub_oid_prefix + tenant + ".sub." + name;
-  }
-
-  template <class T>
-  int read(const rgw_raw_obj& obj, T* data, RGWObjVersionTracker* objv_tracker);
-
-  template <class T>
-  int write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info,
-           RGWObjVersionTracker* obj_tracker, optional_yield y);
-
-  int remove(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, RGWObjVersionTracker* objv_tracker,
-            optional_yield y);
-
-  int read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker* objv_tracker);
-  int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics,
-                       RGWObjVersionTracker* objv_tracker, optional_yield y);
-
-public:
-  RGWPubSub(rgw::sal::RadosStore* _store, const std::string& tenant);
-
-  class Bucket {
-    friend class RGWPubSub;
-    RGWPubSub *ps;
-    rgw_bucket bucket;
-    rgw_raw_obj bucket_meta_obj;
-
-    // read the list of topics associated with a bucket and populate into result
-    // use version tacker to enforce atomicity between read/write
-    // return 0 on success or if no topic was associated with the bucket, error code otherwise
-    int read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker* objv_tracker);
-    // set the list of topics associated with a bucket
-    // use version tacker to enforce atomicity between read/write
-    // return 0 on success, error code otherwise
-    int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics,
-                    RGWObjVersionTracker* objv_tracker, optional_yield y);
-  public:
-    Bucket(RGWPubSub *_ps, const rgw_bucket& _bucket) : ps(_ps), bucket(_bucket) {
-      ps->get_bucket_meta_obj(bucket, &bucket_meta_obj);
-    }
-
-    // read the list of topics associated with a bucket and populate into result
-    // return 0 on success or if no topic was associated with the bucket, error code otherwise
-    int get_topics(rgw_pubsub_bucket_topics *result);
-    // adds a topic + filter (event list, and possibly name metadata or tags filters) to a bucket
-    // assigning a notification name is optional (needed for S3 compatible notifications)
-    // if the topic already exist on the bucket, the filter event list may be updated
-    // for S3 compliant notifications the version with: s3_filter and notif_name should be used
-    // return -ENOENT if the topic does not exists
-    // return 0 on success, error code otherwise
-    int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y);
-    int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y);
-    // remove a topic and filter from bucket
-    // if the topic does not exists on the bucket it is a no-op (considered success)
-    // return -ENOENT if the topic does not exists
-    // return 0 on success, error code otherwise
-    int remove_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, optional_yield y);
-    // remove all notifications (and autogenerated topics) associated with the bucket
-    // return 0 on success or if no topic was associated with the bucket, error code otherwise
-    int remove_notifications(const DoutPrefixProvider *dpp, optional_yield y);
-  };
-
-  using BucketRef = std::shared_ptr<Bucket>;
-
-  BucketRef get_bucket(const rgw_bucket& bucket) {
-    return std::make_shared<Bucket>(this, bucket);
-  }
-
-  void get_meta_obj(rgw_raw_obj *obj) const;
-  void get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const;
-
-  void get_sub_meta_obj(const std::string& name, rgw_raw_obj *obj) const;
-
-  // get all topics (per tenant, if used)) and populate them into "result"
-  // return 0 on success or if no topics exist, error code otherwise
-  int get_topics(rgw_pubsub_topics *result);
-  // get a topic with its subscriptions by its name and populate it into "result"
-  // return -ENOENT if the topic does not exists 
-  // return 0 on success, error code otherwise
-  int get_topic(const std::string& name, rgw_pubsub_topic_subs *result);
-  // get a topic with by its name and populate it into "result"
-  // return -ENOENT if the topic does not exists 
-  // return 0 on success, error code otherwise
-  int get_topic(const std::string& name, rgw_pubsub_topic *result);
-  // create a topic with a name only
-  // if the topic already exists it is a no-op (considered success)
-  // return 0 on success, error code otherwise
-  int create_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y);
-  // create a topic with push destination information and ARN
-  // if the topic already exists the destination and ARN values may be updated (considered succsess)
-  // return 0 on success, error code otherwise
-  int create_topic(const DoutPrefixProvider *dpp, const std::string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y);
-  // remove a topic according to its name
-  // if the topic does not exists it is a no-op (considered success)
-  // return 0 on success, error code otherwise
-  int remove_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y);
-};
-
-
-template <class T>
-int RGWPubSub::read(const rgw_raw_obj& obj, T* result, RGWObjVersionTracker* objv_tracker)
-{
-  bufferlist bl;
-  int ret = rgw_get_system_obj(svc_sysobj,
-                               obj.pool, obj.oid,
-                               bl,
-                               objv_tracker,
-                               nullptr, null_yield, nullptr, nullptr);
-  if (ret < 0) {
-    return ret;
-  }
-
-  auto iter = bl.cbegin();
-  try {
-    decode(*result, iter);
-  } catch (buffer::error& err) {
-    return -EIO;
-  }
-
-  return 0;
-}
-
-template <class T>
-int RGWPubSub::write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info,
-                        RGWObjVersionTracker* objv_tracker, optional_yield y)
-{
-  bufferlist bl;
-  encode(info, bl);
-
-  return rgw_put_system_obj(dpp, svc_sysobj, obj.pool, obj.oid,
-                            bl, false, objv_tracker, real_time(), y);
-}
-
-#endif
diff --git a/src/rgw/rgw_pubsub_push.cc b/src/rgw/rgw_pubsub_push.cc
deleted file mode 100644 (file)
index 2f734c2..0000000
+++ /dev/null
@@ -1,463 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_pubsub_push.h"
-#include <string>
-#include <sstream>
-#include <algorithm>
-#include "include/buffer_fwd.h"
-#include "common/Formatter.h"
-#include "common/iso_8601.h"
-#include "common/async/completion.h"
-#include "rgw_common.h"
-#include "rgw_data_sync.h"
-#include "rgw_pubsub.h"
-#include "acconfig.h"
-#ifdef WITH_RADOSGW_AMQP_ENDPOINT
-#include "rgw_amqp.h"
-#endif
-#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
-#include "rgw_kafka.h"
-#endif
-#include <boost/asio/yield.hpp>
-#include <boost/algorithm/string.hpp>
-#include <functional>
-#include "rgw_perf_counters.h"
-
-using namespace rgw;
-
-template<typename EventType>
-std::string json_format_pubsub_event(const EventType& event) {
-  std::stringstream ss;
-  JSONFormatter f(false);
-  {
-    Formatter::ObjectSection s(f, EventType::json_type_plural);
-    {
-      Formatter::ArraySection s(f, EventType::json_type_plural);
-      encode_json("", event, &f);
-    }
-  }
-  f.flush(ss);
-  return ss.str();
-}
-  
-bool get_bool(const RGWHTTPArgs& args, const std::string& name, bool default_value) {
-  bool value;
-  bool exists;
-  if (args.get_bool(name.c_str(), &value, &exists) == -EINVAL) {
-    throw RGWPubSubEndpoint::configuration_error("invalid boolean value for " + name);
-  }
-  if (!exists) {
-    return default_value;
-  }
-  return value;
-}
-
-class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint {
-private:
-  const std::string endpoint;
-  typedef unsigned ack_level_t;
-  ack_level_t ack_level; // TODO: not used for now
-  const bool verify_ssl;
-  const bool cloudevents;
-  static const ack_level_t ACK_LEVEL_ANY = 0;
-  static const ack_level_t ACK_LEVEL_NON_ERROR = 1;
-
-public:
-  RGWPubSubHTTPEndpoint(const std::string& _endpoint, const RGWHTTPArgs& args) : 
-    endpoint(_endpoint), verify_ssl(get_bool(args, "verify-ssl", true)), cloudevents(get_bool(args, "cloudevents", false)) 
-  {
-    bool exists;
-    const auto& str_ack_level = args.get("http-ack-level", &exists);
-    if (!exists || str_ack_level == "any") {
-      // "any" is default
-      ack_level = ACK_LEVEL_ANY;
-    } else if (str_ack_level == "non-error") {
-      ack_level = ACK_LEVEL_NON_ERROR;
-    } else {
-      ack_level = std::atoi(str_ack_level.c_str());
-      if (ack_level < 100 || ack_level >= 600) {
-        throw configuration_error("HTTP/S: invalid http-ack-level: " + str_ack_level);
-      }
-    }
-  }
-
-  int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
-    bufferlist read_bl;
-    RGWPostHTTPData request(cct, "POST", endpoint, &read_bl, verify_ssl);
-    const auto post_data = json_format_pubsub_event(event);
-    if (cloudevents) {
-      // following: https://github.com/cloudevents/spec/blob/v1.0.1/http-protocol-binding.md
-      // using "Binary Content Mode"
-      request.append_header("ce-specversion", "1.0");
-      request.append_header("ce-type", "com.amazonaws." + event.eventName);
-      request.append_header("ce-time", to_iso_8601(event.eventTime)); 
-      // default output of iso8601 is also RFC3339 compatible
-      request.append_header("ce-id", event.x_amz_request_id + "." + event.x_amz_id_2);
-      request.append_header("ce-source", event.eventSource + "." + event.awsRegion + "." + event.bucket_name);
-      request.append_header("ce-subject", event.object_key);
-    }
-    request.set_post_data(post_data);
-    request.set_send_length(post_data.length());
-    request.append_header("Content-Type", "application/json");
-    if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending);
-    const auto rc = RGWHTTP::process(&request, y);
-    if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending);
-    // TODO: use read_bl to process return code and handle according to ack level
-    return rc;
-  }
-
-  std::string to_str() const override {
-    std::string str("HTTP/S Endpoint");
-    str += "\nURI: " + endpoint;
-    str += (verify_ssl ? "\nverify SSL" : "\ndon't verify SSL");
-    return str;
-  }
-};
-
-#ifdef WITH_RADOSGW_AMQP_ENDPOINT
-class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint {
-private:
-  enum class ack_level_t {
-    None,
-    Broker,
-    Routable
-  };
-  CephContext* const cct;
-  const std::string endpoint;
-  const std::string topic;
-  const std::string exchange;
-  ack_level_t ack_level;
-  amqp::connection_ptr_t conn;
-
-  bool get_verify_ssl(const RGWHTTPArgs& args) {
-    bool exists;
-    auto str_verify_ssl = args.get("verify-ssl", &exists);
-    if (!exists) {
-      // verify server certificate by default
-      return true;
-    }
-    boost::algorithm::to_lower(str_verify_ssl);
-    if (str_verify_ssl == "true") {
-      return true;
-    }
-    if (str_verify_ssl == "false") {
-      return false;
-    }
-    throw configuration_error("'verify-ssl' must be true/false, not: " + str_verify_ssl);
-  }
-
-  std::string get_exchange(const RGWHTTPArgs& args) {
-    bool exists;
-    const auto exchange = args.get("amqp-exchange", &exists);
-    if (!exists) {
-      throw configuration_error("AMQP: missing amqp-exchange");
-    }
-    return exchange;
-  }
-
-  ack_level_t get_ack_level(const RGWHTTPArgs& args) {
-    bool exists;
-    const auto& str_ack_level = args.get("amqp-ack-level", &exists);
-    if (!exists || str_ack_level == "broker") {
-      // "broker" is default
-      return ack_level_t::Broker;
-    }
-    if (str_ack_level == "none") {
-      return ack_level_t::None;
-    }
-    if (str_ack_level == "routable") {
-      return ack_level_t::Routable;
-    }
-    throw configuration_error("AMQP: invalid amqp-ack-level: " + str_ack_level);
-  }
-  
-public:
-  RGWPubSubAMQPEndpoint(const std::string& _endpoint,
-      const std::string& _topic,
-      const RGWHTTPArgs& args,
-      CephContext* _cct) : 
-        cct(_cct),
-        endpoint(_endpoint), 
-        topic(_topic),
-        exchange(get_exchange(args)),
-        ack_level(get_ack_level(args)),
-        conn(amqp::connect(endpoint, exchange, (ack_level == ack_level_t::Broker), get_verify_ssl(args), args.get_optional("ca-location"))) {
-    if (!conn) { 
-      throw configuration_error("AMQP: failed to create connection to: " + endpoint);
-    }
-  }
-
-  // this allows waiting untill "finish()" is called from a different thread
-  // waiting could be blocking the waiting thread or yielding, depending
-  // with compilation flag support and whether the optional_yield is set
-  class Waiter {
-    using Signature = void(boost::system::error_code);
-    using Completion = ceph::async::Completion<Signature>;
-    std::unique_ptr<Completion> completion = nullptr;
-    int ret;
-
-    mutable std::atomic<bool> done = false;
-    mutable std::mutex lock;
-    mutable std::condition_variable cond;
-
-    template <typename ExecutionContext, typename CompletionToken>
-    auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
-      boost::asio::async_completion<CompletionToken, Signature> init(token);
-      auto& handler = init.completion_handler;
-      {
-        std::unique_lock l{lock};
-        completion = Completion::create(ctx.get_executor(), std::move(handler));
-      }
-      return init.result.get();
-    }
-
-  public:
-    int wait(optional_yield y) {
-      if (done) {
-        return ret;
-      }
-      if (y) {
-       auto& io_ctx = y.get_io_context();
-        auto& yield_ctx = y.get_yield_context();
-        boost::system::error_code ec;
-        async_wait(io_ctx, yield_ctx[ec]);
-        return -ec.value();
-      }
-      std::unique_lock l(lock);
-      cond.wait(l, [this]{return (done==true);});
-      return ret;
-    }
-
-    void finish(int r) {
-      std::unique_lock l{lock};
-      ret = r;
-      done = true;
-      if (completion) {
-        boost::system::error_code ec(-ret, boost::system::system_category());
-        Completion::post(std::move(completion), ec);
-      } else {
-        cond.notify_all();
-      }
-    }
-  };
-
-  int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
-    ceph_assert(conn);
-    if (ack_level == ack_level_t::None) {
-      return amqp::publish(conn, topic, json_format_pubsub_event(event));
-    } else {
-      // TODO: currently broker and routable are the same - this will require different flags but the same mechanism
-      // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
-      auto w = std::unique_ptr<Waiter>(new Waiter);
-      const auto rc = amqp::publish_with_confirm(conn, 
-        topic,
-        json_format_pubsub_event(event),
-        std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
-      if (rc < 0) {
-        // failed to publish, does not wait for reply
-        return rc;
-      }
-      return w->wait(y);
-    }
-  }
-
-  std::string to_str() const override {
-    std::string str("AMQP(0.9.1) Endpoint");
-    str += "\nURI: " + endpoint;
-    str += "\nTopic: " + topic;
-    str += "\nExchange: " + exchange;
-    return str;
-  }
-};
-
-static const std::string AMQP_0_9_1("0-9-1");
-static const std::string AMQP_1_0("1-0");
-static const std::string AMQP_SCHEMA("amqp");
-#endif // ifdef WITH_RADOSGW_AMQP_ENDPOINT
-
-#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
-class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint {
-private:
-  enum class ack_level_t {
-    None,
-    Broker,
-  };
-  CephContext* const cct;
-  const std::string topic;
-  kafka::connection_ptr_t conn;
-  const ack_level_t ack_level;
-
-
-  ack_level_t get_ack_level(const RGWHTTPArgs& args) {
-    bool exists;
-    const auto& str_ack_level = args.get("kafka-ack-level", &exists);
-    if (!exists || str_ack_level == "broker") {
-      // "broker" is default
-      return ack_level_t::Broker;
-    }
-    if (str_ack_level == "none") {
-      return ack_level_t::None;
-    }
-    throw configuration_error("Kafka: invalid kafka-ack-level: " + str_ack_level);
-  }
-
-public:
-  RGWPubSubKafkaEndpoint(const std::string& _endpoint,
-      const std::string& _topic,
-      const RGWHTTPArgs& args,
-      CephContext* _cct) : 
-        cct(_cct),
-        topic(_topic),
-        conn(kafka::connect(_endpoint, get_bool(args, "use-ssl", false), get_bool(args, "verify-ssl", true), args.get_optional("ca-location"))) ,
-        ack_level(get_ack_level(args)) {
-    if (!conn) { 
-      throw configuration_error("Kafka: failed to create connection to: " + _endpoint);
-    }
-  }
-
-  // this allows waiting untill "finish()" is called from a different thread
-  // waiting could be blocking the waiting thread or yielding, depending
-  // with compilation flag support and whether the optional_yield is set
-  class Waiter {
-    using Signature = void(boost::system::error_code);
-    using Completion = ceph::async::Completion<Signature>;
-    std::unique_ptr<Completion> completion = nullptr;
-    int ret;
-
-    mutable std::atomic<bool> done = false;
-    mutable std::mutex lock;
-    mutable std::condition_variable cond;
-
-    template <typename ExecutionContext, typename CompletionToken>
-    auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
-      boost::asio::async_completion<CompletionToken, Signature> init(token);
-      auto& handler = init.completion_handler;
-      {
-        std::unique_lock l{lock};
-        completion = Completion::create(ctx.get_executor(), std::move(handler));
-      }
-      return init.result.get();
-    }
-
-  public:
-    int wait(optional_yield y) {
-      if (done) {
-        return ret;
-      }
-      if (y) {
-        auto& io_ctx = y.get_io_context();
-        auto& yield_ctx = y.get_yield_context();
-        boost::system::error_code ec;
-        async_wait(io_ctx, yield_ctx[ec]);
-        return -ec.value();
-      }
-      std::unique_lock l(lock);
-      cond.wait(l, [this]{return (done==true);});
-      return ret;
-    }
-
-    void finish(int r) {
-      std::unique_lock l{lock};
-      ret = r;
-      done = true;
-      if (completion) {
-        boost::system::error_code ec(-ret, boost::system::system_category());
-        Completion::post(std::move(completion), ec);
-      } else {
-        cond.notify_all();
-      }
-    }
-  };
-
-  int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
-    ceph_assert(conn);
-    if (ack_level == ack_level_t::None) {
-      return kafka::publish(conn, topic, json_format_pubsub_event(event));
-    } else {
-      // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
-      auto w = std::unique_ptr<Waiter>(new Waiter);
-      const auto rc = kafka::publish_with_confirm(conn, 
-        topic,
-        json_format_pubsub_event(event),
-        std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
-      if (rc < 0) {
-        // failed to publish, does not wait for reply
-        return rc;
-      }
-      return w->wait(y);
-    }
-  }
-
-  std::string to_str() const override {
-    std::string str("Kafka Endpoint");
-    str += kafka::to_string(conn);
-    str += "\nTopic: " + topic;
-    return str;
-  }
-};
-
-static const std::string KAFKA_SCHEMA("kafka");
-#endif // ifdef WITH_RADOSGW_KAFKA_ENDPOINT
-
-static const std::string WEBHOOK_SCHEMA("webhook");
-static const std::string UNKNOWN_SCHEMA("unknown");
-static const std::string NO_SCHEMA("");
-
-const std::string& get_schema(const std::string& endpoint) {
-  if (endpoint.empty()) {
-    return NO_SCHEMA; 
-  }
-  const auto pos = endpoint.find(':');
-  if (pos == std::string::npos) {
-    return UNKNOWN_SCHEMA;
-  }
-  const auto& schema = endpoint.substr(0,pos);
-  if (schema == "http" || schema == "https") {
-    return WEBHOOK_SCHEMA;
-#ifdef WITH_RADOSGW_AMQP_ENDPOINT
-  } else if (schema == "amqp" || schema == "amqps") {
-    return AMQP_SCHEMA;
-#endif
-#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
-  } else if (schema == "kafka") {
-    return KAFKA_SCHEMA;
-#endif
-  }
-  return UNKNOWN_SCHEMA;
-}
-
-RGWPubSubEndpoint::Ptr RGWPubSubEndpoint::create(const std::string& endpoint, 
-    const std::string& topic, 
-    const RGWHTTPArgs& args,
-    CephContext* cct) {
-  const auto& schema = get_schema(endpoint);
-  if (schema == WEBHOOK_SCHEMA) {
-    return Ptr(new RGWPubSubHTTPEndpoint(endpoint, args));
-#ifdef WITH_RADOSGW_AMQP_ENDPOINT
-  } else if (schema == AMQP_SCHEMA) {
-    bool exists;
-    std::string version = args.get("amqp-version", &exists);
-    if (!exists) {
-      version = AMQP_0_9_1;
-    }
-    if (version == AMQP_0_9_1) {
-      return Ptr(new RGWPubSubAMQPEndpoint(endpoint, topic, args, cct));
-    } else if (version == AMQP_1_0) {
-      throw configuration_error("AMQP: v1.0 not supported");
-      return nullptr;
-    } else {
-      throw configuration_error("AMQP: unknown version: " + version);
-      return nullptr;
-    }
-#endif
-#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
-  } else if (schema == KAFKA_SCHEMA) {
-      return Ptr(new RGWPubSubKafkaEndpoint(endpoint, topic, args, cct));
-#endif
-  }
-
-  throw configuration_error("unknown schema in: " + endpoint);
-  return nullptr;
-}
-
diff --git a/src/rgw/rgw_pubsub_push.h b/src/rgw/rgw_pubsub_push.h
deleted file mode 100644 (file)
index 1790593..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-#pragma once
-
-#include <string>
-#include <memory>
-#include <stdexcept>
-#include "include/buffer_fwd.h"
-#include "include/common_fwd.h"
-#include "common/async/yield_context.h"
-
-// TODO the env should be used as a template parameter to differentiate the source that triggers the pushes
-class RGWDataSyncEnv;
-class RGWHTTPArgs;
-struct rgw_pubsub_s3_event;
-
-// endpoint base class all endpoint  - types should derive from it
-class RGWPubSubEndpoint {
-public:
-  RGWPubSubEndpoint() = default;
-  // endpoint should not be copied
-  RGWPubSubEndpoint(const RGWPubSubEndpoint&) = delete;
-  const RGWPubSubEndpoint& operator=(const RGWPubSubEndpoint&) = delete;
-
-  typedef std::unique_ptr<RGWPubSubEndpoint> Ptr;
-
-  // factory method for the actual notification endpoint
-  // derived class specific arguments are passed in http args format
-  // may throw a configuration_error if creation fails
-  static Ptr create(const std::string& endpoint, const std::string& topic, const RGWHTTPArgs& args, CephContext *cct=nullptr);
-  // this method is used in order to send notification (S3 compliant) and wait for completion 
-  // in async manner via a coroutine when invoked in the frontend environment
-  virtual int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) = 0;
-
-  // present as string
-  virtual std::string to_str() const { return ""; }
-  
-  virtual ~RGWPubSubEndpoint() = default;
-  
-  // exception object for configuration error
-  struct configuration_error : public std::logic_error {
-    configuration_error(const std::string& what_arg) : 
-      std::logic_error("pubsub endpoint configuration error: " + what_arg) {}
-  };
-};
-
diff --git a/src/rgw/rgw_putobj_processor.cc b/src/rgw/rgw_putobj_processor.cc
deleted file mode 100644 (file)
index 8a6a157..0000000
+++ /dev/null
@@ -1,704 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2018 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "rgw_aio.h"
-#include "rgw_putobj_processor.h"
-#include "rgw_multi.h"
-#include "rgw_compression.h"
-#include "services/svc_sys_obj.h"
-#include "services/svc_zone.h"
-#include "rgw_sal_rados.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-namespace rgw::putobj {
-
-int HeadObjectProcessor::process(bufferlist&& data, uint64_t logical_offset)
-{
-  const bool flush = (data.length() == 0);
-
-  // capture the first chunk for special handling
-  if (data_offset < head_chunk_size || data_offset == 0) {
-    if (flush) {
-      // flush partial chunk
-      return process_first_chunk(std::move(head_data), &processor);
-    }
-
-    auto remaining = head_chunk_size - data_offset;
-    auto count = std::min<uint64_t>(data.length(), remaining);
-    data.splice(0, count, &head_data);
-    data_offset += count;
-
-    if (data_offset == head_chunk_size) {
-      // process the first complete chunk
-      ceph_assert(head_data.length() == head_chunk_size);
-      int r = process_first_chunk(std::move(head_data), &processor);
-      if (r < 0) {
-        return r;
-      }
-    }
-    if (data.length() == 0) { // avoid flushing stripe processor
-      return 0;
-    }
-  }
-  ceph_assert(processor); // process_first_chunk() must initialize
-
-  // send everything else through the processor
-  auto write_offset = data_offset;
-  data_offset += data.length();
-  return processor->process(std::move(data), write_offset);
-}
-
-
-static int process_completed(const AioResultList& completed, RawObjSet *written)
-{
-  std::optional<int> error;
-  for (auto& r : completed) {
-    if (r.result >= 0) {
-      written->insert(r.obj.get_ref().obj);
-    } else if (!error) { // record first error code
-      error = r.result;
-    }
-  }
-  return error.value_or(0);
-}
-
-void RadosWriter::add_write_hint(librados::ObjectWriteOperation& op) {
-  const rgw_obj obj = head_obj->get_obj();
-  const RGWObjStateManifest *sm = obj_ctx.get_state(obj);
-  const bool compressed = sm->state.compressed;
-  uint32_t alloc_hint_flags = 0;
-  if (compressed) {
-    alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
-  }
-
-  op.set_alloc_hint2(0, 0, alloc_hint_flags);
-}
-
-int RadosWriter::set_stripe_obj(const rgw_raw_obj& raw_obj)
-{
-  stripe_obj = store->svc()->rados->obj(raw_obj);
-  return stripe_obj.open(dpp);
-}
-
-int RadosWriter::process(bufferlist&& bl, uint64_t offset)
-{
-  bufferlist data = std::move(bl);
-  const uint64_t cost = data.length();
-  if (cost == 0) { // no empty writes, use aio directly for creates
-    return 0;
-  }
-  librados::ObjectWriteOperation op;
-  add_write_hint(op);
-  if (offset == 0) {
-    op.write_full(data);
-  } else {
-    op.write(offset, data);
-  }
-  constexpr uint64_t id = 0; // unused
-  auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
-  return process_completed(c, &written);
-}
-
-int RadosWriter::write_exclusive(const bufferlist& data)
-{
-  const uint64_t cost = data.length();
-
-  librados::ObjectWriteOperation op;
-  op.create(true); // exclusive create
-  add_write_hint(op);
-  op.write_full(data);
-
-  constexpr uint64_t id = 0; // unused
-  auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
-  auto d = aio->drain();
-  c.splice(c.end(), d);
-  return process_completed(c, &written);
-}
-
-int RadosWriter::drain()
-{
-  return process_completed(aio->drain(), &written);
-}
-
-RadosWriter::~RadosWriter()
-{
-  // wait on any outstanding aio completions
-  process_completed(aio->drain(), &written);
-
-  bool need_to_remove_head = false;
-  std::optional<rgw_raw_obj> raw_head;
-  if (!rgw::sal::Object::empty(head_obj.get())) {
-    raw_head.emplace();
-    rgw::sal::RadosObject* obj = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get());
-    obj->get_raw_obj(&*raw_head);
-  }
-
-  /**
-   * We should delete the object in the "multipart" namespace to avoid race condition.
-   * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
-   * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
-   * written by the second upload may be deleted by the first upload.
-   * details is describled on #11749
-   *
-   * The above comment still stands, but instead of searching for a specific object in the multipart
-   * namespace, we just make sure that we remove the object that is marked as the head object after
-   * we remove all the other raw objects. Note that we use different call to remove the head object,
-   * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
-   */
-  for (const auto& obj : written) {
-    if (raw_head && obj == *raw_head) {
-      ldpp_dout(dpp, 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
-      need_to_remove_head = true;
-      continue;
-    }
-
-    int r = store->delete_raw_obj(dpp, obj);
-    if (r < 0 && r != -ENOENT) {
-      ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
-    }
-  }
-
-  if (need_to_remove_head) {
-    std::string version_id;
-    ldpp_dout(dpp, 5) << "NOTE: we are going to process the head obj (" << *raw_head << ")" << dendl;
-    int r = head_obj->delete_object(dpp, null_yield);
-    if (r < 0 && r != -ENOENT) {
-      ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << *raw_head << "), leaked" << dendl;
-    }
-  }
-}
-
-
-// advance to the next stripe
-int ManifestObjectProcessor::next(uint64_t offset, uint64_t *pstripe_size)
-{
-  // advance the manifest
-  int r = manifest_gen.create_next(offset);
-  if (r < 0) {
-    return r;
-  }
-
-  rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
-
-  uint64_t chunk_size = 0;
-  r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size);
-  if (r < 0) {
-    return r;
-  }
-  r = writer.set_stripe_obj(stripe_obj);
-  if (r < 0) {
-    return r;
-  }
-
-  chunk = ChunkProcessor(&writer, chunk_size);
-  *pstripe_size = manifest_gen.cur_stripe_max_size();
-  return 0;
-}
-
-
-
-int AtomicObjectProcessor::process_first_chunk(bufferlist&& data,
-                                               DataProcessor **processor)
-{
-  first_chunk = std::move(data);
-  *processor = &stripe;
-  return 0;
-}
-
-int AtomicObjectProcessor::prepare(optional_yield y)
-{
-  uint64_t max_head_chunk_size;
-  uint64_t head_max_size;
-  uint64_t chunk_size = 0;
-  uint64_t alignment;
-
-  int r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size(
-                                      dpp, head_obj->get_bucket()->get_placement_rule(),
-                                      &max_head_chunk_size, &alignment);
-  if (r < 0) {
-    return r;
-  }
-
-  bool same_pool = true;
-  if (head_obj->get_bucket()->get_placement_rule() != tail_placement_rule) {
-    if (!head_obj->placement_rules_match(head_obj->get_bucket()->get_placement_rule(), tail_placement_rule)) {
-      same_pool = false;
-      r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size(dpp, tail_placement_rule, &chunk_size);
-      if (r < 0) {
-        return r;
-      }
-      head_max_size = 0;
-    }
-  }
-
-  if (same_pool) {
-    RGWZonePlacementInfo placement_info;
-    if (!store->svc()->zone->get_zone_params().get_placement(head_obj->get_bucket()->get_placement_rule().name, &placement_info) || placement_info.inline_data) {
-      head_max_size = max_head_chunk_size;
-    } else {
-      head_max_size = 0;
-    }
-    chunk_size = max_head_chunk_size;
-  }
-
-  uint64_t stripe_size;
-  const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
-
-  dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_aligned_size(
-                                       default_stripe_size, alignment, &stripe_size);
-
-  manifest.set_trivial_rule(head_max_size, stripe_size);
-
-  rgw_obj obj = head_obj->get_obj();
-
-  r = manifest_gen.create_begin(store->ctx(), &manifest,
-                                head_obj->get_bucket()->get_placement_rule(),
-                                &tail_placement_rule,
-                                obj.bucket, obj);
-  if (r < 0) {
-    return r;
-  }
-
-  rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
-
-  r = writer.set_stripe_obj(stripe_obj);
-  if (r < 0) {
-    return r;
-  }
-
-  set_head_chunk_size(head_max_size);
-  // initialize the processors
-  chunk = ChunkProcessor(&writer, chunk_size);
-  stripe = StripeProcessor(&chunk, this, head_max_size);
-  return 0;
-}
-
-int AtomicObjectProcessor::complete(size_t accounted_size,
-                                    const std::string& etag,
-                                    ceph::real_time *mtime,
-                                    ceph::real_time set_mtime,
-                                    rgw::sal::Attrs& attrs,
-                                    ceph::real_time delete_at,
-                                    const char *if_match,
-                                    const char *if_nomatch,
-                                    const std::string *user_data,
-                                    rgw_zone_set *zones_trace,
-                                    bool *pcanceled, optional_yield y)
-{
-  int r = writer.drain();
-  if (r < 0) {
-    return r;
-  }
-  const uint64_t actual_size = get_actual_size();
-  r = manifest_gen.create_next(actual_size);
-  if (r < 0) {
-    return r;
-  }
-
-  head_obj->set_atomic();
-
-  RGWRados::Object op_target(store->getRados(),
-                 head_obj->get_bucket(),
-                 obj_ctx, head_obj.get());
-  RGWRados::Object::Write obj_op(&op_target);
-
-  /* some object types shouldn't be versioned, e.g., multipart parts */
-  op_target.set_versioning_disabled(!head_obj->get_bucket()->versioning_enabled());
-  obj_op.meta.data = &first_chunk;
-  obj_op.meta.manifest = &manifest;
-  obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
-  obj_op.meta.if_match = if_match;
-  obj_op.meta.if_nomatch = if_nomatch;
-  obj_op.meta.mtime = mtime;
-  obj_op.meta.set_mtime = set_mtime;
-  obj_op.meta.owner = owner;
-  obj_op.meta.flags = PUT_OBJ_CREATE;
-  obj_op.meta.olh_epoch = olh_epoch;
-  obj_op.meta.delete_at = delete_at;
-  obj_op.meta.user_data = user_data;
-  obj_op.meta.zones_trace = zones_trace;
-  obj_op.meta.modify_tail = true;
-
-  r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
-  if (r < 0) {
-    if (r == -ETIMEDOUT) {
-      // The head object write may eventually succeed, clear the set of objects for deletion. if it
-      // doesn't ever succeed, we'll orphan any tail objects as if we'd crashed before that write
-      writer.clear_written();
-    }
-    return r;
-  }
-  if (!obj_op.meta.canceled) {
-    // on success, clear the set of objects for deletion
-    writer.clear_written();
-  }
-  if (pcanceled) {
-    *pcanceled = obj_op.meta.canceled;
-  }
-  return 0;
-}
-
-
-int MultipartObjectProcessor::process_first_chunk(bufferlist&& data,
-                                                  DataProcessor **processor)
-{
-  // write the first chunk of the head object as part of an exclusive create,
-  // then drain to wait for the result in case of EEXIST
-  int r = writer.write_exclusive(data);
-  if (r == -EEXIST) {
-    // randomize the oid prefix and reprepare the head/manifest
-    std::string oid_rand = gen_rand_alphanumeric(store->ctx(), 32);
-
-    mp.init(target_obj->get_name(), upload_id, oid_rand);
-    manifest.set_prefix(target_obj->get_name() + "." + oid_rand);
-
-    r = prepare_head();
-    if (r < 0) {
-      return r;
-    }
-    // resubmit the write op on the new head object
-    r = writer.write_exclusive(data);
-  }
-  if (r < 0) {
-    return r;
-  }
-  *processor = &stripe;
-  return 0;
-}
-
-int MultipartObjectProcessor::prepare_head()
-{
-  const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
-  uint64_t chunk_size;
-  uint64_t stripe_size;
-  uint64_t alignment;
-
-  int r = dynamic_cast<rgw::sal::RadosObject*>(target_obj.get())->get_max_chunk_size(dpp,
-                                         tail_placement_rule, &chunk_size, &alignment);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: unexpected: get_max_chunk_size(): placement_rule=" << tail_placement_rule.to_str() << " obj=" << target_obj << " returned r=" << r << dendl;
-    return r;
-  }
-  dynamic_cast<rgw::sal::RadosObject*>(target_obj.get())->get_max_aligned_size(
-                                       default_stripe_size, alignment, &stripe_size);
-
-  manifest.set_multipart_part_rule(stripe_size, part_num);
-
-  r = manifest_gen.create_begin(store->ctx(), &manifest,
-                               head_obj->get_bucket()->get_placement_rule(),
-                               &tail_placement_rule,
-                               target_obj->get_bucket()->get_key(),
-                               target_obj->get_obj());
-  if (r < 0) {
-    return r;
-  }
-
-  rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
-  dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->raw_obj_to_obj(stripe_obj);
-  head_obj->set_hash_source(target_obj->get_name());
-
-  r = writer.set_stripe_obj(stripe_obj);
-  if (r < 0) {
-    return r;
-  }
-  stripe_size = manifest_gen.cur_stripe_max_size();
-  set_head_chunk_size(stripe_size);
-
-  chunk = ChunkProcessor(&writer, chunk_size);
-  stripe = StripeProcessor(&chunk, this, stripe_size);
-  return 0;
-}
-
-int MultipartObjectProcessor::prepare(optional_yield y)
-{
-  manifest.set_prefix(target_obj->get_name() + "." + upload_id);
-
-  return prepare_head();
-}
-
-int MultipartObjectProcessor::complete(size_t accounted_size,
-                                       const std::string& etag,
-                                       ceph::real_time *mtime,
-                                       ceph::real_time set_mtime,
-                                       std::map<std::string, bufferlist>& attrs,
-                                       ceph::real_time delete_at,
-                                       const char *if_match,
-                                       const char *if_nomatch,
-                                       const std::string *user_data,
-                                       rgw_zone_set *zones_trace,
-                                       bool *pcanceled, optional_yield y)
-{
-  int r = writer.drain();
-  if (r < 0) {
-    return r;
-  }
-  const uint64_t actual_size = get_actual_size();
-  r = manifest_gen.create_next(actual_size);
-  if (r < 0) {
-    return r;
-  }
-
-  RGWRados::Object op_target(store->getRados(),
-                 head_obj->get_bucket(),
-                 obj_ctx, head_obj.get());
-  RGWRados::Object::Write obj_op(&op_target);
-
-  op_target.set_versioning_disabled(true);
-  op_target.set_meta_placement_rule(&tail_placement_rule);
-  obj_op.meta.set_mtime = set_mtime;
-  obj_op.meta.mtime = mtime;
-  obj_op.meta.owner = owner;
-  obj_op.meta.delete_at = delete_at;
-  obj_op.meta.zones_trace = zones_trace;
-  obj_op.meta.modify_tail = true;
-
-  r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
-  if (r < 0)
-    return r;
-
-  bufferlist bl;
-  RGWUploadPartInfo info;
-  string p = "part.";
-  bool sorted_omap = is_v2_upload_id(upload_id);
-
-  if (sorted_omap) {
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%08d", part_num);
-    p.append(buf);
-  } else {
-    p.append(part_num_str);
-  }
-  info.num = part_num;
-  info.etag = etag;
-  info.size = actual_size;
-  info.accounted_size = accounted_size;
-  info.modified = real_clock::now();
-  info.manifest = manifest;
-
-  bool compressed;
-  r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info);
-  if (r < 0) {
-    ldpp_dout(dpp, 1) << "cannot get compression info" << dendl;
-    return r;
-  }
-
-  encode(info, bl);
-
-  std::unique_ptr<rgw::sal::Object> meta_obj =
-    head_obj->get_bucket()->get_object(rgw_obj_key(mp.get_meta(), std::string(), RGW_OBJ_NS_MULTIPART));
-  meta_obj->set_in_extra_data(true);
-
-  r = meta_obj->omap_set_val_by_key(dpp, p, bl, true, null_yield);
-  if (r < 0) {
-    return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r;
-  }
-
-  if (!obj_op.meta.canceled) {
-    // on success, clear the set of objects for deletion
-    writer.clear_written();
-  }
-  if (pcanceled) {
-    *pcanceled = obj_op.meta.canceled;
-  }
-  return 0;
-}
-
-int AppendObjectProcessor::process_first_chunk(bufferlist &&data, rgw::sal::DataProcessor **processor)
-{
-  int r = writer.write_exclusive(data);
-  if (r < 0) {
-    return r;
-  }
-  *processor = &stripe;
-  return 0;
-}
-
-int AppendObjectProcessor::prepare(optional_yield y)
-{
-  RGWObjState *astate;
-  int r = head_obj->get_obj_state(dpp, &astate, y);
-  if (r < 0) {
-    return r;
-  }
-  cur_size = astate->size;
-  *cur_accounted_size = astate->accounted_size;
-  if (!astate->exists) {
-    if (position != 0) {
-      ldpp_dout(dpp, 5) << "ERROR: Append position should be zero" << dendl;
-      return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
-    } else {
-      cur_part_num = 1;
-      //set the prefix
-      char buf[33];
-      gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
-      string oid_prefix = head_obj->get_name();
-      oid_prefix.append(".");
-      oid_prefix.append(buf);
-      oid_prefix.append("_");
-      manifest.set_prefix(oid_prefix);
-    }
-  } else {
-    // check whether the object appendable
-    map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
-    if (iter == astate->attrset.end()) {
-      ldpp_dout(dpp, 5) << "ERROR: The object is not appendable" << dendl;
-      return -ERR_OBJECT_NOT_APPENDABLE;
-    }
-    if (position != *cur_accounted_size) {
-      ldpp_dout(dpp, 5) << "ERROR: Append position should be equal to the obj size" << dendl;
-      return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
-    }
-    try {
-      using ceph::decode;
-      decode(cur_part_num, iter->second);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 5) << "ERROR: failed to decode part num" << dendl;
-      return -EIO;
-    }
-    cur_part_num++;
-    //get the current obj etag
-    iter = astate->attrset.find(RGW_ATTR_ETAG);
-    if (iter != astate->attrset.end()) {
-      string s = rgw_string_unquote(iter->second.c_str());
-      size_t pos = s.find("-");
-      cur_etag = s.substr(0, pos);
-    }
-
-    iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
-    if (iter != astate->attrset.end()) {
-      tail_placement_rule.storage_class = iter->second.to_str();
-    } else {
-      tail_placement_rule.storage_class = RGW_STORAGE_CLASS_STANDARD;
-    }
-    cur_manifest = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_manifest();
-    manifest.set_prefix(cur_manifest->get_prefix());
-    astate->keep_tail = true;
-  }
-  manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, cur_part_num);
-
-  rgw_obj obj = head_obj->get_obj();
-
-  r = manifest_gen.create_begin(store->ctx(), &manifest, head_obj->get_bucket()->get_placement_rule(), &tail_placement_rule, obj.bucket, obj);
-  if (r < 0) {
-    return r;
-  }
-  rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
-
-  uint64_t chunk_size = 0;
-  r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size);
-  if (r < 0) {
-    return r;
-  }
-  r = writer.set_stripe_obj(std::move(stripe_obj));
-  if (r < 0) {
-    return r;
-  }
-
-  uint64_t stripe_size = manifest_gen.cur_stripe_max_size();
-
-  uint64_t max_head_size = std::min(chunk_size, stripe_size);
-  set_head_chunk_size(max_head_size);
-
-  // initialize the processors
-  chunk = ChunkProcessor(&writer, chunk_size);
-  stripe = StripeProcessor(&chunk, this, stripe_size);
-
-  return 0;
-}
-
-int AppendObjectProcessor::complete(size_t accounted_size, const string &etag, ceph::real_time *mtime,
-                                    ceph::real_time set_mtime, rgw::sal::Attrs& attrs,
-                                    ceph::real_time delete_at, const char *if_match, const char *if_nomatch,
-                                    const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled,
-                                    optional_yield y)
-{
-  int r = writer.drain();
-  if (r < 0)
-    return r;
-  const uint64_t actual_size = get_actual_size();
-  r = manifest_gen.create_next(actual_size);
-  if (r < 0) {
-    return r;
-  }
-  head_obj->set_atomic();
-  RGWRados::Object op_target(store->getRados(),
-                 head_obj->get_bucket(),
-                 obj_ctx, head_obj.get());
-  RGWRados::Object::Write obj_op(&op_target);
-  //For Append obj, disable versioning
-  op_target.set_versioning_disabled(true);
-  if (cur_manifest) {
-    cur_manifest->append(dpp, manifest, store->svc()->zone->get_zonegroup(), store->svc()->zone->get_zone_params());
-    obj_op.meta.manifest = cur_manifest;
-  } else {
-    obj_op.meta.manifest = &manifest;
-  }
-  obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
-  obj_op.meta.mtime = mtime;
-  obj_op.meta.set_mtime = set_mtime;
-  obj_op.meta.owner = owner;
-  obj_op.meta.flags = PUT_OBJ_CREATE;
-  obj_op.meta.delete_at = delete_at;
-  obj_op.meta.user_data = user_data;
-  obj_op.meta.zones_trace = zones_trace;
-  obj_op.meta.modify_tail = true;
-  obj_op.meta.appendable = true;
-  //Add the append part number
-  bufferlist cur_part_num_bl;
-  using ceph::encode;
-  encode(cur_part_num, cur_part_num_bl);
-  attrs[RGW_ATTR_APPEND_PART_NUM] = cur_part_num_bl;
-  //calculate the etag
-  if (!cur_etag.empty()) {
-    MD5 hash;
-    // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
-    hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
-    char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
-    char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
-    char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
-    hex_to_buf(cur_etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
-    hash.Update((const unsigned char *)petag, sizeof(petag));
-    hex_to_buf(etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
-    hash.Update((const unsigned char *)petag, sizeof(petag));
-    hash.Final((unsigned char *)final_etag);
-    buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
-    snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],  sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
-             "-%lld", (long long)cur_part_num);
-    bufferlist etag_bl;
-    etag_bl.append(final_etag_str, strlen(final_etag_str) + 1);
-    attrs[RGW_ATTR_ETAG] = etag_bl;
-  }
-  r = obj_op.write_meta(dpp, actual_size + cur_size,
-                       accounted_size + *cur_accounted_size,
-                       attrs, y);
-  if (r < 0) {
-    return r;
-  }
-  if (!obj_op.meta.canceled) {
-    // on success, clear the set of objects for deletion
-    writer.clear_written();
-  }
-  if (pcanceled) {
-    *pcanceled = obj_op.meta.canceled;
-  }
-  *cur_accounted_size += accounted_size;
-
-  return 0;
-}
-
-} // namespace rgw::putobj
diff --git a/src/rgw/rgw_putobj_processor.h b/src/rgw/rgw_putobj_processor.h
deleted file mode 100644 (file)
index 1beb9a7..0000000
+++ /dev/null
@@ -1,281 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2018 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <optional>
-
-#include "rgw_putobj.h"
-#include "services/svc_rados.h"
-#include "services/svc_tier_rados.h"
-#include "rgw_sal.h"
-#include "rgw_obj_manifest.h"
-
-namespace rgw {
-
-namespace sal {
-  class RadosStore;
-}
-
-class Aio;
-
-namespace putobj {
-
-// an object processor with special handling for the first chunk of the head.
-// the virtual process_first_chunk() function returns a processor to handle the
-// rest of the object
-class HeadObjectProcessor : public rgw::sal::ObjectProcessor {
-  uint64_t head_chunk_size;
-  // buffer to capture the first chunk of the head object
-  bufferlist head_data;
-  // initialized after process_first_chunk() to process everything else
-  rgw::sal::DataProcessor *processor = nullptr;
-  uint64_t data_offset = 0; // maximum offset of data written (ie compressed)
- protected:
-  uint64_t get_actual_size() const { return data_offset; }
-
-  // process the first chunk of data and return a processor for the rest
-  virtual int process_first_chunk(bufferlist&& data,
-                                  rgw::sal::DataProcessor **processor) = 0;
- public:
-  HeadObjectProcessor(uint64_t head_chunk_size)
-    : head_chunk_size(head_chunk_size)
-  {}
-
-  void set_head_chunk_size(uint64_t size) { head_chunk_size = size; }
-
-  // cache first chunk for process_first_chunk(), then forward everything else
-  // to the returned processor
-  int process(bufferlist&& data, uint64_t logical_offset) final override;
-};
-
-using RawObjSet = std::set<rgw_raw_obj>;
-
-// a data sink that writes to rados objects and deletes them on cancelation
-class RadosWriter : public rgw::sal::DataProcessor {
-  Aio *const aio;
-  rgw::sal::RadosStore *const store;
-  RGWObjectCtx& obj_ctx;
-  std::unique_ptr<rgw::sal::Object> head_obj;
-  RGWSI_RADOS::Obj stripe_obj; // current stripe object
-  RawObjSet written; // set of written objects for deletion
-  const DoutPrefixProvider *dpp;
-  optional_yield y;
-
- public:
-  RadosWriter(Aio *aio, rgw::sal::RadosStore *store,
-              RGWObjectCtx& obj_ctx, std::unique_ptr<rgw::sal::Object> _head_obj,
-              const DoutPrefixProvider *dpp, optional_yield y)
-    : aio(aio), store(store),
-      obj_ctx(obj_ctx), head_obj(std::move(_head_obj)), dpp(dpp), y(y)
-  {}
-  RadosWriter(RadosWriter&& r)
-    : aio(r.aio), store(r.store),
-      obj_ctx(r.obj_ctx), head_obj(std::move(r.head_obj)), dpp(r.dpp), y(r.y)
-  {}
-
-  ~RadosWriter();
-
-  // add alloc hint to osd
-  void add_write_hint(librados::ObjectWriteOperation& op);
-
-  // change the current stripe object
-  int set_stripe_obj(const rgw_raw_obj& obj);
-
-  // write the data at the given offset of the current stripe object
-  int process(bufferlist&& data, uint64_t stripe_offset) override;
-
-  // write the data as an exclusive create and wait for it to complete
-  int write_exclusive(const bufferlist& data);
-
-  int drain();
-
-  // when the operation completes successfully, clear the set of written objects
-  // so they aren't deleted on destruction
-  void clear_written() { written.clear(); }
-
-};
-
-
-// a rados object processor that stripes according to RGWObjManifest
-class ManifestObjectProcessor : public HeadObjectProcessor,
-                                public StripeGenerator {
- protected:
-  rgw::sal::RadosStore* const store;
-  rgw_placement_rule tail_placement_rule;
-  rgw_user owner;
-  RGWObjectCtx& obj_ctx;
-  std::unique_ptr<rgw::sal::Object> head_obj;
-
-  RadosWriter writer;
-  RGWObjManifest manifest;
-  RGWObjManifest::generator manifest_gen;
-  ChunkProcessor chunk;
-  StripeProcessor stripe;
-  const DoutPrefixProvider *dpp;
-
-  // implements StripeGenerator
-  int next(uint64_t offset, uint64_t *stripe_size) override;
-
- public:
-  ManifestObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
-                          const rgw_placement_rule *ptail_placement_rule,
-                          const rgw_user& owner, RGWObjectCtx& _obj_ctx,
-                          std::unique_ptr<rgw::sal::Object> _head_obj,
-                          const DoutPrefixProvider* dpp, optional_yield y)
-    : HeadObjectProcessor(0),
-      store(store),
-      owner(owner),
-      obj_ctx(_obj_ctx), head_obj(std::move(_head_obj)),
-      writer(aio, store, obj_ctx, head_obj->clone(), dpp, y),
-      chunk(&writer, 0), stripe(&chunk, this, 0), dpp(dpp) {
-        if (ptail_placement_rule) {
-          tail_placement_rule = *ptail_placement_rule;
-        }
-      }
-
-  void set_owner(const rgw_user& _owner) {
-    owner = _owner;
-  }
-
-  void set_tail_placement(const rgw_placement_rule& tpr) {
-    tail_placement_rule = tpr;
-  }
-  void set_tail_placement(const rgw_placement_rule&& tpr) {
-    tail_placement_rule = tpr;
-  }
-
-};
-
-
-// a processor that completes with an atomic write to the head object as part of
-// a bucket index transaction
-class AtomicObjectProcessor : public ManifestObjectProcessor {
-  const std::optional<uint64_t> olh_epoch;
-  const std::string unique_tag;
-  bufferlist first_chunk; // written with the head in complete()
-
-  int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
- public:
-  AtomicObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
-                        const rgw_placement_rule *ptail_placement_rule,
-                        const rgw_user& owner,
-                        RGWObjectCtx& obj_ctx,
-                       std::unique_ptr<rgw::sal::Object> _head_obj,
-                        std::optional<uint64_t> olh_epoch,
-                        const std::string& unique_tag,
-                        const DoutPrefixProvider *dpp, optional_yield y)
-    : ManifestObjectProcessor(aio, store, ptail_placement_rule,
-                              owner, obj_ctx, std::move(_head_obj), dpp, y),
-      olh_epoch(olh_epoch), unique_tag(unique_tag)
-  {}
-
-  // prepare a trivial manifest
-  int prepare(optional_yield y) override;
-  // write the head object atomically in a bucket index transaction
-  int complete(size_t accounted_size, const std::string& etag,
-               ceph::real_time *mtime, ceph::real_time set_mtime,
-               std::map<std::string, bufferlist>& attrs,
-               ceph::real_time delete_at,
-               const char *if_match, const char *if_nomatch,
-               const std::string *user_data,
-               rgw_zone_set *zones_trace, bool *canceled,
-               optional_yield y) override;
-
-};
-
-
-// a processor for multipart parts, which don't require atomic completion. the
-// part's head is written with an exclusive create to detect racing uploads of
-// the same part/upload id, which are restarted with a random oid prefix
-class MultipartObjectProcessor : public ManifestObjectProcessor {
-  std::unique_ptr<rgw::sal::Object> target_obj; // target multipart object
-  const std::string upload_id;
-  const int part_num;
-  const std::string part_num_str;
-  RGWMPObj mp;
-
-  // write the first chunk and wait on aio->drain() for its completion.
-  // on EEXIST, retry with random prefix
-  int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
-  // prepare the head stripe and manifest
-  int prepare_head();
- public:
-  MultipartObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
-                           const rgw_placement_rule *ptail_placement_rule,
-                           const rgw_user& owner, RGWObjectCtx& obj_ctx,
-                           std::unique_ptr<rgw::sal::Object> _head_obj,
-                           const std::string& upload_id, uint64_t part_num,
-                           const std::string& part_num_str,
-                           const DoutPrefixProvider *dpp, optional_yield y)
-    : ManifestObjectProcessor(aio, store, ptail_placement_rule,
-                              owner, obj_ctx, std::move(_head_obj), dpp, y),
-      target_obj(head_obj->clone()), upload_id(upload_id),
-      part_num(part_num), part_num_str(part_num_str),
-      mp(head_obj->get_name(), upload_id)
-  {}
-
-  // prepare a multipart manifest
-  int prepare(optional_yield y) override;
-  // write the head object attributes in a bucket index transaction, then
-  // register the completed part with the multipart meta object
-  int complete(size_t accounted_size, const std::string& etag,
-               ceph::real_time *mtime, ceph::real_time set_mtime,
-               std::map<std::string, bufferlist>& attrs,
-               ceph::real_time delete_at,
-               const char *if_match, const char *if_nomatch,
-               const std::string *user_data,
-               rgw_zone_set *zones_trace, bool *canceled,
-               optional_yield y) override;
-
-};
-
-  class AppendObjectProcessor : public ManifestObjectProcessor {
-    uint64_t cur_part_num;
-    uint64_t position;
-    uint64_t cur_size;
-    uint64_t *cur_accounted_size;
-    std::string cur_etag;
-    const std::string unique_tag;
-
-    RGWObjManifest *cur_manifest;
-
-    int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
-
-  public:
-    AppendObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
-                          const rgw_placement_rule *ptail_placement_rule,
-                          const rgw_user& owner, RGWObjectCtx& obj_ctx,
-                         std::unique_ptr<rgw::sal::Object> _head_obj,
-                          const std::string& unique_tag, uint64_t position,
-                          uint64_t *cur_accounted_size,
-                          const DoutPrefixProvider *dpp, optional_yield y)
-            : ManifestObjectProcessor(aio, store, ptail_placement_rule,
-                                      owner, obj_ctx, std::move(_head_obj), dpp, y),
-              position(position), cur_size(0), cur_accounted_size(cur_accounted_size),
-              unique_tag(unique_tag), cur_manifest(nullptr)
-    {}
-    int prepare(optional_yield y) override;
-    int complete(size_t accounted_size, const std::string& etag,
-                 ceph::real_time *mtime, ceph::real_time set_mtime,
-                 std::map<std::string, bufferlist>& attrs, ceph::real_time delete_at,
-                 const char *if_match, const char *if_nomatch, const std::string *user_data,
-                 rgw_zone_set *zones_trace, bool *canceled,
-                 optional_yield y) override;
-  };
-
-} // namespace putobj
-} // namespace rgw
-
index b97e83a69b13cc8f663834045d41b8fd921109b3..632cb48171b2532712f8f7b67f8a57d0ef5d96e4 100644 (file)
@@ -13,8 +13,7 @@
  *
  */
 
-#ifndef CEPH_RGW_QUOTA_H
-#define CEPH_RGW_QUOTA_H
+#pragma once
 
 #include "include/utime.h"
 #include "common/config_fwd.h"
@@ -48,5 +47,3 @@ public:
 // apply default quotas from configuration
 void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const ConfigProxy& conf);
 void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const ConfigProxy& conf);
-
-#endif
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
deleted file mode 100644 (file)
index 6779e51..0000000
+++ /dev/null
@@ -1,9715 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "include/compat.h"
-#include <errno.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sstream>
-
-#include <boost/algorithm/string.hpp>
-#include <string_view>
-
-#include <boost/container/flat_set.hpp>
-#include <boost/format.hpp>
-#include <boost/optional.hpp>
-#include <boost/utility/in_place_factory.hpp>
-
-#include "common/ceph_json.h"
-
-#include "common/errno.h"
-#include "common/Formatter.h"
-#include "common/Throttle.h"
-#include "common/BackTrace.h"
-
-#include "rgw_sal.h"
-#include "rgw_zone.h"
-#include "rgw_cache.h"
-#include "rgw_acl.h"
-#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
-#include "rgw_aio_throttle.h"
-#include "driver/rados/rgw_bucket.h"
-#include "rgw_rest_conn.h"
-#include "rgw_cr_rados.h"
-#include "rgw_cr_rest.h"
-#include "rgw_datalog.h"
-#include "rgw_putobj_processor.h"
-
-#include "cls/rgw/cls_rgw_ops.h"
-#include "cls/rgw/cls_rgw_client.h"
-#include "cls/rgw/cls_rgw_const.h"
-#include "cls/refcount/cls_refcount_client.h"
-#include "cls/version/cls_version_client.h"
-#include "osd/osd_types.h"
-
-#include "rgw_tools.h"
-#include "rgw_coroutine.h"
-#include "rgw_compression.h"
-#include "rgw_etag_verifier.h"
-#include "rgw_worker.h"
-#include "rgw_notify.h"
-#include "rgw_http_errors.h"
-
-#undef fork // fails to compile RGWPeriod::fork() below
-
-#include "common/Clock.h"
-
-#include <string>
-#include <iostream>
-#include <vector>
-#include <atomic>
-#include <list>
-#include <map>
-#include "include/random.h"
-
-#include "rgw_gc.h"
-#include "rgw_lc.h"
-
-#include "rgw_object_expirer_core.h"
-#include "rgw_sync.h"
-#include "rgw_sync_counters.h"
-#include "rgw_sync_trace.h"
-#include "rgw_trim_datalog.h"
-#include "rgw_trim_mdlog.h"
-#include "rgw_data_sync.h"
-#include "rgw_realm_watcher.h"
-#include "rgw_reshard.h"
-#include "rgw_cr_rados.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_zone_utils.h"
-#include "services/svc_quota.h"
-#include "services/svc_sync_modules.h"
-#include "services/svc_sys_obj.h"
-#include "services/svc_sys_obj_cache.h"
-#include "services/svc_bucket.h"
-#include "services/svc_mdlog.h"
-
-#include "compressor/Compressor.h"
-
-#include "rgw_d3n_datacache.h"
-
-#ifdef WITH_LTTNG
-#define TRACEPOINT_DEFINE
-#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
-#include "tracing/rgw_rados.h"
-#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
-#undef TRACEPOINT_DEFINE
-#else
-#define tracepoint(...)
-#endif
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-using namespace librados;
-
-#define ldout_bitx(_bitx, _dpp, _level) if(_bitx) { ldpp_dout(_dpp, 0) << "BITX: "
-#define ldout_bitx_c(_bitx, _ctx, _level) if(_bitx) { ldout(_ctx, 0) << "BITX: "
-#define dendl_bitx                      dendl ; }
-
-static string shadow_ns = "shadow";
-static string default_bucket_index_pool_suffix = "rgw.buckets.index";
-static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
-
-static RGWObjCategory main_category = RGWObjCategory::Main;
-#define RGW_USAGE_OBJ_PREFIX "usage."
-
-// Resolve this selection to a concrete raw RADOS object: an already-raw
-// selection is returned as-is; otherwise (placement_rule, obj) is mapped
-// to a raw object by the driver.
-rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RadosStore* driver) const
-{
-  if (!is_raw) {
-    rgw_raw_obj r;
-    driver->get_raw_obj(placement_rule, obj, &r);
-    return r;
-  }
-  return raw_obj;
-}
-
-// Add version checks to a read op: optionally assert the stored version
-// equals the expected one, then read the current version into read_version.
-void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation* op)
-{
-  obj_version* check_objv = version_for_check();
-
-  if (check_objv) {
-    cls_version_check(*op, *check_objv, VER_COND_EQ);
-  }
-
-  cls_version_read(*op, &read_version);
-}
-
-// Add version handling to a write op: optionally assert the expected
-// version, then either set an explicit new version or increment in place.
-void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
-{
-  obj_version* check_objv = version_for_check();
-  obj_version* modify_version = version_for_write();
-
-  if (check_objv) {
-    cls_version_check(*op, *check_objv, VER_COND_EQ);
-  }
-
-  if (modify_version) {
-    cls_version_set(*op, *modify_version);
-  } else {
-    cls_version_inc(*op);
-  }
-}
-
-// Mirror a successful write locally: if the op used cls_version_inc()
-// (write_version.ver == 0) bump read_version, otherwise adopt the
-// explicitly written version. write_version is reset afterwards.
-void RGWObjVersionTracker::apply_write()
-{
-  const bool checked = (read_version.ver != 0);
-  const bool incremented = (write_version.ver == 0);
-
-  if (checked && incremented) {
-    // apply cls_version_inc() so our next operation can recheck it
-    ++read_version.ver;
-  } else {
-    read_version = write_version;
-  }
-  write_version = obj_version();
-}
-
-// Return the cached state entry for `obj`, creating one if absent.
-// Fast path holds only the shared lock; on a miss the shared lock is
-// dropped and the exclusive lock taken to default-insert the entry
-// (another thread may insert in between — operator[] makes that benign).
-RGWObjStateManifest *RGWObjectCtx::get_state(const rgw_obj& obj) {
-  RGWObjStateManifest *result;
-  typename std::map<rgw_obj, RGWObjStateManifest>::iterator iter;
-  lock.lock_shared();
-  assert (!obj.empty());
-  iter = objs_state.find(obj);
-  if (iter != objs_state.end()) {
-    result = &iter->second;
-    lock.unlock_shared();
-  } else {
-    lock.unlock_shared();
-    lock.lock();
-    result = &objs_state[obj];
-    lock.unlock();
-  }
-  return result;
-}
-
-// Mark the cached state for `obj` as compressed (creates the entry if absent).
-void RGWObjectCtx::set_compressed(const rgw_obj& obj) {
-  std::unique_lock wl{lock};
-  assert (!obj.empty());
-  objs_state[obj].state.compressed = true;
-}
-
-// Mark `obj` for atomic-write handling.
-void RGWObjectCtx::set_atomic(rgw_obj& obj) {
-  std::unique_lock wl{lock};
-  assert (!obj.empty());
-  objs_state[obj].state.is_atomic = true;
-}
-// Request data prefetch on the next read of `obj`.
-void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
-  std::unique_lock wl{lock};
-  assert (!obj.empty());
-  objs_state[obj].state.prefetch_data = true;
-}
-
-// Drop the cached state for `obj`, but preserve the caller-set intent
-// flags (atomic/prefetch, and compressed alongside them) by re-inserting
-// a fresh entry carrying only those flags.
-void RGWObjectCtx::invalidate(const rgw_obj& obj) {
-  std::unique_lock wl{lock};
-  auto iter = objs_state.find(obj);
-  if (iter == objs_state.end()) {
-    return;
-  }
-  bool is_atomic = iter->second.state.is_atomic;
-  bool prefetch_data = iter->second.state.prefetch_data;
-  bool compressed = iter->second.state.compressed;
-
-  objs_state.erase(iter);
-
-  if (is_atomic || prefetch_data) {
-    auto& sm = objs_state[obj];
-    sm.state.is_atomic = is_atomic;
-    sm.state.prefetch_data = prefetch_data;
-    sm.state.compressed = compressed;
-  }
-}
-
-// Coroutine manager that fans out "metadata log changed" notifications
-// to peer zones over REST (POST /admin/log?type=metadata&notify).
-class RGWMetaNotifierManager : public RGWCoroutinesManager {
-  RGWRados* store;
-  RGWHTTPManager http_manager;
-
-public:
-  RGWMetaNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
-                                             http_manager(store->ctx(), completion_mgr) {
-    http_manager.start();
-  }
-
-  // Post the set of modified mdlog shard ids to every connected zone;
-  // one coroutine stack per connection, run concurrently.
-  int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
-    rgw_http_param_pair pairs[] = { { "type", "metadata" },
-                                    { "notify", NULL },
-                                    { NULL, NULL } };
-
-    list<RGWCoroutinesStack *> stacks;
-    for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
-      RGWRESTConn *conn = iter->second;
-      RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
-      stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
-
-      stacks.push_back(stack);
-    }
-    return run(dpp, stacks);
-  }
-};
-
-// Coroutine manager that fans out datalog-change notifications (per-shard
-// entry sets) to peer zones, tagging each post with this zone as source.
-class RGWDataNotifierManager : public RGWCoroutinesManager {
-  RGWRados* store;
-  RGWHTTPManager http_manager;
-
-public:
-  RGWDataNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
-                                             http_manager(store->ctx(), completion_mgr) {
-    http_manager.start();
-  }
-
-  // One RGWDataPostNotifyCR per peer connection, run concurrently.
-  int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map,
-               bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& shards) {
-
-    list<RGWCoroutinesStack *> stacks;
-    const char *source_zone = store->svc.zone->get_zone_params().get_id().c_str();
-    for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
-      RGWRESTConn *conn = iter->second;
-      RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
-      stack->call(new RGWDataPostNotifyCR(store, http_manager, shards, source_zone, conn));
-      stacks.push_back(stack);
-    }
-
-    return run(dpp, stacks);
-  }
-};
-
-/* class RGWRadosThread */
-
-// Spawn the worker thread that drives this periodic processor.
-void RGWRadosThread::start()
-{
-  worker = new Worker(cct, this);
-  worker->create(thread_name.c_str());
-}
-
-// Signal shutdown, let the subclass cancel in-flight work (stop_process),
-// then wake and join the worker before deleting it. Safe if never started.
-void RGWRadosThread::stop()
-{
-  down_flag = true;
-  stop_process();
-  if (worker) {
-    worker->signal();
-    worker->join();
-  }
-  delete worker;
-  worker = NULL;
-}
-
-// Worker main loop: run process() repeatedly until shutdown. With a
-// nonzero interval, sleep out the remainder of the interval after each
-// run (skipping the sleep if the run overran); with a zero interval,
-// block until explicitly signaled. The interval is re-read every round
-// so config changes take effect without a restart.
-void *RGWRadosThread::Worker::entry() {
-  uint64_t msec = processor->interval_msec();
-  auto interval = std::chrono::milliseconds(msec);
-
-  do {
-    auto start = ceph::real_clock::now();
-    int r = processor->process(this);
-    if (r < 0) {
-      ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl;
-    }
-
-    if (processor->going_down())
-      break;
-
-    auto end = ceph::real_clock::now() - start;
-
-    uint64_t cur_msec = processor->interval_msec();
-    if (cur_msec != msec) { /* was it reconfigured? */
-      msec = cur_msec;
-      interval = std::chrono::milliseconds(msec);
-    }
-
-    if (cur_msec > 0) {
-      if (interval <= end)
-        continue; // next round
-
-      auto wait_time = interval - end;
-      wait_interval(wait_time);
-    } else {
-      wait();
-    }
-  } while (!processor->going_down());
-
-  return NULL;
-}
-
-// Periodic thread that drains the mdlog's modified-shard set and notifies
-// all peer zones of the changed shards.
-class RGWMetaNotifier : public RGWRadosThread {
-  RGWMetaNotifierManager notify_mgr;
-  RGWMetadataLog *const log;
-
-  uint64_t interval_msec() override {
-    return cct->_conf->rgw_md_notify_interval_msec;
-  }
-  void stop_process() override {
-    notify_mgr.stop();
-  }
-public:
-  RGWMetaNotifier(RGWRados *_driver, RGWMetadataLog* log)
-    : RGWRadosThread(_driver, "meta-notifier"), notify_mgr(_driver), log(log) {}
-
-  int process(const DoutPrefixProvider *dpp) override;
-};
-
-// One notification round: read-and-clear the modified shard ids; if any,
-// log them and fan the set out to the zone connection map.
-int RGWMetaNotifier::process(const DoutPrefixProvider *dpp)
-{
-  set<int> shards;
-
-  log->read_clear_modified(shards);
-
-  if (shards.empty()) {
-    return 0;
-  }
-
-  for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
-    ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
-  }
-
-  notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards);
-
-  return 0;
-}
-
-// Periodic thread that drains the datalog's modified entries and notifies
-// the zones subscribed to data changes.
-class RGWDataNotifier : public RGWRadosThread {
-  RGWDataNotifierManager notify_mgr;
-  bc::flat_set<rgw_data_notify_entry> entry; // NOTE(review): appears unused
-
-  uint64_t interval_msec() override {
-    return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
-  }
-  void stop_process() override {
-    notify_mgr.stop();
-  }
-public:
-  RGWDataNotifier(RGWRados *_driver) : RGWRadosThread(_driver, "data-notifier"), notify_mgr(_driver) {}
-
-  int process(const DoutPrefixProvider *dpp) override;
-};
-
-// One notification round: read-and-clear modified datalog entries per
-// shard, log each, then post the whole map to the data-notify peers.
-int RGWDataNotifier::process(const DoutPrefixProvider *dpp)
-{
-  auto data_log = store->svc.datalog_rados;
-  if (!data_log) {
-    return 0;
-  }
-
-  auto shards = data_log->read_clear_modified();
-
-  if (shards.empty()) {
-    return 0;
-  }
-
-  for (const auto& [shard_id, entries] : shards) {
-    bc::flat_set<rgw_data_notify_entry>::iterator it; // NOTE(review): unused
-    for (const auto& entry : entries) {
-      ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id="
-        << shard_id << ":" << entry.gen << ":" << entry.key << dendl;
-    }
-  }
-
-  notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards);
-
-  return 0;
-}
-
-// Abstract base for the long-running sync threads (meta/data/trim):
-// adds a required init() step on top of RGWRadosThread's process() loop.
-class RGWSyncProcessorThread : public RGWRadosThread {
-public:
-  RGWSyncProcessorThread(RGWRados *_driver, const string& thread_name = "radosgw") : RGWRadosThread(_driver, thread_name) {}
-  RGWSyncProcessorThread(RGWRados *_driver) : RGWRadosThread(_driver) {}
-  ~RGWSyncProcessorThread() override {}
-  int init(const DoutPrefixProvider *dpp) override = 0 ;
-  int process(const DoutPrefixProvider *dpp) override = 0;
-};
-
-// Thread running metadata sync from the master zone. interval_msec()==0
-// means sync.run() executes once and blocks until stop() is called.
-class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
-{
-  RGWMetaSyncStatusManager sync;
-
-  uint64_t interval_msec() override {
-    return 0; /* no interval associated, it'll run once until stopped */
-  }
-  void stop_process() override {
-    sync.stop();
-  }
-public:
-  RGWMetaSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados)
-    : RGWSyncProcessorThread(_driver->getRados(), "meta-sync"), sync(_driver, async_rados) {}
-
-  // Poke the sync state machine for each notified mdlog shard.
-  void wakeup_sync_shards(set<int>& shard_ids) {
-    for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
-      sync.wakeup(*iter);
-    }
-  }
-  RGWMetaSyncStatusManager* get_manager() { return &sync; }
-
-  int init(const DoutPrefixProvider *dpp) override {
-    int ret = sync.init(dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl;
-      return ret;
-    }
-    return 0;
-  }
-
-  int process(const DoutPrefixProvider *dpp) override {
-    sync.run(dpp, null_yield);
-    return 0;
-  }
-};
-
-// Thread running data sync from one source zone. Until sync.init()
-// succeeds, process() is retried every DATA_SYNC_INIT_WAIT_SEC; once
-// initialized, interval becomes 0 and sync.run() blocks until stopped.
-class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
-{
-  PerfCountersRef counters;
-  RGWDataSyncStatusManager sync;
-  bool initialized;
-
-  uint64_t interval_msec() override {
-    if (initialized) {
-      return 0; /* no interval associated, it'll run once until stopped */
-    } else {
-#define DATA_SYNC_INIT_WAIT_SEC 20
-      return DATA_SYNC_INIT_WAIT_SEC * 1000;
-    }
-  }
-  void stop_process() override {
-    sync.stop();
-  }
-public:
-  RGWDataSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
-                             const RGWZone* source_zone)
-    : RGWSyncProcessorThread(_driver->getRados(), "data-sync"),
-      counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
-      sync(_driver, async_rados, source_zone->id, counters.get()),
-      initialized(false) {}
-
-  // Poke the sync state machine with the notified entries, per shard.
-  void wakeup_sync_shards(bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries) {
-    for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
-      sync.wakeup(iter->first, iter->second);
-    }
-  }
-
-  RGWDataSyncStatusManager* get_manager() { return &sync; }
-
-  // Real initialization is deferred to process() so failures are retried.
-  int init(const DoutPrefixProvider *dpp) override {
-    return 0;
-  }
-
-  int process(const DoutPrefixProvider *dpp) override {
-    while (!initialized) {
-      if (going_down()) {
-        return 0;
-      }
-      int ret = sync.init(dpp);
-      if (ret >= 0) {
-        initialized = true;
-        break;
-      }
-      /* we'll be back! */
-      return 0;
-    }
-    sync.run(dpp);
-    return 0;
-  }
-};
-
-// Thread trimming the replication logs: always trims the mdlog; when this
-// zone exports data, also trims the datalog and bucket-index logs. All
-// trim coroutines run concurrently under one coroutine manager.
-class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
-{
-  RGWCoroutinesManager crs;
-  rgw::sal::RadosStore* store;
-  rgw::BucketTrimManager *bucket_trim;
-  RGWHTTPManager http;
-  const utime_t trim_interval; // seconds between trim rounds inside the CRs
-
-  uint64_t interval_msec() override { return 0; }
-  void stop_process() override { crs.stop(); }
-public:
-  RGWSyncLogTrimThread(rgw::sal::RadosStore* store, rgw::BucketTrimManager *bucket_trim,
-                       int interval)
-    : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
-      crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
-      bucket_trim(bucket_trim),
-      http(store->ctx(), crs.get_completion_mgr()),
-      trim_interval(interval, 0)
-  {}
-
-  int init(const DoutPrefixProvider *dpp) override {
-    return http.start();
-  }
-  int process(const DoutPrefixProvider *dpp) override {
-    list<RGWCoroutinesStack*> stacks;
-    auto metatrimcr = create_meta_log_trim_cr(this, static_cast<rgw::sal::RadosStore*>(store), &http,
-                                             cct->_conf->rgw_md_log_max_shards,
-                                             trim_interval);
-    if (!metatrimcr) {
-      ldpp_dout(dpp, -1) << "Bailing out of trim thread!" << dendl;
-      return -EINVAL;
-    }
-    auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
-    meta->call(metatrimcr);
-
-    stacks.push_back(meta);
-
-    // data/bucket trim only matter when peers sync data from this zone
-    if (store->svc()->zone->sync_module_exports_data()) {
-      auto data = new RGWCoroutinesStack(store->ctx(), &crs);
-      data->call(create_data_log_trim_cr(dpp, static_cast<rgw::sal::RadosStore*>(store), &http,
-                                         cct->_conf->rgw_data_log_num_shards,
-                                         trim_interval));
-      stacks.push_back(data);
-
-      auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
-      bucket->call(bucket_trim->create_bucket_trim_cr(&http));
-      stacks.push_back(bucket);
-    }
-
-    crs.run(dpp, stacks);
-    return 0;
-  }
-
-  // implements DoutPrefixProvider
-  CephContext *get_cct() const override { return store->ctx(); }
-  unsigned get_subsys() const override
-  {
-    return dout_subsys;
-  }
-
-  std::ostream& gen_prefix(std::ostream& out) const override
-  {
-    return out << "sync log trim: ";
-  }
-
-};
-
-// Forward a metadata-change notification to the meta sync thread, if running.
-void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
-{
-  std::lock_guard l{meta_sync_thread_lock};
-  if (meta_sync_processor_thread) {
-    meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
-  }
-}
-
-// Forward a data-change notification to the sync thread for `source_zone`.
-// Logging loop only; no notification is dropped if logging is off.
-void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries)
-{
-  ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", entries=" << entries << dendl;
-  for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
-    ldpp_dout(dpp, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
-    bc::flat_set<rgw_data_notify_entry>& entries = iter->second; // NOTE(review): shadows the parameter
-    for (const auto& [key, gen] : entries) {
-      ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", key=" << key
-                        << ", gen=" << gen << dendl;
-    }
-  }
-
-  std::lock_guard l{data_sync_thread_lock};
-  auto iter = data_sync_processor_threads.find(source_zone);
-  if (iter == data_sync_processor_threads.end()) {
-    ldpp_dout(dpp, 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
-    return;
-  }
-
-  RGWDataSyncProcessorThread *thread = iter->second;
-  ceph_assert(thread);
-  thread->wakeup_sync_shards(entries);
-}
-
-// Accessor for the meta sync status manager; nullptr when the meta sync
-// thread isn't running.
-RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
-{
-  std::lock_guard l{meta_sync_thread_lock};
-  if (meta_sync_processor_thread) {
-    return meta_sync_processor_thread->get_manager();
-  }
-  return nullptr;
-}
-
-// Accessor for the data sync status manager of `source_zone`; nullptr
-// when no sync thread exists for that zone.
-RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
-{
-  std::lock_guard l{data_sync_thread_lock};
-  auto thread = data_sync_processor_threads.find(source_zone);
-  if (thread == data_sync_processor_threads.end()) {
-    return nullptr;
-  }
-  return thread->second->get_manager();
-}
-
-// Query the pool (typically EC-backed) for its required write alignment.
-// *alignment is 0 when the pool imposes none. Returns 0 or a negative
-// librados error.
-int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment)
-{
-  IoCtx ioctx;
-  int r = open_pool_ctx(dpp, pool, ioctx, false);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
-    return r;
-  }
-
-  bool req;
-  r = ioctx.pool_requires_alignment2(&req);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
-      << r << dendl;
-    return r;
-  }
-
-  if (!req) {
-    *alignment = 0;
-    return 0;
-  }
-
-  uint64_t align;
-  r = ioctx.pool_required_alignment2(&align);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
-      << r << dendl;
-    return r;
-  }
-  if (align != 0) {
-    ldpp_dout(dpp, 20) << "required alignment=" << align << dendl;
-  }
-  *alignment = align;
-  return 0;
-}
-
-// Round `size` down to the largest multiple of `alignment` that fits,
-// never going below one full alignment unit; with no alignment the size
-// passes through unchanged.
-void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
-{
-  if (alignment == 0) {
-    *max_size = size;
-    return;
-  }
-
-  if (size <= alignment) {
-    *max_size = alignment;
-    return;
-  }
-
-  *max_size = size - (size % alignment);
-}
-
-// Compute the largest usable write-chunk size for `pool`: the configured
-// rgw_max_chunk_size rounded down to the pool's required alignment.
-// Optionally reports the alignment via *palignment.
-int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
-{
-  uint64_t alignment;
-  int r = get_required_alignment(dpp, pool, &alignment);
-  if (r < 0) {
-    return r;
-  }
-
-  if (palignment) {
-    *palignment = alignment;
-  }
-
-  uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
-
-  get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
-
-  ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
-
-  return 0;
-}
-
-// Convenience overload: resolve the object's data pool from its placement
-// rule first, then delegate to the pool-based overload above.
-int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
-                                 uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
-{
-  rgw_pool pool;
-  if (!get_obj_data_pool(placement_rule, obj, &pool)) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
-    return -EIO;
-  }
-  return get_max_chunk_size(pool, max_chunk_size, dpp, palignment);
-}
-
-// Record a bucket-shard change in the current datalog generation.
-// No-op when the bucket has no log layout; a write failure is logged
-// but deliberately not propagated (datalog errors are non-fatal).
-void add_datalog_entry(const DoutPrefixProvider* dpp,
-                       RGWDataChangesLog* datalog,
-                       const RGWBucketInfo& bucket_info,
-                       uint32_t shard_id)
-{
-  const auto& logs = bucket_info.layout.logs;
-  if (logs.empty()) {
-    return;
-  }
-  int r = datalog->add_entry(dpp, bucket_info, logs.back(), shard_id);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
-  } // datalog error is not fatal
-}
-
-class RGWIndexCompletionManager;
-
-// Context for one asynchronous bucket-index completion: everything needed
-// to (re)apply the index op if the shard was resharding, plus the AIO
-// completion and back-pointer to the owning manager.
-struct complete_op_data {
-  ceph::mutex lock = ceph::make_mutex("complete_op_data");
-  AioCompletion *rados_completion{nullptr};
-  int manager_shard_id{-1};      // which manager lock/completion-set owns us
-  RGWIndexCompletionManager *manager{nullptr};
-  rgw_obj obj;
-  RGWModifyOp op;
-  string tag;
-  rgw_bucket_entry_ver ver;
-  cls_rgw_obj_key key;
-  rgw_bucket_dir_entry_meta dir_meta;
-  list<cls_rgw_obj_key> remove_objs;
-  bool log_op;
-  uint16_t bilog_op;
-  rgw_zone_set zones_trace;
-
-  bool stopped{false};           // set by manager shutdown; callback self-deletes
-
-  void stop() {
-    std::lock_guard l{lock};
-    stopped = true;
-  }
-};
-
-// Tracks in-flight async bucket-index completions, sharded across
-// `num_shards` lock/set pairs to reduce contention, and runs a retry
-// thread that replays ops rejected with -ERR_BUSY_RESHARDING.
-class RGWIndexCompletionManager {
-  RGWRados* const store;
-  const uint32_t num_shards;
-  ceph::containers::tiny_vector<ceph::mutex> locks;   // one lock per shard
-  std::vector<set<complete_op_data*>> completions;    // outstanding ops per shard
-  std::vector<complete_op_data*> retry_completions;   // queue for the retry thread
-
-  std::condition_variable cond;
-  std::mutex retry_completions_lock;
-  bool _stop{false};
-  std::thread retry_thread;
-
-  // used to distribute the completions and the locks they use across
-  // their respective vectors; it will get incremented and can wrap
-  // around back to 0 without issue
-  std::atomic<uint32_t> cur_shard {0};
-
-  // retry-thread body: replays queued completions (see definition below)
-  void process();
-  
-  // hand a resharding-busy completion to the retry thread
-  void add_completion(complete_op_data *completion);
-  
-  // Shut down: join the retry thread, then mark every outstanding
-  // completion stopped so its callback frees itself.
-  void stop() {
-    if (retry_thread.joinable()) {
-      _stop = true;
-      cond.notify_all();
-      retry_thread.join();
-    }
-
-    for (uint32_t i = 0; i < num_shards; ++i) {
-      std::lock_guard l{locks[i]};
-      for (auto c : completions[i]) {
-        c->stop();
-      }
-    }
-    completions.clear();
-  }
-  
-  uint32_t next_shard() {
-    return cur_shard++ % num_shards;
-  }
-
-public:
-  RGWIndexCompletionManager(RGWRados *_driver) :
-    store(_driver),
-    num_shards(store->ctx()->_conf->rgw_thread_pool_size),
-    locks{ceph::make_lock_container<ceph::mutex>(
-      num_shards,
-      [](const size_t i) {
-        return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
-                               std::to_string(i));
-      })},
-    completions(num_shards),
-    retry_thread(&RGWIndexCompletionManager::process, this)
-    {}
-
-  ~RGWIndexCompletionManager() {
-    stop();
-  }
-
-  // Allocate and register a completion context; *result receives the
-  // new entry, whose AIO completion fires obj_complete_cb.
-  void create_completion(const rgw_obj& obj,
-                         RGWModifyOp op, string& tag,
-                         rgw_bucket_entry_ver& ver,
-                         const cls_rgw_obj_key& key,
-                         rgw_bucket_dir_entry_meta& dir_meta,
-                         list<cls_rgw_obj_key> *remove_objs, bool log_op,
-                         uint16_t bilog_op,
-                         rgw_zone_set *zones_trace,
-                         complete_op_data **result);
-
-  // Returns true when the caller should delete `arg` (done or untracked);
-  // false when it was requeued for retry after -ERR_BUSY_RESHARDING.
-  bool handle_completion(completion_t cb, complete_op_data *arg);
-
-  CephContext* ctx() {
-    return store->ctx();
-  }
-};
-
-// librados AIO callback for index completions. If the manager already
-// stopped this entry, the callback owns it and frees it; otherwise the
-// manager decides (retry vs done) and tells us whether to delete.
-static void obj_complete_cb(completion_t cb, void *arg)
-{
-  complete_op_data *completion = reinterpret_cast<complete_op_data*>(arg);
-  completion->lock.lock();
-  if (completion->stopped) {
-    completion->lock.unlock(); /* can drop lock, no one else is referencing us */
-    delete completion;
-    return;
-  }
-  bool need_delete = completion->manager->handle_completion(cb, completion);
-  completion->lock.unlock();
-  if (need_delete) {
-    delete completion;
-  }
-}
-
-// Retry-thread main loop: wait for completions that failed with
-// -ERR_BUSY_RESHARDING, then re-issue each bucket-index complete op
-// under guard_reshard (which re-resolves the shard after a reshard)
-// and append the datalog entry on success. Errors are logged and the
-// entry dropped — there is nothing more to do at this point.
-void RGWIndexCompletionManager::process()
-{
-  DoutPrefix dpp(store->ctx(), dout_subsys, "rgw index completion thread: ");
-  while(!_stop) {
-    std::vector<complete_op_data*> comps;
-
-    {
-      std::unique_lock l{retry_completions_lock};
-      cond.wait(l, [this](){return _stop || !retry_completions.empty();});
-      if (_stop) {
-        return;
-      }
-      retry_completions.swap(comps); // drain the queue under the lock
-    }
-
-    for (auto c : comps) {
-      std::unique_ptr<complete_op_data> up{c}; // owns c for this iteration
-
-      ldpp_dout(&dpp, 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
-
-      RGWRados::BucketShard bs(store);
-      RGWBucketInfo bucket_info;
-
-      int r = bs.init(c->obj.bucket, c->obj, &bucket_info, &dpp);
-      if (r < 0) {
-        ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
-        /* not much to do */
-        continue;
-      }
-
-      r = store->guard_reshard(&dpp, &bs, c->obj, bucket_info,
-                            [&](RGWRados::BucketShard *bs) -> int {
-                              const bool bitx = ctx()->_conf->rgw_bucket_index_transaction_instrumentation;
-                              ldout_bitx(bitx, &dpp, 10) <<
-                                "ENTERING " << __func__ << ": bucket-shard=" << bs <<
-                                " obj=" << c->obj << " tag=" << c->tag <<
-                                " op=" << c->op << ", remove_objs=" << c->remove_objs << dendl_bitx;
-                              ldout_bitx(bitx, &dpp, 25) <<
-                                "BACKTRACE: " << __func__ << ": " << ClibBackTrace(1) << dendl_bitx;
-
-                              librados::ObjectWriteOperation o;
-                              o.assert_exists();
-                              cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
-                              cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
-                                                         c->log_op, c->bilog_op, &c->zones_trace);
-                              int ret = bs->bucket_obj.operate(&dpp, &o, null_yield);
-                              ldout_bitx(bitx, &dpp, 10) <<
-                                "EXITING " << __func__ << ": ret=" << dendl_bitx;
-                              return ret;
-                             });
-      if (r < 0) {
-        ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
-        /* ignoring error, can't do anything about it */
-        continue;
-      }
-
-      add_datalog_entry(&dpp, store->svc.datalog_rados, bucket_info, bs.shard_id);
-    }
-  }
-}
-
-// Build a completion context carrying everything needed to (re)apply the
-// index op, register it in a shard's outstanding set, and attach the AIO
-// callback. If no zones_trace is given, seed one with the local zone so
-// the change is not echoed back to us.
-void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
-                                                  RGWModifyOp op, string& tag,
-                                                  rgw_bucket_entry_ver& ver,
-                                                  const cls_rgw_obj_key& key,
-                                                  rgw_bucket_dir_entry_meta& dir_meta,
-                                                  list<cls_rgw_obj_key> *remove_objs, bool log_op,
-                                                  uint16_t bilog_op,
-                                                  rgw_zone_set *zones_trace,
-                                                  complete_op_data **result)
-{
-  complete_op_data *entry = new complete_op_data;
-
-  int shard_id = next_shard();
-
-  entry->manager_shard_id = shard_id;
-  entry->manager = this;
-  entry->obj = obj;
-  entry->op = op;
-  entry->tag = tag;
-  entry->ver = ver;
-  entry->key = key;
-  entry->dir_meta = dir_meta;
-  entry->log_op = log_op;
-  entry->bilog_op = bilog_op;
-
-  if (remove_objs) {
-    for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
-      entry->remove_objs.push_back(*iter);
-    }
-  }
-
-  if (zones_trace) {
-    entry->zones_trace = *zones_trace;
-  } else {
-    entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
-  }
-
-  *result = entry;
-
-  entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
-
-  std::lock_guard l{locks[shard_id]};
-  const auto ok = completions[shard_id].insert(entry).second;
-  ceph_assert(ok); // a freshly-allocated pointer can never already be present
-}
-
-// Queue a completion for the retry thread and wake it.
-void RGWIndexCompletionManager::add_completion(complete_op_data *completion) {
-  {
-    std::lock_guard l{retry_completions_lock};
-    retry_completions.push_back(completion);
-  }
-  cond.notify_all();
-}
-
-// Called from the AIO callback. Remove `arg` from its shard's outstanding
-// set; if the op failed specifically with -ERR_BUSY_RESHARDING, hand it to
-// the retry thread and return false (caller must NOT delete). Any other
-// result — success or failure — is final: return true so the caller frees it.
-bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
-{
-  int shard_id = arg->manager_shard_id;
-  {
-    std::lock_guard l{locks[shard_id]};
-
-    auto& comps = completions[shard_id];
-
-    auto iter = comps.find(arg);
-    if (iter == comps.end()) {
-      ldout(arg->manager->ctx(), 0) << __func__ << "(): cannot find completion for obj=" << arg->key << dendl;
-      return true;
-    }
-
-    comps.erase(iter);
-  }
-
-  int r = rados_aio_get_return_value(cb);
-  if (r != -ERR_BUSY_RESHARDING) {
-    ldout(arg->manager->ctx(), 20) << __func__ << "(): completion " << 
-      (r == 0 ? "ok" : "failed with " + to_string(r)) << 
-      " for obj=" << arg->key << dendl;
-    return true;
-  }
-  add_completion(arg);
-  ldout(arg->manager->ctx(), 20) << __func__ << "(): async completion added for obj=" << arg->key << dendl;
-  return false;
-}
-
-// Orderly shutdown of RGWRados. Ordering matters: drain async requests
-// first, stop sync/notifier threads before deleting them, shut services
-// down before freeing caches they might reference, and stop the reshard
-// processor before deleting it.
-void RGWRados::finalize()
-{
-  /* Before joining any sync threads, drain outstanding requests &
-   * mark the async_processor as going_down() */
-  if (svc.rados) {
-    svc.rados->stop_processor();
-  }
-
-  // phase 1: stop all sync-related threads
-  if (run_sync_thread) {
-    std::lock_guard l{meta_sync_thread_lock};
-    meta_sync_processor_thread->stop();
-
-    std::lock_guard dl{data_sync_thread_lock};
-    for (auto iter : data_sync_processor_threads) {
-      RGWDataSyncProcessorThread *thread = iter.second;
-      thread->stop();
-    }
-    if (sync_log_trimmer) {
-      sync_log_trimmer->stop();
-    }
-  }
-  // phase 2: delete them once all are stopped
-  if (run_sync_thread) {
-    delete meta_sync_processor_thread;
-    meta_sync_processor_thread = NULL;
-    std::lock_guard dl{data_sync_thread_lock};
-    for (auto iter : data_sync_processor_threads) {
-      RGWDataSyncProcessorThread *thread = iter.second;
-      delete thread;
-    }
-    data_sync_processor_threads.clear();
-    delete sync_log_trimmer;
-    sync_log_trimmer = nullptr;
-    bucket_trim = boost::none;
-  }
-  if (meta_notifier) {
-    meta_notifier->stop();
-    delete meta_notifier;
-  }
-  if (data_notifier) {
-    data_notifier->stop();
-    delete data_notifier;
-  }
-  delete sync_tracer;
-  
-  delete lc;
-  lc = NULL; 
-
-  delete gc;
-  gc = NULL;
-
-  delete obj_expirer;
-  obj_expirer = NULL;
-
-  RGWQuotaHandler::free_handler(quota_handler);
-  if (cr_registry) {
-    cr_registry->put(); // refcounted, not deleted directly
-  }
-
-  svc.shutdown();
-
-  delete binfo_cache;
-  delete obj_tombstone_cache;
-  if (d3n_data_cache)
-    delete d3n_data_cache;
-
-  if (reshard_wait.get()) {
-    reshard_wait->stop();
-    reshard_wait.reset();
-  }
-
-  if (run_reshard_thread) {
-    reshard->stop_processor();
-  }
-  delete reshard;
-  delete index_completion_manager;
-
-  rgw::notify::shutdown();
-}
-
-/** 
- * Initialize the RADOS instance and prepare to do other ops
- * Returns 0 on success, -ERR# on failure.
- */
-int RGWRados::init_rados()
-{
-  int ret = 0;
-
-  ret = rados.init_with_context(cct);
-  if (ret < 0) {
-    return ret;
-  }
-  ret = rados.connect();
-  if (ret < 0) {
-    return ret;
-  }
-
-  auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
-    new RGWCoroutinesManagerRegistry(cct)};
-  ret = crs->hook_to_admin_command("cr dump");
-  if (ret < 0) {
-    return ret;
-  }
-
-  cr_registry = crs.release();
-
-  if (use_datacache) {
-    d3n_data_cache = new D3nDataCache();
-    d3n_data_cache->init(cct);
-  }
-
-  return ret;
-}
-
-int RGWRados::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, const map<string, string>& meta)
-{
-  string name = cct->_conf->name.get_id();
-  if (name.compare(0, 4, "rgw.") == 0) {
-    name = name.substr(4);
-  }
-  map<string,string> metadata = meta;
-  metadata["num_handles"] = "1"s;
-  metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
-  metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
-  metadata["zone_name"] = svc.zone->zone_name();
-  metadata["zone_id"] = svc.zone->zone_id().id;
-  metadata["realm_name"] = svc.zone->get_realm().get_name();
-  metadata["realm_id"] = svc.zone->get_realm().get_id();
-  metadata["id"] = name;
-  int ret = rados.service_daemon_register(
-    daemon_type,
-    stringify(rados.get_instance_id()),
-    metadata);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWRados::update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status)
-{
-  int ret = rados.service_daemon_update_status(move(status));
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
/** 
 * Finish initialization once the service layer (svc) is up: open the
 * gateway's control pools and start the GC / LC / sync / reshard /
 * notification machinery according to configuration.
 * (The previous header comment was a copy-paste from init_rados.)
 * Returns 0 on success, -ERR# on failure.
 */
int RGWRados::init_complete(const DoutPrefixProvider *dpp)
{
  int ret;

  /* 
   * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
   */
  sync_module = svc.sync_modules->get_sync_module();

  // open all control pools up front; each helper creates its pool if missing
  ret = open_root_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_gc_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_lc_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_objexp_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_reshard_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_notif_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  pools_initialized = true;

  if (use_gc) {
    gc = new RGWGC();
    gc->initialize(cct, this);
  } else {
    ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl;
  }

  obj_expirer = new RGWObjectExpirer(this->driver);

  // the expirer thread only runs alongside the GC thread
  if (use_gc_thread && use_gc) {
    gc->start_processor();
    obj_expirer->start_processor();
  }

  auto& current_period = svc.zone->get_current_period();
  auto& zonegroup = svc.zone->get_zonegroup();
  auto& zone_params = svc.zone->get_zone_params();
  auto& zone = svc.zone->get_zone();

  /* no point of running sync thread if we don't have a master zone configured
    or there is no rest_master_conn */
  if (!svc.zone->need_to_sync()) {
    run_sync_thread = false;
  }

  if (svc.zone->is_meta_master()) {
    auto md_log = svc.mdlog->get_log(current_period.get_id());
    meta_notifier = new RGWMetaNotifier(this, md_log);
    meta_notifier->start();
  }

  /* init it anyway, might run sync through radosgw-admin explicitly */
  sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
  sync_tracer->init(this);
  ret = sync_tracer->hook_to_admin_command();
  if (ret < 0) {
    return ret;
  }

  if (run_sync_thread) {
    // warn about placement targets with no local pool mapping
    for (const auto &pt: zonegroup.placement_targets) {
      if (zone_params.placement_pools.find(pt.second.name)
          == zone_params.placement_pools.end()){
        ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target "
                      << pt.second.name << " present in zonegroup" << dendl;
      }
    }
    auto async_processor = svc.rados->get_async_processor();
    std::lock_guard l{meta_sync_thread_lock};
    meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->driver, async_processor);
    ret = meta_sync_processor_thread->init(dpp);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
      return ret;
    }
    meta_sync_processor_thread->start();

    // configure the bucket trim manager
    rgw::BucketTrimConfig config;
    rgw::configure_bucket_trim(cct, config);

    bucket_trim.emplace(this->driver, config);
    ret = bucket_trim->init();
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl;
      return ret;
    }
    svc.datalog_rados->set_observer(&*bucket_trim);

    // one data-sync thread per peer zone we sync from
    std::lock_guard dl{data_sync_thread_lock};
    for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
      ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
      auto *thread = new RGWDataSyncProcessorThread(this->driver, svc.rados->get_async_processor(), source_zone);
      ret = thread->init(dpp);
      if (ret < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl;
        return ret;
      }
      thread->start();
      data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
    }
    auto interval = cct->_conf->rgw_sync_log_trim_interval;
    if (interval > 0) {
      sync_log_trimmer = new RGWSyncLogTrimThread(this->driver, &*bucket_trim, interval);
      ret = sync_log_trimmer->init(dpp);
      if (ret < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
        return ret;
      }
      sync_log_trimmer->start();
    }
  }
  if (cct->_conf->rgw_data_notify_interval_msec) {
    data_notifier = new RGWDataNotifier(this);
    data_notifier->start();
  }

  binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
  binfo_cache->init(svc.cache);

  lc = new RGWLC();
  lc->initialize(cct, this->driver);

  if (use_lc_thread)
    lc->start_processor();

  quota_handler = RGWQuotaHandler::generate_handler(dpp, this->driver, quota_threads);

  // zone override wins, but never exceed the hard cap
  bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
                             zone.bucket_index_max_shards);
  if (bucket_index_max_shards > get_max_bucket_shards()) {
    bucket_index_max_shards = get_max_bucket_shards();
    ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: "
      << get_max_bucket_shards() << dendl;
  }
  ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;

  bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */

  if (need_tombstone_cache) {
    obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
  }

  reshard_wait = std::make_shared<RGWReshardWait>();

  reshard = new RGWReshard(this->driver);

  // disable reshard thread based on zone/zonegroup support
  run_reshard_thread = run_reshard_thread && svc.zone->can_reshard();

  if (run_reshard_thread)  {
    reshard->start_processor();
  }

  index_completion_manager = new RGWIndexCompletionManager(this);
  // NOTE: a notify-init failure is logged but still returned to the caller
  ret = rgw::notify::init(cct, driver, dpp);
  if (ret < 0 ) {
    ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl;
  }

  return ret;
}
-
// Bring up the service layer. With raw=true only svc.init_raw() runs
// (note it does not take run_sync_thread); presumably for tooling that
// works without full zone setup -- confirm with callers.
int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp)
{
  if (raw) {
    return svc.init_raw(cct, use_cache, null_yield, dpp);
  }

  return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp);
}
-
// Initialize the control layer on top of the service layer and driver.
int RGWRados::init_ctl(const DoutPrefixProvider *dpp)
{
  return ctl.init(&svc, driver, dpp);
}
-
/** 
 * First stage of store initialization: read the notify tunables,
 * bring up the service (svc) and control (ctl) layers, then connect
 * librados via init_rados(). (The previous header comment was a
 * copy-paste from init_rados.)
 * Returns 0 on success, -ERR# on failure.
 */
int RGWRados::init_begin(const DoutPrefixProvider *dpp)
{
  int ret;

  inject_notify_timeout_probability =
    cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
  max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");

  ret = init_svc(false, dpp);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
    return ret;
  }

  ret = init_ctl(dpp);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
    return ret;
  }

  host_id = svc.zone_utils->gen_host_id();

  return init_rados();
}
-
/**
 * Open the pool used as root for this gateway
 * Returns: 0 on success, -ERR# otherwise.
 */
int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp)
{
  // trailing args: create the pool if missing, mostly-omap hint
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
}
-
// Open (creating if needed) the garbage-collection pool.
int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
}
-
// Open (creating if needed) the lifecycle pool.
int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
}
-
// Open (creating if needed) the log pool used for object expiration.
int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
}
-
// Open (creating if needed) the reshard pool.
int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
}
-
// Open (creating if needed) the notification pool.
int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true);
}
-
// Open an arbitrary pool into io_ctx, creating it if it doesn't exist;
// mostly_omap forwards the omap-heavy placement hint.
int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
                           bool mostly_omap)
{
  constexpr bool create = true; // create the pool if it doesn't exist
  return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap);
}
-
-/**** logs ****/
-
// State carried between log_list_init() and log_list_next(): the pool
// handle, the prefix filter, and the current object-listing position.
// Allocated by log_list_init(); freed by log_list_next() at end of list.
struct log_list_state {
  string prefix;
  librados::IoCtx io_ctx;
  librados::NObjectIterator obit;
};
-
-int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle)
-{
-  log_list_state *state = new log_list_state;
-  int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
-  if (r < 0) {
-    delete state;
-    return r;
-  }
-  state->prefix = prefix;
-  state->obit = state->io_ctx.nobjects_begin();
-  *handle = (RGWAccessHandle)state;
-  return 0;
-}
-
-int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
-{
-  log_list_state *state = static_cast<log_list_state *>(handle);
-  while (true) {
-    if (state->obit == state->io_ctx.nobjects_end()) {
-      delete state;
-      return -ENOENT;
-    }
-    if (state->prefix.length() &&
-       state->obit->get_oid().find(state->prefix) != 0) {
-      state->obit++;
-      continue;
-    }
-    *name = state->obit->get_oid();
-    state->obit++;
-    break;
-  }
-  return 0;
-}
-
-int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name)
-{
-  librados::IoCtx io_ctx;
-  int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
-  if (r < 0)
-    return r;
-  return io_ctx.remove(name);
-}
-
-struct log_show_state {
-  librados::IoCtx io_ctx;
-  bufferlist bl;
-  bufferlist::const_iterator p;
-  string name;
-  uint64_t pos;
-  bool eof;
-  log_show_state() : pos(0), eof(false) {}
-};
-
-int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle)
-{
-  log_show_state *state = new log_show_state;
-  int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
-  if (r < 0) {
-    delete state;
-    return r;
-  }
-  state->name = name;
-  *handle = (RGWAccessHandle)state;
-  return 0;
-}
-
/* Decode the next rgw_log_entry from a log object opened with
 * log_show_init(). Data is read in 1MB chunks into state->bl; the
 * consumed head of the buffer is trimmed on each refill.
 * Returns 1 when an entry was decoded, 0 at end of data, -ERR# on
 * failure. */
int RGWRados::log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry)
{
  log_show_state *state = static_cast<log_show_state *>(handle);
  off_t off = state->p.get_off();

  ldpp_dout(dpp, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
          << " off " << off
          << " eof " << (int)state->eof
          << dendl;
  // read some?
  unsigned chunk = 1024*1024;
  if ((state->bl.length() - off) < chunk/2 && !state->eof) {
    bufferlist more;
    int r = state->io_ctx.read(state->name, more, chunk, state->pos);
    if (r < 0)
      return r;
    state->pos += r;
    bufferlist old;
    try {
      // keep only the not-yet-consumed tail of the current buffer
      old.substr_of(state->bl, off, state->bl.length() - off);
    } catch (buffer::error& err) {
      return -EINVAL;
    }
    state->bl = std::move(old);
    state->bl.claim_append(more);
    // buffer was rebuilt, so the decode iterator restarts at offset 0
    state->p = state->bl.cbegin();
    if ((unsigned)r < chunk)
      state->eof = true; // short read: object has no more data
    ldpp_dout(dpp, 10) << " read " << r << dendl;
  }

  if (state->p.end())
    return 0;  // end of file
  try {
    decode(*entry, state->p);
  }
  catch (const buffer::error &e) {
    return -EINVAL;
  }
  return 1;
}
-
-/**
- * usage_log_hash: get usage log key hash, based on name and index
- *
- * Get the usage object name. Since a user may have more than 1
- * object holding that info (multiple shards), we use index to
- * specify that shard number. Once index exceeds max shards it
- * wraps.
- * If name is not being set, results for all users will be returned
- * and index will wrap only after total shards number.
- *
- * @param cct [in] ceph context
- * @param name [in] user name
- * @param hash [out] hash value
- * @param index [in] shard index number 
- */
-static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
-{
-  uint32_t val = index;
-
-  if (!name.empty()) {
-    int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
-    val %= max_user_shards;
-    val += ceph_str_hash_linux(name.c_str(), name.size());
-  }
-  char buf[17];
-  int max_shards = cct->_conf->rgw_usage_max_shards;
-  snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
-  hash = buf;
-}
-
-int RGWRados::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
-{
-  uint32_t index = 0;
-
-  map<string, rgw_usage_log_info> log_objs;
-
-  string hash;
-  string last_user;
-
-  /* restructure usage map, zone by object hash */
-  map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
-  for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
-    const rgw_user_bucket& ub = iter->first;
-    RGWUsageBatch& info = iter->second;
-
-    if (ub.user.empty()) {
-      ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
-      continue;
-    }
-
-    if (ub.user != last_user) {
-      /* index *should* be random, but why waste extra cycles
-         in most cases max user shards is not going to exceed 1,
-         so just incrementing it */
-      usage_log_hash(cct, ub.user, hash, index++);
-    }
-    last_user = ub.user;
-    vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
-
-    for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
-      v.push_back(miter->second);
-    }
-  }
-
-  map<string, rgw_usage_log_info>::iterator liter;
-
-  for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
-    int r = cls_obj_usage_log_add(dpp, liter->first, liter->second);
-    if (r < 0)
-      return r;
-  }
-  return 0;
-}
-
-int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
-                         uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
-                        rgw_usage_log_entry>& usage)
-{
-  uint32_t num = max_entries;
-  string hash, first_hash;
-  string user_str = user.to_str();
-  usage_log_hash(cct, user_str, first_hash, 0);
-
-  if (usage_iter.index) {
-    usage_log_hash(cct, user_str, hash, usage_iter.index);
-  } else {
-    hash = first_hash;
-  }
-
-  usage.clear();
-
-  do {
-    map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
-    map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
-
-    int ret =  cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num,
-                                    usage_iter.read_iter, ret_usage, is_truncated);
-    if (ret == -ENOENT)
-      goto next;
-
-    if (ret < 0)
-      return ret;
-
-    num -= ret_usage.size();
-
-    for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
-      usage[iter->first].aggregate(iter->second);
-    }
-
-next:
-    if (!*is_truncated) {
-      usage_iter.read_iter.clear();
-      usage_log_hash(cct, user_str, hash, ++usage_iter.index);
-    }
-  } while (num && !*is_truncated && hash != first_hash);
-  return 0;
-}
-
-int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
-{
-  uint32_t index = 0;
-  string hash, first_hash;
-  string user_str = user.to_str();
-  usage_log_hash(cct, user_str, first_hash, index);
-
-  hash = first_hash;
-  do {
-    int ret =  cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch);
-
-    if (ret < 0 && ret != -ENOENT)
-      return ret;
-
-    usage_log_hash(cct, user_str, hash, ++index);
-  } while (hash != first_hash);
-
-  return 0;
-}
-
-
-int RGWRados::clear_usage(const DoutPrefixProvider *dpp)
-{
-  auto max_shards = cct->_conf->rgw_usage_max_shards;
-  int ret=0;
-  for (unsigned i=0; i < max_shards; i++){
-    string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
-    ret = cls_obj_usage_log_clear(dpp, oid);
-    if (ret < 0){
-      ldpp_dout(dpp,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
-      return ret;
-    }
-  }
-  return ret;
-}
-
-int RGWRados::decode_policy(const DoutPrefixProvider *dpp,
-                           ceph::buffer::list& bl,
-                           ACLOwner *owner)
-{
-  auto i = bl.cbegin();
-  RGWAccessControlPolicy policy(cct);
-  try {
-    policy.decode_owner(i);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
-    return -EIO;
-  }
-  *owner = policy.get_owner();
-  return 0;
-}
-
// Point this Bucket at a new bucket-instance id and reload the instance
// info from the store. The objv tracker is cleared first so the reload
// is unconditional. Returns 0 on success, -ERR# on failure.
int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp)
{
  rgw_bucket bucket = bucket_info.bucket;
  bucket.update_bucket_id(new_bucket_id);

  bucket_info.objv_tracker.clear();
  int ret = store->get_bucket_instance_info(bucket, bucket_info, nullptr, nullptr, null_yield, dpp);
  if (ret < 0) {
    return ret;
  }

  return 0;
}
-
-
/**
 * Get ordered listing of the objects in a bucket.
 *
 * max_p: maximum number of results to return
 * bucket: bucket to list contents of
 * prefix: only return results that match this prefix
 * delim: do not include results that match this string.
 *     Any skipped results will have the matching portion of their name
 *     inserted in common_prefixes with a "true" mark.
 * marker: if filled in, begin the listing with this object.
 * end_marker: if filled in, end the listing with this object.
 * result: the objects are put in here.
 * common_prefixes: if delim is filled in, any matching prefixes are
 * placed here.
 * is_truncated: if number of objects in the bucket is bigger than
 * max, then truncated.
 *
 * NOTE(review): the function returns 0 even when the attempt loop
 * bails out via the forward-progress check below; callers then see a
 * short (possibly empty) listing with *is_truncated still true.
 */
int RGWRados::Bucket::List::list_objects_ordered(
  const DoutPrefixProvider *dpp,
  int64_t max_p,
  std::vector<rgw_bucket_dir_entry> *result,
  std::map<std::string, bool> *common_prefixes,
  bool *is_truncated,
  optional_yield y)
{
  RGWRados *store = target->get_store();
  CephContext *cct = store->ctx();
  int shard_id = target->get_shard_id();
  const auto& current_index = target->get_bucket_info().layout.current_index;

  int count = 0;
  bool truncated = true;
  bool cls_filtered = false;
  const int64_t max = // protect against memory issues and negative vals
    std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
  int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);

  result->clear();

  // use a local marker; either the marker will have a previous entry
  // or it will be empty; either way it's OK to copy
  rgw_obj_key marker_obj(params.marker.name,
                        params.marker.instance,
                        params.ns.empty() ? params.marker.ns : params.ns);
  rgw_obj_index_key cur_marker;
  marker_obj.get_index_key(&cur_marker);

  rgw_obj_key end_marker_obj(params.end_marker.name,
                            params.end_marker.instance,
                            params.ns.empty() ? params.end_marker.ns : params.ns);
  rgw_obj_index_key cur_end_marker;
  end_marker_obj.get_index_key(&cur_end_marker);
  const bool cur_end_marker_valid = !params.end_marker.empty();

  rgw_obj_key prefix_obj(params.prefix);
  prefix_obj.set_ns(params.ns);
  std::string cur_prefix = prefix_obj.get_index_key_name();
  std::string after_delim_s; /* needed in !params.delim.empty() AND later */

  if (!params.delim.empty()) {
    after_delim_s = cls_rgw_after_delim(params.delim);
    /* if marker points at a common prefix, fast forward it into its
     * upper bound string */
    int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
    if (delim_pos >= 0) {
      string s = cur_marker.name.substr(0, delim_pos);
      s.append(after_delim_s);
      cur_marker = s;
    }
  }

  // we'll stop after this many attempts as long we return at least
  // one entry; but we will also go beyond this number of attempts
  // until we return at least one entry
  constexpr uint16_t SOFT_MAX_ATTEMPTS = 8;

  rgw_obj_index_key prev_marker;
  for (uint16_t attempt = 1; /* empty */; ++attempt) {
    ldpp_dout(dpp, 20) << __func__ <<
      ": starting attempt " << attempt << dendl;

    // sanity check: each attempt must advance the marker, otherwise
    // we'd loop forever re-reading the same index range
    if (attempt > 1 && !(prev_marker < cur_marker)) {
      // we've failed to make forward progress
      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
       " marker failed to make forward progress; attempt=" << attempt <<
       ", prev_marker=" << prev_marker <<
       ", cur_marker=" << cur_marker << dendl;
      break;
    }
    prev_marker = cur_marker;

    ent_map_t ent_map;
    ent_map.reserve(read_ahead);
    int r = store->cls_bucket_list_ordered(dpp,
                                           target->get_bucket_info(),
                                           current_index,
                                           shard_id,
                                          cur_marker,
                                          cur_prefix,
                                          params.delim,
                                          read_ahead + 1 - count,
                                          params.list_versions,
                                          attempt,
                                          ent_map,
                                          &truncated,
                                          &cls_filtered,
                                          &cur_marker,
                                           y,
                                          params.force_check_filter);
    if (r < 0) {
      return r;
    }

    for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
      rgw_bucket_dir_entry& entry = eiter->second;
      rgw_obj_index_key index_key = entry.key;
      rgw_obj_key obj(index_key);

      ldpp_dout(dpp, 20) << __func__ <<
       ": considering entry " << entry.key << dendl;

      /* note that parse_raw_oid() here will not set the correct
       * object's instance, as rgw_obj_index_key encodes that
       * separately. We don't need to set the instance because it's
       * not needed for the checks here and we end up using the raw
       * entry for the return vector
       */
      bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
      if (!valid) {
        ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
         " could not parse object name: " << obj.name << dendl;
        continue;
      }

      bool matched_ns = (obj.ns == params.ns);
      if (!params.list_versions && !entry.is_visible()) {
        ldpp_dout(dpp, 10) << __func__ <<
         ": skipping not visible entry \"" << entry.key << "\"" << dendl;
        continue;
      }

      if (params.enforce_ns && !matched_ns) {
        if (!params.ns.empty()) {
          /* we've iterated past the namespace we're searching -- done now */
          truncated = false;
         ldpp_dout(dpp, 10) << __func__ <<
           ": finished due to getting past requested namespace \"" <<
           params.ns << "\"" << dendl;
          goto done;
        }

        /* we're skipping past namespaced objects */
       ldpp_dout(dpp, 20) << __func__ <<
         ": skipping past namespaced objects, including \"" << entry.key <<
         "\"" << dendl;
        continue;
      }

      // NOTE(review): "gitting" in the message below should read
      // "hitting" -- runtime string, left untouched here
      if (cur_end_marker_valid && cur_end_marker <= index_key) {
        truncated = false;
       ldpp_dout(dpp, 10) << __func__ <<
         ": finished due to gitting end marker of \"" << cur_end_marker <<
         "\" with \"" << entry.key << "\"" << dendl;
        goto done;
      }

      if (count < max) {
       params.marker = index_key;
       next_marker = index_key;
      }

      // NOTE(review): the log text below looks copy-pasted from the
      // namespace-skip branch above; this branch actually skips entries
      // rejected by access_list_filter
      if (params.access_list_filter &&
         ! params.access_list_filter->filter(obj.name, index_key.name)) {
       ldpp_dout(dpp, 20) << __func__ <<
         ": skipping past namespaced objects, including \"" << entry.key <<
         "\"" << dendl;
        continue;
      }

      if (params.prefix.size() &&
         0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
       ldpp_dout(dpp, 20) << __func__ <<
         ": skipping object \"" << entry.key <<
         "\" that doesn't match prefix \"" << params.prefix << "\"" << dendl;
        continue;
      }

      if (!params.delim.empty()) {
       const int delim_pos = obj.name.find(params.delim, params.prefix.size());
       if (delim_pos >= 0) {
         // run either the code where delimiter filtering is done a)
         // in the OSD/CLS or b) here.
         if (cls_filtered) {
           // NOTE: this condition is for the newer versions of the
           // OSD that does filtering on the CLS side should only
           // find one delimiter at the end if it finds any after the
           // prefix
           if (delim_pos !=
               int(obj.name.length() - params.delim.length())) {
             ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
               " found delimiter in place other than the end of "
               "the prefix; obj.name=" << obj.name <<
               ", prefix=" << params.prefix << dendl;
           }
           if (common_prefixes) {
             if (count >= max) {
               truncated = true;
               ldpp_dout(dpp, 10) << __func__ <<
                 ": stopping early with common prefix \"" << entry.key <<
                 "\" because requested number (" << max <<
                 ") reached (cls filtered)" << dendl;
               goto done;
             }

             (*common_prefixes)[obj.name] = true;
             count++;
           }

           ldpp_dout(dpp, 20) << __func__ <<
             ": finished entry with common prefix \"" << entry.key <<
             "\" so continuing loop (cls filtered)" << dendl;
           continue;
         } else {
           // NOTE: this condition is for older versions of the OSD
           // that do not filter on the CLS side, so the following code
           // must do the filtering; once we reach version 16 of ceph,
           // this code can be removed along with the conditional that
           // can lead this way

           /* extract key -with trailing delimiter- for CommonPrefix */
           string prefix_key =
             obj.name.substr(0, delim_pos + params.delim.length());

           if (common_prefixes &&
               common_prefixes->find(prefix_key) == common_prefixes->end()) {
             if (count >= max) {
               truncated = true;
               ldpp_dout(dpp, 10) << __func__ <<
                 ": stopping early with common prefix \"" << entry.key <<
                 "\" because requested number (" << max <<
                 ") reached (not cls filtered)" << dendl;
               goto done;
             }
             next_marker = prefix_key;
             (*common_prefixes)[prefix_key] = true;

             count++;
           }

           ldpp_dout(dpp, 20) << __func__ <<
             ": finished entry with common prefix \"" << entry.key <<
             "\" so continuing loop (not cls filtered)" << dendl;
           continue;
         } // if we're running an older OSD version
       } // if a delimiter was found after prefix
      } // if a delimiter was passed in

      if (count >= max) {
        truncated = true;
       ldpp_dout(dpp, 10) << __func__ <<
         ": stopping early with entry \"" << entry.key <<
         "\" because requested number (" << max <<
         ") reached" << dendl;
        goto done;
      }

      ldpp_dout(dpp, 20) << __func__ <<
       ": adding entry " << entry.key << " to result" << dendl;

      result->emplace_back(std::move(entry));
      count++;
    } // eiter for loop

    // NOTE: the following conditional is needed by older versions of
    // the OSD that don't do delimiter filtering on the CLS side; once
    // we reach version 16 of ceph, the following conditional and the
    // code within can be removed
    if (!cls_filtered && !params.delim.empty()) {
      int marker_delim_pos =
       cur_marker.name.find(params.delim, cur_prefix.size());
      if (marker_delim_pos >= 0) {
       std::string skip_after_delim =
         cur_marker.name.substr(0, marker_delim_pos);
        skip_after_delim.append(after_delim_s);

        ldpp_dout(dpp, 20) << __func__ <<
         ": skip_after_delim=" << skip_after_delim << dendl;

        if (skip_after_delim > cur_marker.name) {
          cur_marker = skip_after_delim;
          ldpp_dout(dpp, 20) << __func__ <<
           ": setting cur_marker=" << cur_marker.name <<
           "[" << cur_marker.instance << "]" << dendl;
        }
      }
    } // if older osd didn't do delimiter filtering

    ldpp_dout(dpp, 10) << __func__ <<
      ": end of outer loop, truncated=" << truncated <<
      ", count=" << count << ", attempt=" << attempt << dendl;

    if (!truncated || count >= (max + 1) / 2) {
      // if we finished listing, or if we're returning at least half the
      // requested entries, that's enough; S3 and swift protocols allow
      // returning fewer than max entries
      ldpp_dout(dpp, 10) << __func__ <<
       ": exiting attempt loop because we reached end (" << truncated <<
       ") or we're returning half the requested entries (" << count <<
       " of " << max << ")" << dendl;
      break;
    } else if (attempt > SOFT_MAX_ATTEMPTS && count >= 1) {
      // if we've made at least 8 attempts and we have some, but very
      // few, results, return with what we have
      ldpp_dout(dpp, 10) << __func__ <<
       ": exiting attempt loop because we made " << attempt <<
       " attempts and we're returning " << count << " entries" << dendl;
      break;
    }
  } // for (uint16_t attempt...

done:

  if (is_truncated) {
    *is_truncated = truncated;
  }

  return 0;
} // list_objects_ordered
-
-
-/**
- * Get listing of the objects in a bucket and allow the results to be out
- * of order.
- *
- * Even though there are key differences with the ordered counterpart,
- * the parameters are the same to maintain some compatability.
- *
- * max: maximum number of results to return
- * bucket: bucket to list contents of
- * prefix: only return results that match this prefix
- * delim: should not be set; if it is we should have indicated an error
- * marker: if filled in, begin the listing with this object.
- * end_marker: if filled in, end the listing with this object.
- * result: the objects are put in here.
- * common_prefixes: this is never filled with an unordered list; the param
- *                  is maintained for compatibility
- * is_truncated: if number of objects in the bucket is bigger than max, then
- *               truncated.
- */
-int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp,
-                                                   int64_t max_p,
-                                                  std::vector<rgw_bucket_dir_entry>* result,
-                                                  std::map<std::string, bool>* common_prefixes,
-                                                  bool* is_truncated,
-                                                   optional_yield y)
-{
-  RGWRados *store = target->get_store();
-  int shard_id = target->get_shard_id();
-  const auto& current_index = target->get_bucket_info().layout.current_index;
-
-  int count = 0;
-  bool truncated = true;
-
-  const int64_t max = // protect against memory issues and negative vals
-    std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
-
-  // read a few extra in each call to cls_bucket_list_unordered in
-  // case some are filtered out due to namespace matching, versioning,
-  // filtering, etc.
-  const int64_t max_read_ahead = 100;
-  const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
-
-  result->clear();
-
-  // use a local marker; either the marker will have a previous entry
-  // or it will be empty; either way it's OK to copy
-  rgw_obj_key marker_obj(params.marker.name,
-                        params.marker.instance,
-                        params.ns.empty() ? params.marker.ns : params.ns);
-  rgw_obj_index_key cur_marker;
-  marker_obj.get_index_key(&cur_marker);
-
-  rgw_obj_key end_marker_obj(params.end_marker.name,
-                            params.end_marker.instance,
-                            params.ns.empty() ? params.end_marker.ns : params.ns);
-  rgw_obj_index_key cur_end_marker;
-  end_marker_obj.get_index_key(&cur_end_marker);
-  const bool cur_end_marker_valid = !params.end_marker.empty();
-
-  rgw_obj_key prefix_obj(params.prefix);
-  prefix_obj.set_ns(params.ns);
-  std::string cur_prefix = prefix_obj.get_index_key_name();
-
-  while (truncated && count <= max) {
-    std::vector<rgw_bucket_dir_entry> ent_list;
-    ent_list.reserve(read_ahead);
-
-    int r = store->cls_bucket_list_unordered(dpp,
-                                             target->get_bucket_info(),
-                                             current_index,
-                                             shard_id,
-                                            cur_marker,
-                                            cur_prefix,
-                                            read_ahead,
-                                            params.list_versions,
-                                            ent_list,
-                                            &truncated,
-                                            &cur_marker,
-                                             y);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-       " cls_bucket_list_unordered returned " << r << " for " <<
-       target->get_bucket_info().bucket << dendl;
-      return r;
-    }
-
-    // NB: while regions of ent_list will be sorted, we have no
-    // guarantee that all items will be sorted since they can cross
-    // shard boundaries
-
-    for (auto& entry : ent_list) {
-      rgw_obj_index_key index_key = entry.key;
-      rgw_obj_key obj(index_key);
-
-      if (count < max) {
-       params.marker.set(index_key);
-       next_marker.set(index_key);
-      }
-
-      /* note that parse_raw_oid() here will not set the correct
-       * object's instance, as rgw_obj_index_key encodes that
-       * separately. We don't need to set the instance because it's
-       * not needed for the checks here and we end up using the raw
-       * entry for the return vector
-       */
-      bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
-      if (!valid) {
-        ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-         " could not parse object name: " << obj.name << dendl;
-        continue;
-      }
-
-      if (!params.list_versions && !entry.is_visible()) {
-        ldpp_dout(dpp, 20) << __func__ <<
-         ": skippping \"" << index_key <<
-         "\" because not listing versions and entry not visibile" << dendl;
-        continue;
-      }
-
-      if (params.enforce_ns && obj.ns != params.ns) {
-        ldpp_dout(dpp, 20) << __func__ <<
-         ": skippping \"" << index_key <<
-         "\" because namespace does not match" << dendl;
-        continue;
-      }
-
-      if (cur_end_marker_valid && cur_end_marker <= index_key) {
-       // we're not guaranteed items will come in order, so we have
-       // to loop through all
-        ldpp_dout(dpp, 20) << __func__ <<
-         ": skippping \"" << index_key <<
-         "\" because after end_marker" << dendl;
-       continue;
-      }
-
-      if (params.access_list_filter &&
-         !params.access_list_filter->filter(obj.name, index_key.name)) {
-        ldpp_dout(dpp, 20) << __func__ <<
-         ": skippping \"" << index_key <<
-         "\" because doesn't match filter" << dendl;
-        continue;
-      }
-
-      if (params.prefix.size() &&
-         (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) {
-        ldpp_dout(dpp, 20) << __func__ <<
-         ": skippping \"" << index_key <<
-         "\" because doesn't match prefix" << dendl;
-       continue;
-      }
-
-      if (count >= max) {
-        truncated = true;
-        goto done;
-      }
-
-      result->emplace_back(std::move(entry));
-      count++;
-    } // for (auto& entry : ent_list)
-  } // while (truncated && count <= max)
-
-done:
-
-  if (is_truncated) {
-    *is_truncated = truncated;
-  }
-
-  return 0;
-} // list_objects_unordered
-
-
-/**
- * create a rados pool, associated meta info
- * returns 0 on success, -ERR# otherwise.
- */
-int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool)
-{
-  librados::IoCtx io_ctx;
-  constexpr bool create = true;
-  return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create);
-}
-
-void RGWRados::create_bucket_id(string *bucket_id)
-{
-  uint64_t iid = instance_id();
-  uint64_t bid = next_bucket_id();
-  char buf[svc.zone->get_zone_params().get_id().size() + 48];
-  snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
-           svc.zone->get_zone_params().get_id().c_str(), iid, bid);
-  *bucket_id = buf;
-}
-
/**
 * Create a bucket: select a placement, allocate a bucket id, initialize
 * the bucket index, and store/link the bucket instance info.
 *
 * The whole sequence is retried a bounded number of times because a
 * concurrent create/remove of the same bucket can race us at the
 * put_linked_bucket_info() step (surfacing as -ECANCELED/-EEXIST).
 * On -EEXIST the existing instance info is re-read into @info so the
 * caller can use it; -EEXIST is still returned.
 */
int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
                            const string& zonegroup_id,
                            const rgw_placement_rule& placement_rule,
                            const string& swift_ver_location,
                            const RGWQuotaInfo * pquota_info,
			    map<std::string, bufferlist>& attrs,
                            RGWBucketInfo& info,
                            obj_version *pobjv,
                            obj_version *pep_objv,
                            real_time creation_time,
                            rgw_bucket *pmaster_bucket,
                            uint32_t *pmaster_num_shards,
			    optional_yield y,
                            const DoutPrefixProvider *dpp,
			    bool exclusive)
{
#define MAX_CREATE_RETRIES 20 /* need to bound retries */
  rgw_placement_rule selected_placement_rule;
  RGWZonePlacementInfo rule_info;

  for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
    int ret = 0;
    // resolve the effective placement rule for this owner/zonegroup
    ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule,
                                            &selected_placement_rule, &rule_info, y);
    if (ret < 0)
      return ret;

    // on a multisite secondary the master supplies marker/id; otherwise
    // mint a fresh bucket id locally
    if (!pmaster_bucket) {
      create_bucket_id(&bucket.marker);
      bucket.bucket_id = bucket.marker;
    } else {
      bucket.marker = pmaster_bucket->marker;
      bucket.bucket_id = pmaster_bucket->bucket_id;
    }

    RGWObjVersionTracker& objv_tracker = info.objv_tracker;

    objv_tracker.read_version.clear();

    if (pobjv) {
      objv_tracker.write_version = *pobjv;
    } else {
      objv_tracker.generate_new_write_ver(cct);
    }

    // populate the new bucket instance info
    info.bucket = bucket;
    info.owner = owner.user_id;
    info.zonegroup = zonegroup_id;
    info.placement_rule = selected_placement_rule;
    info.swift_ver_location = swift_ver_location;
    info.swift_versioning = (!swift_ver_location.empty());

    init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(),
			       pmaster_num_shards ?
			       std::optional{*pmaster_num_shards} :
			       std::nullopt,
			       rule_info.index_type);

    info.requester_pays = false;
    if (real_clock::is_zero(creation_time)) {
      info.creation_time = ceph::real_clock::now();
    } else {
      info.creation_time = creation_time;
    }
    if (pquota_info) {
      info.quota = *pquota_info;
    }

    // create the bucket index objects before linking the bucket
    int r = svc.bi->init_index(dpp, info, info.layout.current_index);
    if (r < 0) {
      return r;
    }

    ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp);
    if (ret == -ECANCELED) {
      ret = -EEXIST;
    }
    if (ret == -EEXIST) {
       /* we need to reread the info and return it, caller will have a use for it */
      RGWBucketInfo orig_info;
      r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
      if (r < 0) {
        if (r == -ENOENT) {
          // the competing bucket vanished in the meantime; retry
          continue;
        }
        ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl;
        return r;
      }

      /* only remove it if it's a different bucket instance */
      if (orig_info.bucket.bucket_id != bucket.bucket_id) {
	int r = svc.bi->clean_index(dpp, info, info.layout.current_index);
	if (r < 0) {
	  ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
	}
        r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp);
        if (r < 0) {
          ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
          /* continue anyway */
        }
      }

      // hand the pre-existing instance info back to the caller
      info = std::move(orig_info);
      /* ret == -EEXIST here */
    }
    return ret;
  }

  /* this is highly unlikely */
  ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
  return -ENOENT;
}
-
// Translate a bucket-relative object into its raw (pool, oid, locator)
// form. Returns false when no data pool maps to the placement rule.
bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
{
  get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);

  return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
}
-
// Return the backing RADOS cluster's fsid.
// dpp and y are unused here; kept for interface consistency with callers.
std::string RGWRados::get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y)
{
  return svc.rados->cluster_fsid();
}
-
-int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp,
-                                const RGWBucketInfo& bucket_info,
-                                const rgw_obj& obj,
-                                librados::IoCtx *ioctx)
-{
-  std::string oid, key;
-  get_obj_bucket_and_oid_loc(obj, oid, key);
-
-  rgw_pool pool;
-  if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
-    ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj <<
-      ", probably misconfiguration" << dendl;
-    return -EIO;
-  }
-
-  int r = open_pool_ctx(dpp, pool, *ioctx, false);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: unable to open data-pool=" << pool.to_str() <<
-      " for obj=" << obj << " with error-code=" << r << dendl;
-    return r;
-  }
-
-  ioctx->locator_set_key(key);
-
-  return 0;
-}
-
-int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
-                               const rgw_placement_rule& target_placement_rule,
-                               const rgw_obj& obj,
-                               rgw_rados_ref *ref)
-{
-  get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
-
-  rgw_pool pool;
-  if (!get_obj_data_pool(target_placement_rule, obj, &pool)) {
-    ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
-    return -EIO;
-  }
-
-  ref->pool = svc.rados->pool(pool);
-
-  int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
-                         .set_mostly_omap(false));
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
-    return r;
-  }
-
-  ref->pool.ioctx().locator_set_key(ref->obj.loc);
-
-  return 0;
-}
-
// Convenience overload: resolve the head reference using the bucket's
// own placement rule.
int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
                               const RGWBucketInfo& bucket_info,
                               const rgw_obj& obj,
                               rgw_rados_ref *ref)
{
  return get_obj_head_ref(dpp, bucket_info.placement_rule, obj, ref);
}
-
-int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
-{
-  ref->obj = obj;
-
-  if (ref->obj.oid.empty()) {
-    ref->obj.oid = obj.pool.to_str();
-    ref->obj.pool = svc.zone->get_zone_params().domain_root;
-  }
-  ref->pool = svc.rados->pool(obj.pool);
-  int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
-                         .set_mostly_omap(false));
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
-    return r;
-  }
-
-  ref->pool.ioctx().locator_set_key(ref->obj.loc);
-
-  return 0;
-}
-
// System objects are plain raw objects; the same resolution applies.
int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
{
  return get_raw_obj_ref(dpp, obj, ref);
}
-
-/*
- * fixes an issue where head objects were supposed to have a locator created, but ended
- * up without one
- */
-int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
-{
-  const rgw_bucket& bucket = bucket_info.bucket;
-  string oid;
-  string locator;
-
-  rgw_obj obj(bucket, key);
-
-  get_obj_bucket_and_oid_loc(obj, oid, locator);
-
-  if (locator.empty()) {
-    ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl;
-    return 0;
-  }
-
-  librados::IoCtx ioctx;
-
-  int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx);
-  if (ret < 0) {
-    cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
-    return ret;
-  }
-  ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
-
-  uint64_t size;
-  bufferlist data;
-
-  struct timespec mtime_ts;
-  map<string, bufferlist> attrs;
-  librados::ObjectReadOperation op;
-  op.getxattrs(&attrs, NULL);
-  op.stat2(&size, &mtime_ts, NULL);
-#define HEAD_SIZE 512 * 1024
-  op.read(0, HEAD_SIZE, &data, NULL);
-
-  ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  if (size > HEAD_SIZE) {
-    ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
-    return -EIO;
-  }
-
-  if (size != data.length()) {
-    ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
-    return -EIO;
-  }
-
-  if (copy_obj) {
-    librados::ObjectWriteOperation wop;
-
-    wop.mtime2(&mtime_ts);
-
-    map<string, bufferlist>::iterator iter;
-    for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
-      wop.setxattr(iter->first.c_str(), iter->second);
-    }
-
-    wop.write(0, data);
-
-    ioctx.locator_set_key(locator);
-    rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield);
-  }
-
-  if (remove_bad) {
-    ioctx.locator_set_key(string());
-
-    ret = ioctx.remove(oid);
-    if (ret < 0) {
-      ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl;
-      return ret;
-    }
-  }
-
-  return 0;
-}
-
-int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp,
-                             librados::IoCtx& src_ioctx,
-                            const string& src_oid, const string& src_locator,
-                            librados::IoCtx& dst_ioctx,
-                            const string& dst_oid, const string& dst_locator)
-{
-
-#define COPY_BUF_SIZE (4 * 1024 * 1024)
-  bool done = false;
-  uint64_t chunk_size = COPY_BUF_SIZE;
-  uint64_t ofs = 0;
-  int ret = 0;
-  real_time mtime;
-  struct timespec mtime_ts;
-  uint64_t size;
-
-  if (src_oid == dst_oid && src_locator == dst_locator) {
-    return 0;
-  }
-
-  src_ioctx.locator_set_key(src_locator);
-  dst_ioctx.locator_set_key(dst_locator);
-
-  do {
-    bufferlist data;
-    ObjectReadOperation rop;
-    ObjectWriteOperation wop;
-
-    if (ofs == 0) {
-      rop.stat2(&size, &mtime_ts, NULL);
-      mtime = real_clock::from_timespec(mtime_ts);
-    }
-    rop.read(ofs, chunk_size, &data, NULL);
-    ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield);
-    if (ret < 0) {
-      goto done_err;
-    }
-
-    if (data.length() == 0) {
-      break;
-    }
-
-    if (ofs == 0) {
-      wop.create(true); /* make it exclusive */
-      wop.mtime2(&mtime_ts);
-      mtime = real_clock::from_timespec(mtime_ts);
-    }
-    wop.write(ofs, data);
-    ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield);
-    if (ret < 0) {
-      goto done_err;
-    }
-    ofs += data.length();
-    done = data.length() != chunk_size;
-  } while (!done);
-
-  if (ofs != size) {
-    ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
-               << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
-    ret = -EIO;
-    goto done_err;
-  }
-
-  src_ioctx.remove(src_oid);
-
-  return 0;
-
-done_err:
-  // TODO: clean up dst_oid if we created it
-  ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
-  return ret;
-}
-
/*
 * fixes an issue where head objects were supposed to have a locator created, but ended
 * up without one
 *
 * Walks the object's manifest, and for each tail object that cannot be
 * found under its correct locator, checks whether it exists under the
 * legacy "bad" locator (bucket-marker-prefixed name). Sets *need_fix
 * when a broken part is found; when @fix is set, moves the part to the
 * correct locator via move_rados_obj().
 */
int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp,
                                   RGWBucketInfo& bucket_info, rgw_obj_key& key,
                                   bool fix, bool *need_fix, optional_yield y)
{
  std::unique_ptr<rgw::sal::Bucket> bucket;
  driver->get_bucket(nullptr, bucket_info, &bucket);
  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);

  if (need_fix) {
    *need_fix = false;
  }

  rgw_rados_ref ref;
  int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
  if (r < 0) {
    return r;
  }

  RGWObjState *astate = nullptr;
  RGWObjManifest* manifest = nullptr;
  RGWObjectCtx rctx(this->driver);
  r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y);
  if (r < 0)
    return r;

  if (manifest) {
    RGWObjManifest::obj_iterator miter;
    for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
      rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(driver);
      rgw_obj loc;
      string oid;
      string locator;

      RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_tail_placement().bucket, raw_loc, &loc);

      if (loc.key.ns.empty()) {
	/* continue, we're only interested in tail objects */
	continue;
      }

      // NOTE: ioctx aliases ref.pool.ioctx(); the locator_set_key below
      // applies to the same underlying context
      auto& ioctx = ref.pool.ioctx();

      get_obj_bucket_and_oid_loc(loc, oid, locator);
      ref.pool.ioctx().locator_set_key(locator);

      ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;

      // part present under the correct locator: nothing to fix
      r = ioctx.stat(oid, NULL, NULL);
      if (r != -ENOENT) {
	continue;
      }

      string bad_loc;
      prepend_bucket_marker(bucket->get_key(), loc.key.name, bad_loc);

      /* create a new ioctx with the bad locator */
      librados::IoCtx src_ioctx;
      src_ioctx.dup(ioctx);
      src_ioctx.locator_set_key(bad_loc);

      r = src_ioctx.stat(oid, NULL, NULL);
      if (r != 0) {
	/* cannot find a broken part */
	continue;
      }
      ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl;
      if (need_fix) {
        *need_fix = true;
      }
      if (fix) {
        r = move_rados_obj(dpp, src_ioctx, oid, bad_loc, ioctx, oid, locator);
        if (r < 0) {
          ldpp_dout(dpp, -1) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
        }
      }
    }
  }

  return 0;
}
-
-int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
-                               const rgw_obj& obj,
-                               RGWBucketInfo* bucket_info_out,
-                                const DoutPrefixProvider *dpp)
-{
-  bucket = _bucket;
-
-  RGWBucketInfo bucket_info;
-  RGWBucketInfo* bucket_info_p =
-    bucket_info_out ? bucket_info_out : &bucket_info;
-
-  int ret = store->get_bucket_instance_info(bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  string oid;
-
-  ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
-    return ret;
-  }
-  ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
-
-  return 0;
-}
-
-int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
-                                const rgw_obj& obj)
-{
-  bucket = bucket_info.bucket;
-
-  int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info,
-                                                        obj.get_hash_object(),
-                                                        &bucket_obj,
-                                                        &shard_id);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
-    return ret;
-  }
-  ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
-
-  return 0;
-}
-
-int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp,
-                               const RGWBucketInfo& bucket_info,
-                                const rgw::bucket_index_layout_generation& index,
-                                int sid)
-{
-  bucket = bucket_info.bucket;
-  shard_id = sid;
-
-  int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, shard_id,
-                                                         num_shards(index), index.gen,
-                                                         &bucket_obj);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
-    return ret;
-  }
-  ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
-
-  return 0;
-}
-
-
-/* Execute @handler on last item in bucket listing for bucket specified
- * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
- * to objects matching these criterias. */
-int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp,
-                                       RGWBucketInfo& bucket_info,
-                                       const std::string& obj_prefix,
-                                       const std::string& obj_delim,
-                                       std::function<int(const rgw_bucket_dir_entry&)> handler)
-{
-  RGWRados::Bucket target(this, bucket_info);
-  RGWRados::Bucket::List list_op(&target);
-
-  list_op.params.prefix = obj_prefix;
-  list_op.params.delim = obj_delim;
-
-  ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
-                 << ", obj_prefix=" << obj_prefix
-                 << ", obj_delim=" << obj_delim
-                 << dendl;
-
-  bool is_truncated = false;
-
-  boost::optional<rgw_bucket_dir_entry> last_entry;
-  /* We need to rewind to the last object in a listing. */
-  do {
-    /* List bucket entries in chunks. */
-    static constexpr int MAX_LIST_OBJS = 100;
-    std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
-
-    int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
-                                   &is_truncated, null_yield);
-    if (ret < 0) {
-      return ret;
-    } else if (!entries.empty()) {
-      last_entry = entries.back();
-    }
-  } while (is_truncated);
-
-  if (last_entry) {
-    return handler(*last_entry);
-  }
-
-  /* Empty listing - no items we can run handler on. */
-  return 0;
-}
-
-bool RGWRados::swift_versioning_enabled(rgw::sal::Bucket* bucket) const
-{
-  return bucket->get_info().has_swift_versioning() &&
-    bucket->get_info().swift_ver_location.size();
-}
-
// Archive the current version of @obj into the bucket's Swift
// X-Versions-Location bucket before it is overwritten/deleted.
// No-op when versioning is disabled or the object doesn't exist.
// Returns -ERR_PRECONDITION_FAILED when the archive bucket is missing
// or owned by someone else.
int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
                                    const rgw_user& user,
                                    rgw::sal::Bucket* bucket,
                                    rgw::sal::Object* obj,
                                    const DoutPrefixProvider *dpp,
                                    optional_yield y)
{
  if (! swift_versioning_enabled(bucket)) {
    return 0;
  }

  obj->set_atomic();

  RGWObjState * state = nullptr;
  RGWObjManifest *manifest = nullptr;
  int r = get_obj_state(dpp, &obj_ctx, bucket->get_info(), obj, &state, &manifest, false, y);
  if (r < 0) {
    return r;
  }

  // nothing to archive if the object doesn't currently exist
  if (!state->exists) {
    return 0;
  }

  // Swift archive naming convention: "<3-hex name length><name>/<ts>"
  const string& src_name = obj->get_oid();
  char buf[src_name.size() + 32];
  struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
  snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
           src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);

  RGWBucketInfo dest_bucket_info;

  r = get_bucket_info(&svc, bucket->get_tenant(), bucket->get_info().swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
  if (r < 0) {
    ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl;
    if (r == -ENOENT) {
      return -ERR_PRECONDITION_FAILED;
    }
    return r;
  }

  // the archive bucket must belong to the same owner
  if (dest_bucket_info.owner != bucket->get_info().owner) {
    return -ERR_PRECONDITION_FAILED;
  }

  rgw::sal::RadosBucket dest_bucket(driver, dest_bucket_info);
  rgw::sal::RadosObject dest_obj(driver, rgw_obj_key(buf), &dest_bucket);

  if (dest_bucket_info.versioning_enabled()){
    dest_obj.gen_rand_obj_instance_name();
  }

  dest_obj.set_atomic();

  rgw_zone_id no_zone;

  r = copy_obj(obj_ctx,
               user,
               NULL, /* req_info *info */
               no_zone,
               &dest_obj,
               obj,
               &dest_bucket,
               bucket,
               bucket->get_placement_rule(),
               NULL, /* time_t *src_mtime */
               NULL, /* time_t *mtime */
               NULL, /* const time_t *mod_ptr */
               NULL, /* const time_t *unmod_ptr */
               false, /* bool high_precision_time */
               NULL, /* const char *if_match */
               NULL, /* const char *if_nomatch */
               RGWRados::ATTRSMOD_NONE,
               true, /* bool copy_if_newer */
               state->attrset,
               RGWObjCategory::Main,
               0, /* uint64_t olh_epoch */
               real_time(), /* time_t delete_at */
               NULL, /* string *version_id */
               NULL, /* string *ptag */
               NULL, /* string *petag */
               NULL, /* void (*progress_cb)(off_t, void *) */
               NULL, /* void *progress_data */
               dpp,
               null_yield);
  if (r == -ECANCELED || r == -ENOENT) {
    /* Has already been overwritten, meaning another rgw process already
     * copied it out */
    return 0;
  }

  return r;
}
-
/*
 * Swift-style object versioning: restore the newest archived copy of `obj`
 * from the bucket's configured swift_ver_location bucket, then delete that
 * archived copy.  `restored` (out) reports whether a copy actually happened.
 * Returns 0 immediately (no-op) when Swift versioning is not enabled on
 * `bucket`; -EPERM when the archive bucket has a different owner.
 */
int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
                                       const rgw_user& user,
                                       rgw::sal::Bucket* bucket,
                                       rgw::sal::Object* obj,
                                       bool& restored,                  /* out */
                                       const DoutPrefixProvider *dpp)
{
  if (! swift_versioning_enabled(bucket)) {
    return 0;
  }

  /* Bucket info of the bucket that stores previous versions of our object. */
  RGWBucketInfo archive_binfo;

  int ret = get_bucket_info(&svc, bucket->get_tenant(),
                            bucket->get_info().swift_ver_location,
                           archive_binfo, nullptr, null_yield, nullptr);
  if (ret < 0) {
    return ret;
  }

  /* Abort the operation if the bucket storing our archive belongs to someone
   * else. This is a limitation in comparison to Swift as we aren't taking ACLs
   * into consideration. For we can live with that.
   *
   * TODO: delegate this check to un upper layer and compare with ACLs. */
  if (bucket->get_info().owner != archive_binfo.owner) {
    return -EPERM;
  }

  /* This code will be executed on latest version of the object. */
  const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
    rgw_zone_id no_zone;

    /* We don't support object versioning of Swift API on those buckets that
     * are already versioned using the S3 mechanism. This affects also bucket
     * storing archived objects. Otherwise the delete operation would create
     * a deletion marker. */
    if (archive_binfo.versioned()) {
      restored = false;
      return -ERR_PRECONDITION_FAILED;
    }

    /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
     * irrelevant and may be safely skipped. */
    std::map<std::string, ceph::bufferlist> no_attrs;

    rgw::sal::RadosBucket archive_bucket(driver, archive_binfo);
    rgw::sal::RadosObject archive_obj(driver, entry.key, &archive_bucket);

    /* On an S3-versioned destination the restore creates a new version
     * instead of overwriting the current one. */
    if (bucket->versioning_enabled()){
      obj->gen_rand_obj_instance_name();
    }

    archive_obj.set_atomic();
    obj->set_atomic();

    /* Copy the archived object back to its original location; most optional
     * parameters are unused (see the trailing per-argument comments). */
    int ret = copy_obj(obj_ctx,
                       user,
                       nullptr,       /* req_info *info */
                       no_zone,
                       obj,           /* dest obj */
                       &archive_obj,   /* src obj */
                       bucket,   /* dest bucket info */
                       &archive_bucket, /* src bucket info */
                       bucket->get_placement_rule(),  /* placement_rule */
                       nullptr,       /* time_t *src_mtime */
                       nullptr,       /* time_t *mtime */
                       nullptr,       /* const time_t *mod_ptr */
                       nullptr,       /* const time_t *unmod_ptr */
                       false,         /* bool high_precision_time */
                       nullptr,       /* const char *if_match */
                       nullptr,       /* const char *if_nomatch */
                       RGWRados::ATTRSMOD_NONE,
                       true,          /* bool copy_if_newer */
                       no_attrs,
                       RGWObjCategory::Main,
                       0,             /* uint64_t olh_epoch */
                       real_time(),   /* time_t delete_at */
                       nullptr,       /* string *version_id */
                       nullptr,       /* string *ptag */
                       nullptr,       /* string *petag */
                       nullptr,       /* void (*progress_cb)(off_t, void *) */
                       nullptr,       /* void *progress_data */
                       dpp,
                       null_yield);
    if (ret == -ECANCELED || ret == -ENOENT) {
      /* Has already been overwritten, meaning another rgw process already
       * copied it out */
      return 0;
    } else if (ret < 0) {
      return ret;
    } else {
      restored = true;
    }

    /* Need to remove the archived copy. */
    ret = delete_obj(dpp, archive_binfo, &archive_obj,
                     archive_binfo.versioning_status());

    return ret;
  };

  /* Archive object names are built as "%03x%s": hex-encoded name length
   * followed by the name.  List under that prefix and invoke the handler on
   * the last entry -- presumably the newest archived version (TODO confirm
   * ordering against on_last_entry_in_listing). */
  const std::string& obj_name = obj->get_oid();
  const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
                                                         % obj_name);

  return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(),
                                  handler);
}
-
/*
 * Core head-object write.  Builds one librados ObjectWriteOperation carrying
 * the (optional) inline data, xattrs, manifest, retention and OLH tags,
 * brackets it with bucket-index prepare/complete (cancel on failure), and
 * finally updates quota stats and the object-expirer hint.
 *
 * `assume_noent` turns the write into create-exclusive semantics: -EEXIST is
 * returned if the object already exists (the caller retries without the
 * assumption -- see write_meta()).  `_index_op` must point at a
 * RGWRados::Bucket::UpdateIndex; it is passed as void* by the caller.
 */
int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
                                           uint64_t size, uint64_t accounted_size,
                                           map<string, bufferlist>& attrs,
                                           bool assume_noent, bool modify_tail,
                                           void *_index_op, optional_yield y)
{
  RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
  RGWRados *store = target->get_store();

  ObjectWriteOperation op;
#ifdef WITH_LTTNG
  /* req_id only feeds the LTTng tracepoints below. */
  const req_state* s =  get_req_state();
  string req_id;
  if (!s) {
    // fake req_id
    req_id = store->svc.zone_utils->unique_id(store->driver->get_new_req_id());
  } else {
    req_id = s->req_id;
  }
#endif

  RGWObjState *state;
  RGWObjManifest *manifest = nullptr;
  int r = target->get_state(dpp, &state, &manifest, false, y, assume_noent);
  if (r < 0)
    return r;

  rgw_obj obj = target->get_obj();

  if (obj.get_oid().empty()) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
    return -EIO;
  }

  rgw_rados_ref ref;
  r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref);
  if (r < 0)
    return r;

  bool is_olh = state->is_olh;

  /* PUT_OBJ_CREATE means we replace the whole object (data + xattrs). */
  bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;

  /* Prefer the caller's ptag; otherwise reuse the index op's operation tag. */
  const string *ptag = meta.ptag;
  if (!ptag && !index_op->get_optag()->empty()) {
    ptag = index_op->get_optag();
  }
  r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
  if (r < 0)
    return r;

  if (real_clock::is_zero(meta.set_mtime)) {
    meta.set_mtime = real_clock::now();
  }

  /* Apply the bucket's default object-lock retention on plain creates that
   * did not bring their own retention attribute. */
  if (target->get_bucket_info().obj_lock_enabled() && target->get_bucket_info().obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
    auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
    if (iter == attrs.end()) {
      real_time lock_until_date = target->get_bucket_info().obj_lock.get_lock_until_date(meta.set_mtime);
      string mode = target->get_bucket_info().obj_lock.get_mode();
      RGWObjectRetention obj_retention(mode, lock_until_date);
      bufferlist bl;
      obj_retention.encode(bl);
      op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
    }
  }

  if (state->is_olh) {
    op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
  }

  struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
  op.mtime2(&mtime_ts);

  if (meta.data) {
    /* if we want to overwrite the data, we also want to overwrite the
       xattrs, so just remove the object */
    op.write_full(*meta.data);
    if (state->compressed) {
      uint32_t alloc_hint_flags = librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
      op.set_alloc_hint2(0, 0, alloc_hint_flags);
    }
  }

  /* Captured from the attribute scan below; forwarded to the bucket index. */
  string etag;
  string content_type;
  bufferlist acl_bl;
  string storage_class;

  map<string, bufferlist>::iterator iter;
  if (meta.rmattrs) {
    for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
      const string& name = iter->first;
      op.rmxattr(name.c_str());
    }
  }

  if (meta.manifest) {
    storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;

    /* remove existing manifest attr */
    iter = attrs.find(RGW_ATTR_MANIFEST);
    if (iter != attrs.end())
      attrs.erase(iter);

    bufferlist bl;
    encode(*meta.manifest, bl);
    op.setxattr(RGW_ATTR_MANIFEST, bl);
  }

  /* Write every non-empty xattr, remembering etag/content-type/ACL for the
   * index completion below. */
  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
    const string& name = iter->first;
    bufferlist& bl = iter->second;

    if (!bl.length())
      continue;

    op.setxattr(name.c_str(), bl);

    if (name.compare(RGW_ATTR_ETAG) == 0) {
      etag = rgw_bl_str(bl);
    } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
      content_type = rgw_bl_str(bl);
    } else if (name.compare(RGW_ATTR_ACL) == 0) {
      acl_bl = bl;
    }
  }
  if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
    cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
  }

  /* Tag the object with this zone unless the caller already supplied a
   * source zone (e.g. multisite sync). */
  if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
    bufferlist bl;
    encode(store->svc.zone->get_zone_short_id(), bl);
    op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
  }

  if (!storage_class.empty()) {
    bufferlist bl;
    bl.append(storage_class);
    op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
  }

  /* Nothing queued on the operation -- nothing to do. */
  if (!op.size())
    return 0;

  uint64_t epoch;
  int64_t poolid;
  bool orig_exists;
  uint64_t orig_size;
  
  if (!reset_obj) {    //Multipart upload, it has immutable head. 
    orig_exists = false;
    orig_size = 0;
  } else {
    orig_exists = state->exists;
    orig_size = state->accounted_size;
  }

  bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
                          !obj.key.instance.empty();

  bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);

  if (versioned_op) {
    index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
  }

  if (!index_op->is_prepared()) {
    tracepoint(rgw_rados, prepare_enter, req_id.c_str());
    r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
    tracepoint(rgw_rados, prepare_exit, req_id.c_str());
    if (r < 0)
      return r;
  }

  auto& ioctx = ref.pool.ioctx();

  tracepoint(rgw_rados, operate_enter, req_id.c_str());
  // NOTE(review): passes null_yield rather than the caller's `y` -- confirm
  // this is intentional.
  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
  tracepoint(rgw_rados, operate_exit, req_id.c_str());
  if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
                or -ENOENT if was removed, or -EEXIST if it did not exist
                before and now it does */
    if (r == -EEXIST && assume_noent) {
      target->invalidate_state();
      return r;
    }
    goto done_cancel;
  }

  epoch = ioctx.get_last_version();
  poolid = ioctx.get_id();

  r = target->complete_atomic_modification(dpp);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
  }

  tracepoint(rgw_rados, complete_enter, req_id.c_str());
  r = index_op->complete(dpp, poolid, epoch, size, accounted_size,
                        meta.set_mtime, etag, content_type,
                        storage_class, &acl_bl,
                        meta.category, meta.remove_objs, meta.user_data, meta.appendable);
  tracepoint(rgw_rados, complete_exit, req_id.c_str());
  if (r < 0)
    goto done_cancel;

  if (meta.mtime) {
    *meta.mtime = meta.set_mtime;
  }

  /* note that index_op was using state so we couldn't invalidate it earlier */
  target->invalidate_state();
  state = NULL;

  if (versioned_op && meta.olh_epoch) {
    r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), target->get_target(), false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
    if (r < 0) {
      return r;
    }
  }

  /* Register a delete-at hint with the object expirer, best effort. */
  if (!real_clock::is_zero(meta.delete_at)) {
    rgw_obj_index_key obj_key;
    obj.key.get_index_key(&obj_key);

    r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name,
                                     obj.bucket.bucket_id, obj_key);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
      /* ignoring error, nothing we can do at this point */
    }
  }
  meta.canceled = false;

  /* update quota cache */
  if (meta.completeMultipart){
       /* bytes were already accounted per part; only count the object. */
       store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
                                     0, orig_size);
  }
  else {
    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
                                     accounted_size, orig_size);  
  }
  return 0;

done_cancel:
  /* Roll back the pending bucket-index entry. */
  int ret = index_op->cancel(dpp, meta.remove_objs);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
  }

  meta.canceled = true;

  /* we lost in a race. There are a few options:
   * - existing object was rewritten (ECANCELED)
   * - non existing object was created (EEXIST)
   * - object was removed (ENOENT)
   * should treat it as a success
   */
  if (meta.if_match == NULL && meta.if_nomatch == NULL) {
    if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
      r = 0;
    }
  } else {
    if (meta.if_match != NULL) {
      // only overwrite existing object
      if (strcmp(meta.if_match, "*") == 0) {
        if (r == -ENOENT) {
          r = -ERR_PRECONDITION_FAILED;
        } else if (r == -ECANCELED) {
          r = 0;
        }
      }
    }

    if (meta.if_nomatch != NULL) {
      // only create a new object
      if (strcmp(meta.if_nomatch, "*") == 0) {
        if (r == -EEXIST) {
          r = -ERR_PRECONDITION_FAILED;
        } else if (r == -ENOENT) {
          r = 0;
        }
      }
    }
  }

  return r;
}
-
-int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
-                                           map<string, bufferlist>& attrs, optional_yield y)
-{
-  RGWBucketInfo& bucket_info = target->get_bucket_info();
-
-  RGWRados::Bucket bop(target->get_store(), bucket_info);
-  RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
-  index_op.set_zones_trace(meta.zones_trace);
-  
-  bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
-  int r;
-  if (assume_noent) {
-    r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
-    if (r == -EEXIST) {
-      assume_noent = false;
-    }
-  }
-  if (!assume_noent) {
-    r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
-  }
-  return r;
-}
-
/*
 * Stream sink used when fetching an object from a remote RGW.  Receives the
 * raw HTTP body, splits off the leading "extra data" (a JSON blob holding
 * the source object's attrs), then pipes the remaining payload through an
 * optional compression filter and an optional etag-verification filter into
 * the supplied ObjectProcessor.
 */
class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
{
  const DoutPrefixProvider *dpp;
  CephContext* cct;
  rgw_obj obj;
  rgw::sal::DataProcessor *filter;       // head of the filter chain; rewired in process_attrs()
  boost::optional<RGWPutObj_Compress>& compressor;
  bool try_etag_verify;
  rgw::putobj::etag_verifier_ptr etag_verifier;
  boost::optional<rgw::putobj::ChunkProcessor> buffering;
  CompressorRef& plugin;
  rgw::sal::ObjectProcessor *processor;  // final sink (same object the ctor gets as `p`)
  void (*progress_cb)(off_t, void *);
  void *progress_data;
  bufferlist extra_data_bl, manifest_bl;
  std::optional<RGWCompressionInfo> compression_info;
  uint64_t extra_data_left{0};
  bool need_to_process_attrs{true};
  uint64_t data_len{0};
  map<string, bufferlist> src_attrs;
  uint64_t ofs{0};
  uint64_t lofs{0}; /* logical ofs */ // NOTE(review): shadowed by a local in handle_data(); appears unused -- confirm
  std::function<int(map<string, bufferlist>&)> attrs_handler;

public:
  // Note: `p` seeds both `filter` (chain head, possibly re-pointed later)
  // and `processor` (the ultimate sink).
  RGWRadosPutObj(const DoutPrefixProvider *dpp,
                 CephContext* cct,
                 CompressorRef& plugin,
                 boost::optional<RGWPutObj_Compress>& compressor,
                 rgw::sal::ObjectProcessor *p,
                 void (*_progress_cb)(off_t, void *),
                 void *_progress_data,
                 std::function<int(map<string, bufferlist>&)> _attrs_handler) :
                       dpp(dpp),
                       cct(cct),
                       filter(p),
                       compressor(compressor),
                       try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify),
                       plugin(plugin),
                       processor(p),
                       progress_cb(_progress_cb),
                       progress_data(_progress_data),
                       attrs_handler(_attrs_handler) {}


  // Parse the buffered extra data (if any) into src_attrs, hand them to
  // attrs_handler, then assemble the filter chain (buffering + compression,
  // etag verifier).  Called once, before the first payload byte is processed.
  int process_attrs(void) {
    if (extra_data_bl.length()) {
      JSONParser jp;
      if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
        ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
        return -EIO;
      }

      JSONDecoder::decode_json("attrs", src_attrs, &jp);

      auto iter = src_attrs.find(RGW_ATTR_COMPRESSION);
      if (iter != src_attrs.end()) {
        const bufferlist bl = std::move(iter->second);
        src_attrs.erase(iter); // don't preserve source compression info

        if (try_etag_verify) {
          // if we're trying to verify etags, we need to convert compressed
          // ranges in the manifest back into logical multipart part offsets
          RGWCompressionInfo info;
          bool compressed = false;
          int r = rgw_compression_info_from_attr(bl, compressed, info);
          if (r < 0) {
            ldpp_dout(dpp, 4) << "failed to decode compression info, "
                "disabling etag verification" << dendl;
            try_etag_verify = false;
          } else if (compressed) {
            compression_info = std::move(info);
          }
        }
      }
      /* We need the manifest to recompute the ETag for verification */
      iter = src_attrs.find(RGW_ATTR_MANIFEST);
      if (iter != src_attrs.end()) {
        manifest_bl = std::move(iter->second);
        src_attrs.erase(iter);
      }

      // filter out olh attributes
      iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
      while (iter != src_attrs.end()) {
        if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
          break;
        }
        iter = src_attrs.erase(iter);
      }
    }

    int ret = attrs_handler(src_attrs);
    if (ret < 0) {
      return ret;
    }

    if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
      //do not compress if object is encrypted
      compressor = boost::in_place(cct, plugin, filter);
      // add a filter that buffers data so we don't try to compress tiny blocks.
      // libcurl reads in 16k at a time, and we need at least 64k to get a good
      // compression ratio
      constexpr unsigned buffer_size = 512 * 1024;
      buffering = boost::in_place(&*compressor, buffer_size);
      filter = &*buffering;
    }

    /*
     * Presently we don't support ETag based verification if encryption is
     * requested. We can enable simultaneous support once we have a mechanism
     * to know the sequence in which the filters must be applied.
     */
    if (try_etag_verify && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
      ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl,
                                              compression_info,
                                              etag_verifier);
      if (ret < 0) {
        ldpp_dout(dpp, 4) << "failed to initial etag verifier, "
            "disabling etag verification" << dendl;
      } else {
        filter = etag_verifier.get();
      }
    }

    need_to_process_attrs = false;

    return 0;
  }

  // Stream callback: consume extra data first (until extra_data_left hits
  // zero), then feed payload bytes into the filter chain at logical offsets.
  int handle_data(bufferlist& bl, bool *pause) override {
    if (progress_cb) {
      progress_cb(data_len, progress_data);
    }
    if (extra_data_left) {
      uint64_t extra_len = bl.length();
      if (extra_len > extra_data_left)
        extra_len = extra_data_left;

      bufferlist extra;
      bl.splice(0, extra_len, &extra);
      extra_data_bl.append(extra);

      extra_data_left -= extra_len;
      if (extra_data_left == 0) {
        int res = process_attrs();
        if (res < 0)
          return res;
      }
      ofs += extra_len;
      if (bl.length() == 0) {
        return 0;
      }
    }
    if (need_to_process_attrs) {
      /* need to call process_attrs() even if we don't get any attrs,
       * need it to call attrs_handler().
       */
      int res = process_attrs();
      if (res < 0) {
        return res;
      }
    }

    /* extra_data_len comes from the ReceiveCB base class. */
    ceph_assert(uint64_t(ofs) >= extra_data_len);

    uint64_t size = bl.length();
    ofs += size;

    const uint64_t lofs = data_len;
    data_len += size;

    return filter->process(std::move(bl), lofs);
  }

  // Flush the filter chain (empty final chunk at the total logical length).
  int flush() {
    return filter->process({}, data_len);
  }

  bufferlist& get_extra_data() { return extra_data_bl; }

  map<string, bufferlist>& get_attrs() { return src_attrs; }

  void set_extra_data_len(uint64_t len) override {
    extra_data_left = len;
    RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
  }

  uint64_t get_data_len() {
    return data_len;
  }

  // Returns the recomputed ETag, or "" when verification was disabled.
  std::string get_verifier_etag() {
    if (etag_verifier) {
      etag_verifier->calculate_etag();
      return etag_verifier->get_calculated_etag();
    } else {
      return "";
    }
  }
};
-
-/*
- * prepare attrset depending on attrs_mod.
- */
-static void set_copy_attrs(map<string, bufferlist>& src_attrs,
-                           map<string, bufferlist>& attrs,
-                           RGWRados::AttrsMod attrs_mod)
-{
-  switch (attrs_mod) {
-  case RGWRados::ATTRSMOD_NONE:
-    attrs = src_attrs;
-    break;
-  case RGWRados::ATTRSMOD_REPLACE:
-    if (!attrs[RGW_ATTR_ETAG].length()) {
-      attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
-    }
-    if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
-      auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
-      if (ttiter != src_attrs.end()) {
-        attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
-      }
-    }
-    break;
-  case RGWRados::ATTRSMOD_MERGE:
-    for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
-      if (attrs.find(it->first) == attrs.end()) {
-       attrs[it->first] = it->second;
-      }
-    }
-    break;
-  }
-}
-
-int RGWRados::rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y)
-{
-  RGWObjectCtx rctx(this->driver);
-  rgw::sal::Attrs attrset;
-  uint64_t obj_size;
-  ceph::real_time mtime;
-  RGWRados::Object op_target(this, obj->get_bucket(), rctx, obj);
-  RGWRados::Object::Read read_op(&op_target);
-
-  read_op.params.attrs = &attrset;
-  read_op.params.obj_size = &obj_size;
-  read_op.params.lastmod = &mtime;
-
-  int ret = read_op.prepare(y, dpp);
-  if (ret < 0)
-    return ret;
-
-  attrset.erase(RGW_ATTR_ID_TAG);
-  attrset.erase(RGW_ATTR_TAIL_TAG);
-  attrset.erase(RGW_ATTR_STORAGE_CLASS);
-
-  return this->copy_obj_data(rctx, obj->get_bucket(),
-                            obj->get_bucket()->get_info().placement_rule,
-                            read_op, obj_size - 1, obj, NULL, mtime,
-                            attrset, 0, real_time(), NULL, dpp, y);
-}
-
-struct obj_time_weight {
-  real_time mtime;
-  uint32_t zone_short_id;
-  uint64_t pg_ver;
-  bool high_precision;
-
-  obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
-
-  bool compare_low_precision(const obj_time_weight& rhs) {
-    struct timespec l = ceph::real_clock::to_timespec(mtime);
-    struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
-    l.tv_nsec = 0;
-    r.tv_nsec = 0;
-    if (l > r) {
-      return false;
-    }
-    if (l < r) {
-      return true;
-    }
-    if (!zone_short_id || !rhs.zone_short_id) {
-      /* don't compare zone ids, if one wasn't provided */
-      return false;
-    }
-    if (zone_short_id != rhs.zone_short_id) {
-      return (zone_short_id < rhs.zone_short_id);
-    }
-    return (pg_ver < rhs.pg_ver);
-
-  }
-
-  bool operator<(const obj_time_weight& rhs) {
-    if (!high_precision || !rhs.high_precision) {
-      return compare_low_precision(rhs);
-    }
-    if (mtime > rhs.mtime) {
-      return false;
-    }
-    if (mtime < rhs.mtime) {
-      return true;
-    }
-    if (!zone_short_id || !rhs.zone_short_id) {
-      /* don't compare zone ids, if one wasn't provided */
-      return false;
-    }
-    if (zone_short_id != rhs.zone_short_id) {
-      return (zone_short_id < rhs.zone_short_id);
-    }
-    return (pg_ver < rhs.pg_ver);
-  }
-
-  void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
-    mtime = _mtime;
-    zone_short_id = _short_id;
-    pg_ver = _pg_ver;
-  }
-
-  void init(RGWObjState *state) {
-    mtime = state->mtime;
-    zone_short_id = state->zone_short_id;
-    pg_ver = state->pg_ver;
-  }
-};
-
-inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
-  out << o.mtime;
-
-  if (o.zone_short_id != 0 || o.pg_ver != 0) {
-    out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
-  }
-
-  return out;
-}
-
-class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
-  bufferlist extra_data;
-public:
-  RGWGetExtraDataCB() {}
-  int handle_data(bufferlist& bl, bool *pause) override {
-    int bl_len = (int)bl.length();
-    if (extra_data.length() < extra_data_len) {
-      off_t max = extra_data_len - extra_data.length();
-      if (max > bl_len) {
-        max = bl_len;
-      }
-      bl.splice(0, max, &extra_data);
-    }
-    return bl_len;
-  }
-
-  bufferlist& get_extra_data() {
-    return extra_data;
-  }
-};
-
/*
 * Stat an object living in another zone/zonegroup by issuing an rgwx-stat
 * GET through the appropriate RESTConn, and fill (when non-null) *src_mtime,
 * *psize, *pheaders, *petag and *pattrs from the response.  The source
 * attrs arrive as a JSON blob in the response "extra data".
 *
 * NOTE(review): if_match, if_nomatch, version_id, ptag, high_precision_time
 * and the locals `tag` / `set_mtime_weight` are currently unused in this
 * function -- confirm whether they are kept only for signature symmetry
 * with fetch_remote_obj().
 */
int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp,
               RGWObjectCtx& obj_ctx,
               const rgw_user& user_id,
               req_info *info,
               const rgw_zone_id& source_zone,
               rgw::sal::Object* src_obj,
               const RGWBucketInfo *src_bucket_info,
               real_time *src_mtime,
               uint64_t *psize,
               const real_time *mod_ptr,
               const real_time *unmod_ptr,
               bool high_precision_time,
               const char *if_match,
               const char *if_nomatch,
               map<string, bufferlist> *pattrs,
               map<string, string> *pheaders,
               string *version_id,
               string *ptag,
               string *petag)
{
  /* source is in a different zonegroup, copy from there */

  RGWRESTStreamRWRequest *in_stream_req;
  string tag;
  map<string, bufferlist> src_attrs;
  append_rand_alpha(cct, tag, tag, 32);
  obj_time_weight set_mtime_weight;
  set_mtime_weight.high_precision = high_precision_time;

  /* Pick the REST connection: empty source zone means "by zonegroup"
   * (master zonegroup when the source bucket has none), otherwise use the
   * per-zone connection map. */
  RGWRESTConn *conn;
  if (source_zone.empty()) {
    if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
      /* source is in the master zonegroup */
      conn = svc.zone->get_master_conn();
    } else {
      auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
      map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
      if (iter == zonegroup_conn_map.end()) {
        ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
        return -ENOENT;
      }
      conn = iter->second;
    }
  } else {
    auto& zone_conn_map = svc.zone->get_zone_conn_map();
    auto iter = zone_conn_map.find(source_zone);
    if (iter == zone_conn_map.end()) {
      ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
      return -ENOENT;
    }
    conn = iter->second;
  }

  RGWGetExtraDataCB cb;
  map<string, string> req_headers;
  real_time set_mtime;

  const real_time *pmod = mod_ptr;

  obj_time_weight dest_mtime_weight;

  /* rgwx_stat makes this a metadata-only request. */
  constexpr bool prepend_meta = true;
  constexpr bool get_op = true;
  constexpr bool rgwx_stat = true;
  constexpr bool sync_manifest = true;
  constexpr bool skip_decrypt = true;
  int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
                      dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
                      prepend_meta, get_op, rgwx_stat,
                      sync_manifest, skip_decrypt,
                      true, &cb, &in_stream_req);
  if (ret < 0) {
    return ret;
  }

  ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize,
                               nullptr, pheaders, null_yield);
  if (ret < 0) {
    return ret;
  }

  /* Decode the source attrs from the JSON extra data, if any. */
  bufferlist& extra_data_bl = cb.get_extra_data();
  if (extra_data_bl.length()) {
    JSONParser jp;
    if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
      ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
      return -EIO;
    }

    JSONDecoder::decode_json("attrs", src_attrs, &jp);

    src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
  }

  if (src_mtime) {
    *src_mtime = set_mtime;
  }

  if (petag) {
    map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
    if (iter != src_attrs.end()) {
      bufferlist& etagbl = iter->second;
      *petag = etagbl.to_str();
      /* strip trailing NUL bytes left over from the stored xattr */
      while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
        *petag = petag->substr(0, petag->size() - 1);
      }
    }
  }

  if (pattrs) {
    *pattrs = std::move(src_attrs);
  }

  return 0;
}
-
-int RGWFetchObjFilter_Default::filter(CephContext *cct,
-                                      const rgw_obj_key& source_key,
-                                      const RGWBucketInfo& dest_bucket_info,
-                                      std::optional<rgw_placement_rule> dest_placement_rule,
-                                      const map<string, bufferlist>& obj_attrs,
-                                     std::optional<rgw_user> *poverride_owner,
-                                      const rgw_placement_rule **prule)
-{
-  const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
-  if (!ptail_rule) {
-    auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
-    if (iter != obj_attrs.end()) {
-      dest_rule.storage_class = iter->second.to_str();
-      dest_rule.inherit_from(dest_bucket_info.placement_rule);
-      ptail_rule = &dest_rule;
-    } else {
-      ptail_rule = &dest_bucket_info.placement_rule;
-    }
-  }
-  *prule = ptail_rule;
-  return 0;
-}
-
-int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
-               const rgw_user& user_id,
-               req_info *info,
-               const rgw_zone_id& source_zone,
-               rgw::sal::Object* dest_obj,
-               rgw::sal::Object* src_obj,
-               rgw::sal::Bucket* dest_bucket,
-               rgw::sal::Bucket* src_bucket,
-               std::optional<rgw_placement_rule> dest_placement_rule,
-               real_time *src_mtime,
-               real_time *mtime,
-               const real_time *mod_ptr,
-               const real_time *unmod_ptr,
-               bool high_precision_time,
-               const char *if_match,
-               const char *if_nomatch,
-               AttrsMod attrs_mod,
-               bool copy_if_newer,
-               rgw::sal::Attrs& attrs,
-               RGWObjCategory category,
-               std::optional<uint64_t> olh_epoch,
-              real_time delete_at,
-               string *ptag,
-               string *petag,
-               void (*progress_cb)(off_t, void *),
-               void *progress_data,
-               const DoutPrefixProvider *dpp,
-               RGWFetchObjFilter *filter,
-               rgw_zone_set *zones_trace,
-               std::optional<uint64_t>* bytes_transferred)
-{
-  /* source is in a different zonegroup, copy from there */
-
-  RGWRESTStreamRWRequest *in_stream_req;
-  string tag;
-  int i;
-  append_rand_alpha(cct, tag, tag, 32);
-  obj_time_weight set_mtime_weight;
-  set_mtime_weight.high_precision = high_precision_time;
-  int ret;
-
-  rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
-  using namespace rgw::putobj;
-  AtomicObjectProcessor processor(&aio, this->driver, nullptr, user_id,
-                                  obj_ctx, dest_obj->clone(), olh_epoch,
-                                 tag, dpp, null_yield);
-  RGWRESTConn *conn;
-  auto& zone_conn_map = svc.zone->get_zone_conn_map();
-  auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
-  if (source_zone.empty()) {
-    if (!src_bucket || src_bucket->get_info().zonegroup.empty()) {
-      /* source is in the master zonegroup */
-      conn = svc.zone->get_master_conn();
-    } else {
-      map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket->get_info().zonegroup);
-      if (iter == zonegroup_conn_map.end()) {
-        ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
-        return -ENOENT;
-      }
-      conn = iter->second;
-    }
-  } else {
-    auto iter = zone_conn_map.find(source_zone);
-    if (iter == zone_conn_map.end()) {
-      ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
-      return -ENOENT;
-    }
-    conn = iter->second;
-  }
-
-  boost::optional<RGWPutObj_Compress> compressor;
-  CompressorRef plugin;
-
-  RGWFetchObjFilter_Default source_filter;
-  if (!filter) {
-    filter = &source_filter;
-  }
-
-  std::optional<rgw_user> override_owner;
-
-  RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
-                    [&](map<string, bufferlist>& obj_attrs) {
-                      const rgw_placement_rule *ptail_rule;
-
-                      int ret = filter->filter(cct,
-                                               src_obj->get_key(),
-                                               dest_bucket->get_info(),
-                                               dest_placement_rule,
-                                               obj_attrs,
-                                              &override_owner,
-                                               &ptail_rule);
-                      if (ret < 0) {
-                        ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
-                        return ret;
-                      }
-
-                      processor.set_tail_placement(*ptail_rule);
-
-                      const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
-                      if (compression_type != "none") {
-                        plugin = Compressor::create(cct, compression_type);
-                        if (!plugin) {
-                          ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
-                                        << compression_type << dendl;
-                        }
-                      }
-
-                      ret = processor.prepare(null_yield);
-                      if (ret < 0) {
-                        return ret;
-                      }
-                      return 0;
-                    });
-
-  string etag;
-  real_time set_mtime;
-  uint64_t expected_size = 0;
-
-  RGWObjState *dest_state = NULL;
-  RGWObjManifest *manifest = nullptr;
-
-  const real_time *pmod = mod_ptr;
-
-  obj_time_weight dest_mtime_weight;
-
-  if (copy_if_newer) {
-    /* need to get mtime for destination */
-    ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield);
-    if (ret < 0)
-      goto set_err_state;
-
-    if (!real_clock::is_zero(dest_state->mtime)) {
-      dest_mtime_weight.init(dest_state);
-      pmod = &dest_mtime_weight.mtime;
-    }
-  }
-
-  static constexpr bool prepend_meta = true;
-  static constexpr bool get_op = true;
-  static constexpr bool rgwx_stat = false;
-  static constexpr bool sync_manifest = true;
-  static constexpr bool skip_decrypt = true;
-  ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
-                      dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
-                      prepend_meta, get_op, rgwx_stat,
-                      sync_manifest, skip_decrypt,
-                      true,
-                      &cb, &in_stream_req);
-  if (ret < 0) {
-    goto set_err_state;
-  }
-
-  ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
-                               &expected_size, nullptr, nullptr, null_yield);
-  if (ret < 0) {
-    goto set_err_state;
-  }
-  ret = cb.flush();
-  if (ret < 0) {
-    goto set_err_state;
-  }
-  if (cb.get_data_len() != expected_size) {
-    ret = -EIO;
-    ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected "
-        << expected_size << " bytes but received " << cb.get_data_len() << dendl;
-    goto set_err_state;
-  }
-  if (compressor && compressor->is_compressed()) {
-    bufferlist tmp;
-    RGWCompressionInfo cs_info;
-    cs_info.compression_type = plugin->get_type_name();
-    cs_info.orig_size = cb.get_data_len();
-    cs_info.compressor_message = compressor->get_compressor_message();
-    cs_info.blocks = move(compressor->get_compression_blocks());
-    encode(cs_info, tmp);
-    cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
-  }
-
-  if (override_owner) {
-    processor.set_owner(*override_owner);
-
-    auto& obj_attrs = cb.get_attrs();
-
-    RGWUserInfo owner_info;
-    if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) {
-      ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
-      return -EINVAL;
-    }
-
-    RGWAccessControlPolicy acl;
-
-    auto aiter = obj_attrs.find(RGW_ATTR_ACL);
-    if (aiter == obj_attrs.end()) {
-      ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
-      acl.create_default(owner_info.user_id, owner_info.display_name);
-    } else {
-      auto iter = aiter->second.cbegin();
-      try {
-       acl.decode(iter);
-      } catch (buffer::error& err) {
-       ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
-       return -EIO;
-      }
-    }
-
-    ACLOwner new_owner;
-    new_owner.set_id(*override_owner);
-    new_owner.set_name(owner_info.display_name);
-
-    acl.set_owner(new_owner);
-
-    bufferlist bl;
-    acl.encode(bl);
-    obj_attrs[RGW_ATTR_ACL] = std::move(bl);
-  }
-
-  if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
-    cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
-  } else {
-    map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
-    if (iter != cb.get_attrs().end()) {
-      try {
-        decode(delete_at, iter->second);
-      } catch (buffer::error& err) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
-      }
-    }
-  }
-
-  if (src_mtime) {
-    *src_mtime = set_mtime;
-  }
-
-  if (petag) {
-    const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
-    if (iter != cb.get_attrs().end()) {
-      *petag = iter->second.to_str();
-    }
-  }
-
-  //erase the append attr
-  cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
-
-  { // add x-amz-replication-status=REPLICA
-    auto& bl = cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_STATUS];
-    bl.clear(); // overwrite source's status
-    bl.append("REPLICA");
-  }
-
-  if (source_zone.empty()) {
-    set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
-  } else {
-    attrs = cb.get_attrs();
-  }
-
-  if (copy_if_newer) {
-    uint64_t pg_ver = 0;
-    auto i = attrs.find(RGW_ATTR_PG_VER);
-    if (i != attrs.end() && i->second.length() > 0) {
-      auto iter = i->second.cbegin();
-      try {
-        decode(pg_ver, iter);
-      } catch (buffer::error& err) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
-        /* non critical error */
-      }
-    }
-    set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
-  }
-
-  /* Perform ETag verification is we have computed the object's MD5 sum at our end */
-  if (const auto& verifier_etag = cb.get_verifier_etag();
-      !verifier_etag.empty()) {
-    string trimmed_etag = etag;
-
-    /* Remove the leading and trailing double quotes from etag */
-    trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'),
-      trimmed_etag.end());
-
-    if (verifier_etag != trimmed_etag) {
-      ret = -EIO;
-      ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. Expected etag:"
-        << trimmed_etag << " Computed etag:" << verifier_etag << dendl;
-      goto set_err_state;
-    }
-  }
-
-#define MAX_COMPLETE_RETRY 100
-  for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
-    bool canceled = false;
-    ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
-                             attrs, delete_at, nullptr, nullptr, nullptr,
-                             zones_trace, &canceled, null_yield);
-    if (ret < 0) {
-      goto set_err_state;
-    }
-
-    if (copy_if_newer && canceled) {
-      ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl;
-      obj_ctx.invalidate(dest_obj->get_obj()); /* object was overwritten */
-      ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield);
-      if (ret < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
-        goto set_err_state;
-      }
-      dest_mtime_weight.init(dest_state);
-      dest_mtime_weight.high_precision = high_precision_time;
-      if (!dest_state->exists ||
-        dest_mtime_weight < set_mtime_weight) {
-        ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
-        continue;
-      } else {
-        ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
-      }
-    }
-    break;
-  }
-
-  if (i == MAX_COMPLETE_RETRY) {
-    ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
-    ret = -EIO;
-    goto set_err_state;
-  }
-
-  if (bytes_transferred) {
-    *bytes_transferred = cb.get_data_len();
-  }
-  return 0;
-set_err_state:
-  if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
-    // we may have already fetched during sync of OP_ADD, but were waiting
-    // for OP_LINK_OLH to call set_olh() with a real olh_epoch
-    if (olh_epoch && *olh_epoch > 0) {
-      constexpr bool log_data_change = true;
-      ret = set_olh(dpp, obj_ctx, dest_bucket->get_info(), dest_obj, false, nullptr,
-                    *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
-    } else {
-      // we already have the latest copy
-      ret = 0;
-    }
-  }
-  return ret;
-}
-
-
-int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
-                                      RGWObjState *astate,
-                                      map<string, bufferlist>& src_attrs,
-                                      RGWRados::Object::Read& read_op,
-                                      const rgw_user& user_id,
-                                      rgw::sal::Object* dest_obj,
-                                      real_time *mtime)
-{
-  string etag;
-
-  RGWRESTStreamS3PutObj *out_stream_req;
-
-  auto rest_master_conn = svc.zone->get_master_conn();
-
-  int ret = rest_master_conn->put_obj_async_init(dpp, user_id, dest_obj, src_attrs, &out_stream_req);
-  if (ret < 0) {
-    return ret;
-  }
-
-  out_stream_req->set_send_length(astate->size);
-
-  ret = RGWHTTP::send(out_stream_req);
-  if (ret < 0) {
-    delete out_stream_req;
-    return ret;
-  }
-
-  ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
-  if (ret < 0) {
-    delete out_stream_req;
-    return ret;
-  }
-
-  ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
/**
 * Copy an object.
 * dest_obj: the object to copy into
 * src_obj: the object to copy from
 * attrs: usage depends on attrs_mod parameter
 * attrs_mod: the modification mode of the attrs, may have the following values:
 *            ATTRSMOD_NONE - the attributes of the source object will be
 *                            copied without modifications, attrs parameter is ignored;
 *            ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
 *                               parameter, source object attributes are not copied;
 *            ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
 *                             are overwritten by values contained in attrs parameter.
 * err: stores any errors resulting from the get of the original object
 * Returns: 0 on success, -ERR# otherwise.
 */
int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
               const rgw_user& user_id,
               req_info *info,
               const rgw_zone_id& source_zone,
               rgw::sal::Object* dest_obj,
               rgw::sal::Object* src_obj,
               rgw::sal::Bucket* dest_bucket,
               rgw::sal::Bucket* src_bucket,
               const rgw_placement_rule& dest_placement,
               real_time *src_mtime,
               real_time *mtime,
               const real_time *mod_ptr,
               const real_time *unmod_ptr,
               bool high_precision_time,
               const char *if_match,
               const char *if_nomatch,
               AttrsMod attrs_mod,
               bool copy_if_newer,
               rgw::sal::Attrs& attrs,
               RGWObjCategory category,
               uint64_t olh_epoch,
               real_time delete_at,
               string *version_id,
               string *ptag,
               string *petag,
               void (*progress_cb)(off_t, void *),
               void *progress_data,
               const DoutPrefixProvider *dpp,
               optional_yield y)
{
  int ret;
  uint64_t obj_size;
  rgw_obj shadow_obj = dest_obj->get_obj();
  string shadow_oid;

  bool remote_src;
  bool remote_dest;

  // NOTE(review): shadow_obj/shadow_oid (and the version_id parameter) are
  // computed/accepted but never used below -- candidates for removal; confirm
  // against callers before dropping them
  append_rand_alpha(cct, dest_obj->get_oid(), shadow_oid, 32);
  shadow_obj.init_ns(dest_obj->get_bucket()->get_key(), shadow_oid, shadow_ns);

  auto& zonegroup = svc.zone->get_zonegroup();

  // a bucket is "remote" when it lives in a zonegroup other than ours
  remote_dest = !zonegroup.equals(dest_bucket->get_info().zonegroup);
  remote_src = !zonegroup.equals(src_bucket->get_info().zonegroup);

  if (remote_src && remote_dest) {
    ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
    return -EINVAL;
  }

  ldpp_dout(dpp, 5) << "Copy object " << src_obj->get_bucket() << ":" << src_obj->get_oid() << " => " << dest_obj->get_bucket() << ":" << dest_obj->get_oid() << dendl;

  // remote source (or explicit source zone): delegate the whole copy to the
  // REST-based fetch path
  if (remote_src || !source_zone.empty()) {
    return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
               dest_obj, src_obj, dest_bucket, src_bucket,
               dest_placement, src_mtime, mtime, mod_ptr,
               unmod_ptr, high_precision_time,
               if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
               olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
               nullptr /* filter */);
  }

  // local source: read its attrs/size, honoring the caller's conditionals
  map<string, bufferlist> src_attrs;
  RGWRados::Object src_op_target(this, src_bucket, obj_ctx, src_obj);
  RGWRados::Object::Read read_op(&src_op_target);

  read_op.conds.mod_ptr = mod_ptr;
  read_op.conds.unmod_ptr = unmod_ptr;
  read_op.conds.high_precision_time = high_precision_time;
  read_op.conds.if_match = if_match;
  read_op.conds.if_nomatch = if_nomatch;
  read_op.params.attrs = &src_attrs;
  read_op.params.lastmod = src_mtime;
  read_op.params.obj_size = &obj_size;

  ret = read_op.prepare(y, dpp);
  if (ret < 0) {
    return ret;
  }
  if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
    // Current implementation does not follow S3 spec and even
    // may result in data corruption silently when copying
    // multipart objects across pools. So reject COPY operations
    // on encrypted objects before it is fully functional.
    ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
                  << " has not been implemented." << dendl;
    return -ERR_NOT_IMPLEMENTED;
  }

  // destination gets the caller-provided ACL, never the source's delete-at
  src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
  src_attrs.erase(RGW_ATTR_DELETE_AT);

  // retention/legal-hold come only from the caller's attrs, if present
  src_attrs.erase(RGW_ATTR_OBJECT_RETENTION);
  src_attrs.erase(RGW_ATTR_OBJECT_LEGAL_HOLD);
  map<string, bufferlist>::iterator rt = attrs.find(RGW_ATTR_OBJECT_RETENTION);
  if (rt != attrs.end())
    src_attrs[RGW_ATTR_OBJECT_RETENTION] = rt->second;
  map<string, bufferlist>::iterator lh = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD);
  if (lh != attrs.end())
    src_attrs[RGW_ATTR_OBJECT_LEGAL_HOLD] = lh->second;

  // merge per attrs_mod, then strip per-instance tags; keep compression info
  set_copy_attrs(src_attrs, attrs, attrs_mod);
  attrs.erase(RGW_ATTR_ID_TAG);
  attrs.erase(RGW_ATTR_PG_VER);
  attrs.erase(RGW_ATTR_SOURCE_ZONE);
  map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
  if (cmp != src_attrs.end())
    attrs[RGW_ATTR_COMPRESSION] = cmp->second;

  RGWObjManifest manifest;
  RGWObjState *astate = NULL;
  RGWObjManifest *amanifest = nullptr;

  ret = get_obj_state(dpp, &obj_ctx, src_bucket->get_info(), src_obj, &astate, &amanifest, y);
  if (ret < 0) {
    return ret;
  }

  vector<rgw_raw_obj> ref_objs;

  if (remote_dest) {
    /* dest is in a different zonegroup, copy it there */
    return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime);
  }
  uint64_t max_chunk_size;

  ret = get_max_chunk_size(dest_bucket->get_placement_rule(), dest_obj->get_obj(), &max_chunk_size, dpp);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj->get_bucket() << dendl;
    return ret;
  }

  rgw_pool src_pool;
  rgw_pool dest_pool;

  // source placement: prefer the manifest's tail placement, fall back to the
  // bucket's rule
  const rgw_placement_rule *src_rule{nullptr};

  if (amanifest) {
    src_rule = &amanifest->get_tail_placement().placement_rule;
    ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
  }

  if (!src_rule || src_rule->empty()) {
    src_rule = &src_bucket->get_placement_rule();
  }

  if (!get_obj_data_pool(*src_rule, src_obj->get_obj(), &src_pool)) {
    ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
    return -EIO;
  }

  if (!get_obj_data_pool(dest_placement, dest_obj->get_obj(), &dest_pool)) {
    ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
    return -EIO;
  }

  ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
                             << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;

  // decide between a full data copy and a cheap tail-refcount copy: data must
  // actually be copied when there is no manifest or placement/pool differs
  bool copy_data = (!amanifest) ||
    (*src_rule != dest_placement) ||
    (src_pool != dest_pool);

  bool copy_first = false;
  if (amanifest) {
    if (!amanifest->has_tail()) {
      copy_data = true;
    } else {
      uint64_t head_size = amanifest->get_head_size();

      // a head chunk that fits in one op can be copied alone; a larger one
      // forces a full data copy
      if (head_size > 0) {
        if (head_size > max_chunk_size) {
          copy_data = true;
        } else {
          copy_first = true;
        }
      }
    }
  }

  if (petag) {
    const auto iter = attrs.find(RGW_ATTR_ETAG);
    if (iter != attrs.end()) {
      *petag = iter->second.to_str();
    }
  }

  if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
    attrs.erase(RGW_ATTR_TAIL_TAG);
    return copy_obj_data(obj_ctx, dest_bucket, dest_placement, read_op, obj_size - 1, dest_obj,
                         mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
  }

  /* This has been in for 2 years, so we can safely assume amanifest is not NULL */
  RGWObjManifest::obj_iterator miter = amanifest->obj_begin(dpp);

  if (copy_first) { // we need to copy first chunk, not increase refcount
    ++miter;
  }

  bufferlist first_chunk;

  const bool copy_itself = (dest_obj->get_obj() == src_obj->get_obj());
  RGWObjManifest *pmanifest; 
  ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;

  RGWRados::Object dest_op_target(this, dest_bucket, obj_ctx, dest_obj);
  RGWRados::Object::Write write_op(&dest_op_target);

  string tag;

  if (ptag) {
    tag = *ptag;
  }

  if (tag.empty()) {
    append_rand_alpha(cct, tag, tag, 32);
  }

  std::unique_ptr<rgw::Aio> aio;
  rgw::AioResultList all_results;
  if (!copy_itself) {
    // bump a reference on every tail rados object instead of copying bytes;
    // all_results records the gets so done_ret can roll them back on failure
    aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, y);
    attrs.erase(RGW_ATTR_TAIL_TAG);
    manifest = *amanifest;
    const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
    if (tail_placement.bucket.name.empty()) {
      manifest.set_tail_placement(tail_placement.placement_rule, src_obj->get_bucket()->get_key());
    }
    string ref_tag;
    for (; miter != amanifest->obj_end(dpp); ++miter) {
      ObjectWriteOperation op;
      ref_tag = tag + '\0';
      cls_refcount_get(op, ref_tag, true);

      auto obj = svc.rados->obj(miter.get_location().get_raw_obj(driver));
      ret = obj.open(dpp);
      if (ret < 0) {
        ldpp_dout(dpp, 0) << "failed to open rados context for " << obj << dendl;
        goto done_ret;
      }

      static constexpr uint64_t cost = 1; // 1 throttle unit per request
      static constexpr uint64_t id = 0; // ids unused
      rgw::AioResultList completed = aio->get(obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
      ret = rgw::check_for_errors(completed);
      all_results.splice(all_results.end(), completed);
      if (ret < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to copy obj=" << obj << ", the error code = " << ret << dendl;
        goto done_ret;
      }
    }

    rgw::AioResultList completed = aio->drain();
    ret = rgw::check_for_errors(completed);
    all_results.splice(all_results.end(), completed);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to drain ios, the error code = " << ret <<dendl;
      goto done_ret;
    }

    pmanifest = &manifest;
  } else {
    pmanifest = amanifest;
    /* don't send the object's tail for garbage collection */
    astate->keep_tail = true;
  }

  if (copy_first) {
    ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
    if (ret < 0) {
      goto done_ret;
    }

    pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), first_chunk.length());
  } else {
    pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), 0);
  }

  // write the destination head + metadata pointing at the (shared) tail
  write_op.meta.data = &first_chunk;
  write_op.meta.manifest = pmanifest;
  write_op.meta.ptag = &tag;
  write_op.meta.owner = dest_bucket->get_info().owner;
  write_op.meta.mtime = mtime;
  write_op.meta.flags = PUT_OBJ_CREATE;
  write_op.meta.category = category;
  write_op.meta.olh_epoch = olh_epoch;
  write_op.meta.delete_at = delete_at;
  write_op.meta.modify_tail = !copy_itself;

  ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
  if (ret < 0) {
    goto done_ret;
  }

  return 0;

done_ret:
  // error path: drop every tail reference we managed to take above
  if (!copy_itself) {

    /* wait all pending op done */
    rgw::AioResultList completed = aio->drain();
    all_results.splice(all_results.end(), completed);

    /* rollback reference */
    string ref_tag = tag + '\0';
    int ret2 = 0;
    for (auto& r : all_results) {
      if (r.result < 0) {
        continue; // skip errors
      }
      ObjectWriteOperation op;
      cls_refcount_put(op, ref_tag, true);

      static constexpr uint64_t cost = 1; // 1 throttle unit per request
      static constexpr uint64_t id = 0; // ids unused
      rgw::AioResultList completed = aio->get(r.obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
      ret2 = rgw::check_for_errors(completed);
      if (ret2 < 0) {
        ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << r.obj << dendl;
      }
    }
    completed = aio->drain();
    ret2 = rgw::check_for_errors(completed);
    if (ret2 < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to drain rollback ios, the error code = " << ret2 <<dendl;
    }
  }
  return ret;
}
-
-
// Copy an object's data by reading it chunk-by-chunk through read_op and
// re-writing it via an AtomicObjectProcessor into dest_obj under
// dest_placement. `end` is the inclusive last byte offset to copy.
// Returns 0 on success, negative error code otherwise.
int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
               rgw::sal::Bucket* bucket,
               const rgw_placement_rule& dest_placement,
               RGWRados::Object::Read& read_op, off_t end,
               rgw::sal::Object* dest_obj,
               real_time *mtime,
               real_time set_mtime,
               rgw::sal::Attrs& attrs,
               uint64_t olh_epoch,
               real_time delete_at,
               string *petag,
               const DoutPrefixProvider *dpp,
               optional_yield y)
{
  // random write tag so concurrent writers of the same object can be detected
  string tag;
  append_rand_alpha(cct, tag, tag, 32);

  rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
  using namespace rgw::putobj;
  // do not change the null_yield in the initialization of this AtomicObjectProcessor
  // it causes crashes in the ragweed tests
  AtomicObjectProcessor processor(&aio, this->driver, &dest_placement,
                                  bucket->get_info().owner, obj_ctx,
                                  dest_obj->clone(), olh_epoch, tag,
                                  dpp, null_yield);
  int ret = processor.prepare(y);
  if (ret < 0)
    return ret;

  off_t ofs = 0;

  // read/process loop; read() returns the number of bytes read (>= 0) or a
  // negative error
  do {
    bufferlist bl;
    ret = read_op.read(ofs, end, bl, y, dpp);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
      return ret;
    }

    // NOTE(review): a zero-length read would leave ofs unchanged and loop
    // forever; this relies on read() always returning data or an error while
    // ofs <= end -- confirm against Object::Read::read()
    uint64_t read_len = ret;
    ret = processor.process(std::move(bl), ofs);
    if (ret < 0) {
      return ret;
    }

    ofs += read_len;
  } while (ofs <= end);

  // flush: an empty buffer at the final offset tells the processor we're done
  ret = processor.process({}, ofs);
  if (ret < 0) {
    return ret;
  }

  string etag;
  auto iter = attrs.find(RGW_ATTR_ETAG);
  if (iter != attrs.end()) {
    bufferlist& bl = iter->second;
    etag = bl.to_str();
    if (petag) {
      *petag = etag;
    }
  }

  uint64_t accounted_size;
  {
    bool compressed{false};
    RGWCompressionInfo cs_info;
    ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
      return ret;
    }
    // pass original size if compressed
    accounted_size = compressed ? cs_info.orig_size : ofs;
  }

  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
                            nullptr, nullptr, nullptr, nullptr, nullptr, y);
}
-
-int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
-                            rgw::sal::Bucket* bucket,
-                             rgw::sal::Object& obj,
-                             const rgw_placement_rule& placement_rule,
-                             const real_time& mtime,
-                             uint64_t olh_epoch,
-                             const DoutPrefixProvider *dpp,
-                             optional_yield y)
-{
-  rgw::sal::Attrs attrs;
-  real_time read_mtime;
-  uint64_t obj_size;
-
-  obj.set_atomic();
-  RGWRados::Object op_target(this, bucket, obj_ctx, &obj);
-  RGWRados::Object::Read read_op(&op_target);
-
-  read_op.params.attrs = &attrs;
-  read_op.params.lastmod = &read_mtime;
-  read_op.params.obj_size = &obj_size;
-
-  int ret = read_op.prepare(y, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (read_mtime != mtime) {
-    /* raced */
-    return -ECANCELED;
-  }
-
-  attrs.erase(RGW_ATTR_ID_TAG);
-  attrs.erase(RGW_ATTR_TAIL_TAG);
-
-  ret = copy_obj_data(obj_ctx,
-                      bucket,
-                      placement_rule,
-                      read_op,
-                      obj_size - 1,
-                      &obj,
-                      nullptr /* pmtime */,
-                      mtime,
-                      attrs,
-                      olh_epoch,
-                      real_time(),
-                      nullptr /* petag */,
-                      dpp,
-                      y);
-  if (ret < 0) {
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
-{
-  constexpr uint NUM_ENTRIES = 1000u;
-
-  rgw_obj_index_key marker;
-  string prefix;
-  bool is_truncated;
-
-  do {
-    std::vector<rgw_bucket_dir_entry> ent_list;
-    ent_list.reserve(NUM_ENTRIES);
-
-    int r = cls_bucket_list_unordered(dpp,
-                                      bucket_info,
-                                      bucket_info.layout.current_index,
-                                      RGW_NO_SHARD,
-                                     marker,
-                                     prefix,
-                                     NUM_ENTRIES,
-                                     true,
-                                     ent_list,
-                                     &is_truncated,
-                                     &marker,
-                                      y);
-    if (r < 0) {
-      return r;
-    }
-
-    string ns;
-    for (auto const& dirent : ent_list) {
-      rgw_obj_key obj;
-
-      if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
-        return -ENOTEMPTY;
-      }
-    }
-  } while (is_truncated);
-
-  return 0;
-}
-  
-/**
- * Delete a bucket.
- * bucket: the name of the bucket to delete
- * Returns 0 on success, -ERR# otherwise.
- */
-int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty)
-{
-  const rgw_bucket& bucket = bucket_info.bucket;
-  RGWSI_RADOS::Pool index_pool;
-  map<int, string> bucket_objs;
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
-  if (r < 0)
-    return r;
-  
-  if (check_empty) {
-    r = check_bucket_empty(dpp, bucket_info, y);
-    if (r < 0) {
-      return r;
-    }
-  }
-
-  bool remove_ep = true;
-
-  if (objv_tracker.read_version.empty()) {
-    RGWBucketEntryPoint ep;
-    r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
-                                                &ep,
-                                               null_yield,
-                                                dpp,
-                                                RGWBucketCtl::Bucket::GetParams()
-                                                .set_objv_tracker(&objv_tracker));
-    if (r < 0 ||
-        (!bucket_info.bucket.bucket_id.empty() &&
-         ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
-      if (r != -ENOENT) {
-        ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
-        /* we have no idea what caused the error, will not try to remove it */
-      }
-      /* 
-       * either failed to read bucket entrypoint, or it points to a different bucket instance than
-       * requested
-       */
-      remove_ep = false;
-    }
-  }
-  if (remove_ep) {
-    r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp,
-                                                  RGWBucketCtl::Bucket::RemoveParams()
-                                                  .set_objv_tracker(&objv_tracker));
-    if (r < 0)
-      return r;
-  }
-
-  /* if the bucket is not synced we can remove the meta file */
-  if (!svc.zone->is_syncing_bucket_meta(bucket)) {
-    RGWObjVersionTracker objv_tracker;
-    r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp);
-    if (r < 0) {
-      return r;
-    }
-
-   /* remove bucket index objects asynchronously by best effort */
-    (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
-                                      bucket_objs,
-                                      cct->_conf->rgw_bucket_index_max_aio)();
-  }
-
-  return 0;
-}
-
-int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp)
-{
-  RGWBucketInfo info;
-  map<string, bufferlist> attrs;
-  int r;
-
-  if (bucket.bucket_id.empty()) {
-    r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
-  } else {
-    r = get_bucket_instance_info(bucket, info, nullptr, &attrs, null_yield, dpp);
-  }
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
-    return r;
-  }
-
-  info.owner = owner.get_id();
-
-  r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
-    return r;
-  }
-
-  return 0;
-}
-
-
-int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp)
-{
-  int ret = 0;
-
-  vector<rgw_bucket>::iterator iter;
-
-  for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
-    rgw_bucket& bucket = *iter;
-    if (enabled) {
-      ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl;
-    } else {
-      ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl;
-    }
-
-    RGWBucketInfo info;
-    map<string, bufferlist> attrs;
-    int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
-      ret = r;
-      continue;
-    }
-    if (enabled) {
-      info.flags &= ~BUCKET_SUSPENDED;
-    } else {
-      info.flags |= BUCKET_SUSPENDED;
-    }
-
-    r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
-      ret = r;
-      continue;
-    }
-  }
-  return ret;
-}
-
-int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended)
-{
-  RGWBucketInfo bucket_info;
-  int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
-  return 0;
-}
-
-int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp)
-{
-  if ((!manifest)|| state->keep_tail)
-    return 0;
-
-  cls_rgw_obj_chain chain;
-  store->update_gc_chain(dpp, obj->get_obj(), *manifest, &chain);
-
-  if (chain.empty()) {
-    return 0;
-  }
-
-  string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
-  if (store->gc == nullptr) {
-    ldpp_dout(dpp, 0) << "deleting objects inline since gc isn't initialized" << dendl;
-    //Delete objects inline just in case gc hasn't been initialised, prevents crashes
-    store->delete_objs_inline(dpp, chain, tag);
-  } else {
-    auto [ret, leftover_chain] = store->gc->send_split_chain(chain, tag); // do it synchronously
-    if (ret < 0 && leftover_chain) {
-      //Delete objects inline if send chain to gc fails
-      store->delete_objs_inline(dpp, *leftover_chain, tag);
-    }
-  }
-  return 0;
-}
-
-void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
-{
-  RGWObjManifest::obj_iterator iter;
-  rgw_raw_obj raw_head;
-  obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
-  for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) {
-    const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(driver);
-    if (mobj == raw_head)
-      continue;
-    cls_rgw_obj_key key(mobj.oid);
-    chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
-  }
-}
-
-std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
-{
-  if (chain.empty()) {
-    return {0, std::nullopt};
-  }
-
-  return gc->send_split_chain(chain, tag);
-}
-
-void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
-{
-  string last_pool;
-  std::unique_ptr<IoCtx> ctx(new IoCtx);
-  int ret = 0;
-  for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
-    cls_rgw_obj& obj = *liter;
-    if (obj.pool != last_pool) {
-      ctx.reset(new IoCtx);
-      ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx);
-      if (ret < 0) {
-        last_pool = "";
-        ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" <<
-        obj.pool << dendl;
-        continue;
-      }
-      last_pool = obj.pool;
-    }
-    ctx->locator_set_key(obj.loc);
-    const string& oid = obj.key.name; /* just stored raw oid there */
-    ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool <<
-    ":" << obj.key.name << dendl;
-    ObjectWriteOperation op;
-    cls_refcount_put(op, tag, true);
-    ret = ctx->operate(oid, &op);
-    if (ret < 0) {
-      ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
-    }
-  }
-}
-
-static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
-                                 map<RGWObjCategory, RGWStorageStats>& stats)
-{
-  for (const auto& pair : header.stats) {
-    const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
-    const rgw_bucket_category_stats& header_stats = pair.second;
-
-    RGWStorageStats& s = stats[category];
-
-    s.category = category;
-    s.size += header_stats.total_size;
-    s.size_rounded += header_stats.total_size_rounded;
-    s.size_utilized += header_stats.actual_size;
-    s.num_objects += header_stats.num_entries;
-  }
-}
-
-int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
-                                map<RGWObjCategory, RGWStorageStats> *existing_stats,
-                                map<RGWObjCategory, RGWStorageStats> *calculated_stats)
-{
-  RGWSI_RADOS::Pool index_pool;
-
-  // key - bucket index object id
-  // value - bucket index check OP returned result with the given bucket index object (shard)
-  map<int, string> oids;
-
-  int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &oids, nullptr);
-  if (ret < 0) {
-    return ret;
-  }
-
-  // declare and pre-populate
-  map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
-  for (auto& iter : oids) {
-    bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret());
-  }
-
-  ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
-  if (ret < 0) {
-    return ret;
-  }
-
-  // aggregate results (from different shards if there are any)
-  for (const auto& iter : bucket_objs_ret) {
-    accumulate_raw_stats(iter.second.existing_header, *existing_stats);
-    accumulate_raw_stats(iter.second.calculated_header, *calculated_stats);
-  }
-
-  return 0;
-}
-
-int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
-{
-  RGWSI_RADOS::Pool index_pool;
-  map<int, string> bucket_objs;
-
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
-  if (r < 0) {
-    return r;
-  }
-
-  return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
-}
-
-int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
-{
-  RGWSI_RADOS::Pool index_pool;
-  map<int, string> bucket_objs;
-
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-      ": unable to open bucket index, r=" << r << " (" <<
-      cpp_strerror(-r) << ")" << dendl;
-    return r;
-  }
-
-  r = CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-      ": unable to issue set bucket resharding, r=" << r << " (" <<
-      cpp_strerror(-r) << ")" << dendl;
-  }
-  return r;
-}
-
-int RGWRados::defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, optional_yield y)
-{
-  std::string oid, key;
-  get_obj_bucket_and_oid_loc(obj->get_obj(), oid, key);
-  if (!rctx)
-    return 0;
-
-  RGWObjState *state = NULL;
-  RGWObjManifest *manifest = nullptr;
-
-  int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, &manifest, false, y);
-  if (r < 0)
-    return r;
-
-  if (!state->is_atomic) {
-    ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
-    return -EINVAL;
-  }
-
-  string tag;
-
-  if (state->tail_tag.length() > 0) {
-    tag = state->tail_tag.c_str();
-  } else if (state->obj_tag.length() > 0) {
-    tag = state->obj_tag.c_str();
-  } else {
-    ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
-    return -EINVAL;
-  }
-
-  ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl;
-
-  cls_rgw_obj_chain chain;
-  update_gc_chain(dpp, state->obj, *manifest, &chain);
-  return gc->async_defer_chain(tag, chain);
-}
-
-void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
-{
-  list<string> prefixes;
-  prefixes.push_back(RGW_ATTR_OLH_PREFIX);
-  cls_rgw_remove_obj(op, prefixes);
-}
-
-void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
-{
-  cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
-}
-
-void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
-{
-  cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
-}
-
-struct tombstone_entry {
-  ceph::real_time mtime;
-  uint32_t zone_short_id;
-  uint64_t pg_ver;
-
-  tombstone_entry() = default;
-  explicit tombstone_entry(const RGWObjState& state)
-    : mtime(state.mtime), zone_short_id(state.zone_short_id),
-      pg_ver(state.pg_ver) {}
-};
-
-/**
- * Delete an object.
- * bucket: name of the bucket storing the object
- * obj: name of the object to delete
- * Returns: 0 on success, -ERR# otherwise.
- */
-int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp)
-{
-  RGWRados *store = target->get_store();
-  const string& instance = target->get_instance();
-  rgw_obj obj = target->get_obj();
-
-  if (instance == "null") {
-    obj.key.instance.clear();
-  }
-
-  bool explicit_marker_version = (!params.marker_version_id.empty());
-
-  if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
-    if (instance.empty() || explicit_marker_version) {
-      std::unique_ptr<rgw::sal::Object> marker = target->get_target()->clone();
-      marker->clear_instance();
-
-      if (!params.marker_version_id.empty()) {
-        if (params.marker_version_id != "null") {
-          marker->set_instance(params.marker_version_id);
-        }
-      } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
-       marker->gen_rand_obj_instance_name();
-      }
-
-      result.version_id = marker->get_instance();
-      if (result.version_id.empty())
-        result.version_id = "null";
-      result.delete_marker = true;
-
-      struct rgw_bucket_dir_entry_meta meta;
-
-      meta.owner = params.obj_owner.get_id().to_str();
-      meta.owner_display_name = params.obj_owner.get_display_name();
-
-      if (real_clock::is_zero(params.mtime)) {
-        meta.mtime = real_clock::now();
-      } else {
-        meta.mtime = params.mtime;
-      }
-
-      int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker.get(), true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
-      if (r < 0) {
-        return r;
-      }
-    } else {
-      rgw_bucket_dir_entry dirent;
-
-      int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent);
-      if (r < 0) {
-        return r;
-      }
-      result.delete_marker = dirent.is_delete_marker();
-      r = store->unlink_obj_instance(dpp, target->get_bucket_info(), target->get_target(), params.olh_epoch, y, params.zones_trace);
-      if (r < 0) {
-        return r;
-      }
-      result.version_id = instance;
-    }
-
-    BucketShard *bs = nullptr;
-    int r = target->get_bucket_shard(&bs, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl;
-      return r;
-    }
-
-    add_datalog_entry(dpp, store->svc.datalog_rados,
-                      target->get_bucket_info(), bs->shard_id);
-
-    return 0;
-  }
-
-  rgw_rados_ref ref;
-  int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  RGWObjState *state;
-  RGWObjManifest *manifest = nullptr;
-  r = target->get_state(dpp, &state, &manifest, false, y);
-  if (r < 0)
-    return r;
-
-  ObjectWriteOperation op;
-
-  if (!real_clock::is_zero(params.unmod_since)) {
-    struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
-    struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
-    if (!params.high_precision_time) {
-      ctime.tv_nsec = 0;
-      unmod.tv_nsec = 0;
-    }
-
-    ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
-    if (ctime > unmod) {
-      return -ERR_PRECONDITION_FAILED;
-    }
-
-    /* only delete object if mtime is less than or equal to params.unmod_since */
-    store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
-  }
-  uint64_t obj_accounted_size = state->accounted_size;
-
-  if(params.abortmp) {
-    obj_accounted_size = params.parts_accounted_size;
-  }
-
-  if (!real_clock::is_zero(params.expiration_time)) {
-    bufferlist bl;
-    real_time delete_at;
-
-    if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
-      try {
-        auto iter = bl.cbegin();
-        decode(delete_at, iter);
-      } catch (buffer::error& err) {
-        ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
-       return -EIO;
-      }
-
-      if (params.expiration_time != delete_at) {
-        return -ERR_PRECONDITION_FAILED;
-      }
-    } else {
-      return -ERR_PRECONDITION_FAILED;
-    }
-  }
-
-  if (!state->exists) {
-    target->invalidate_state();
-    return -ENOENT;
-  }
-
-  r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y);
-  if (r < 0)
-    return r;
-
-  RGWBucketInfo& bucket_info = target->get_bucket_info();
-
-  RGWRados::Bucket bop(store, bucket_info);
-  RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
-  
-  index_op.set_zones_trace(params.zones_trace);
-  index_op.set_bilog_flags(params.bilog_flags);
-
-  r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y);
-  if (r < 0)
-    return r;
-
-  store->remove_rgw_head_obj(op);
-
-  auto& ioctx = ref.pool.ioctx();
-  r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y);
-
-  /* raced with another operation, object state is indeterminate */
-  const bool need_invalidate = (r == -ECANCELED);
-
-  int64_t poolid = ioctx.get_id();
-  if (r >= 0) {
-    tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
-    if (obj_tombstone_cache) {
-      tombstone_entry entry{*state};
-      obj_tombstone_cache->add(obj, entry);
-    }
-    r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs);
-    
-    int ret = target->complete_atomic_modification(dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
-    }
-    /* other than that, no need to propagate error */
-  } else {
-    int ret = index_op.cancel(dpp, params.remove_objs);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
-    }
-  }
-
-  if (need_invalidate) {
-    target->invalidate_state();
-  }
-
-  if (r < 0)
-    return r;
-
-  /* update quota cache */
-  store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
-
-  return 0;
-}
-
-int RGWRados::delete_obj(rgw::sal::Driver* store,
-                        const DoutPrefixProvider *dpp,
-                         const RGWBucketInfo& bucket_info,
-                         const rgw_obj& obj,
-                         int versioning_status, // versioning flags defined in enum RGWBucketFlags
-                         uint16_t bilog_flags,
-                         const real_time& expiration_time,
-                         rgw_zone_set *zones_trace)
-{
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  store->get_bucket(nullptr, bucket_info, &bucket);
-  std::unique_ptr<rgw::sal::Object> object = bucket->get_object(obj.key);
-
-  return delete_obj(dpp, bucket_info, object.get(), versioning_status,
-                   bilog_flags, expiration_time, zones_trace);
-}
-
-int RGWRados::delete_obj(const DoutPrefixProvider *dpp,
-                         const RGWBucketInfo& bucket_info,
-                         rgw::sal::Object* obj,
-                         int versioning_status, // versioning flags defined in enum RGWBucketFlags
-                         uint16_t bilog_flags,
-                         const real_time& expiration_time,
-                         rgw_zone_set *zones_trace)
-{
-  std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = obj->get_delete_op();
-
-  del_op->params.bucket_owner = bucket_info.owner;
-  del_op->params.versioning_status = versioning_status;
-  del_op->params.bilog_flags = bilog_flags;
-  del_op->params.expiration_time = expiration_time;
-  del_op->params.zones_trace = zones_trace;
-
-  return del_op->delete_obj(dpp, null_yield);
-}
-
-int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
-{
-  rgw_rados_ref ref;
-  int r = get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  ObjectWriteOperation op;
-
-  op.remove();
-  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
-  if (r < 0)
-    return r;
-
-  return 0;
-}
-
-int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp)
-{
-  std::string oid, key;
-  get_obj_bucket_and_oid_loc(obj, oid, key);
-
-  RGWBucketInfo bucket_info;
-  int ret = get_bucket_instance_info(obj.bucket, bucket_info, NULL, NULL, null_yield, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  RGWRados::Bucket bop(this, bucket_info);
-  RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
-
-  return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, NULL);
-}
-
-static void generate_fake_tag(const DoutPrefixProvider *dpp, rgw::sal::Driver* store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
-{
-  string tag;
-
-  RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp);
-  if (mi != manifest.obj_end(dpp)) {
-    if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
-      ++mi;
-    rgw::sal::RadosStore* rstore = dynamic_cast<rgw::sal::RadosStore*>(store);
-    tag = mi.get_location().get_raw_obj(rstore).oid;
-    tag.append("_");
-  }
-
-  unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
-  char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
-  MD5 hash;
-  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
-  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
-  hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
-
-  map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
-  if (iter != attrset.end()) {
-    bufferlist& bl = iter->second;
-    hash.Update((const unsigned char *)bl.c_str(), bl.length());
-  }
-
-  hash.Final(md5);
-  buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
-  tag.append(md5_str);
-
-  ldpp_dout(dpp, 10) << "generate_fake_tag new tag=" << tag << dendl;
-
-  tag_bl.append(tag.c_str(), tag.size() + 1);
-}
-
-static bool is_olh(map<string, bufferlist>& attrs)
-{
-  map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
-  return (iter != attrs.end());
-}
-
-static bool has_olh_tag(map<string, bufferlist>& attrs)
-{
-  map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
-  return (iter != attrs.end());
-}
-
-int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx&
-                                  obj_ctx, RGWBucketInfo& bucket_info,
-                                  rgw::sal::Object* obj, RGWObjState *olh_state,
-                                  RGWObjState **target_state,
-                                  RGWObjManifest **target_manifest, optional_yield y)
-{
-  ceph_assert(olh_state->is_olh);
-
-  rgw_obj target;
-  int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
-  if (r < 0) {
-    return r;
-  }
-
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  driver->get_bucket(nullptr, bucket_info, &bucket);
-  std::unique_ptr<rgw::sal::Object> target_obj = bucket->get_object(target.key);
-
-  r = get_obj_state(dpp, &obj_ctx, bucket_info, target_obj.get(), target_state,
-                   target_manifest, false, y);
-  if (r < 0) {
-    return r;
-  }
-
-  return 0;
-}
-
-int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx,
-                                RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
-                                 RGWObjState **state, RGWObjManifest** manifest,
-                                bool follow_olh, optional_yield y, bool assume_noent)
-{
-  if (obj->empty()) {
-    return -EINVAL;
-  }
-
-  bool need_follow_olh = follow_olh && obj->get_obj().key.instance.empty();
-  *manifest = nullptr;
-
-  RGWObjStateManifest *sm = rctx->get_state(obj->get_obj());
-  RGWObjState *s = &(sm->state);
-  ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
-  *state = s;
-  if (sm->manifest) {
-    *manifest = &(*sm->manifest);
-  }
-  if (s->has_attrs) {
-    if (s->is_olh && need_follow_olh) {
-      return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
-    }
-    return 0;
-  }
-
-  s->obj = obj->get_obj();
-
-  rgw_raw_obj raw_obj;
-  obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &raw_obj);
-
-  int r = -ENOENT;
-
-  if (!assume_noent) {
-    r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
-  }
-
-  if (r == -ENOENT) {
-    s->exists = false;
-    s->has_attrs = true;
-    tombstone_entry entry;
-    if (obj_tombstone_cache && obj_tombstone_cache->find(obj->get_obj(), entry)) {
-      s->mtime = entry.mtime;
-      s->zone_short_id = entry.zone_short_id;
-      s->pg_ver = entry.pg_ver;
-      ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
-          << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
-    } else {
-      s->mtime = real_time();
-    }
-    return 0;
-  }
-  if (r < 0)
-    return r;
-
-  s->exists = true;
-  s->has_attrs = true;
-  s->accounted_size = s->size;
-
-  auto iter = s->attrset.find(RGW_ATTR_ETAG);
-  if (iter != s->attrset.end()) {
-    /* get rid of extra null character at the end of the etag, as we used to store it like that */
-    bufferlist& bletag = iter->second;
-    if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
-      bufferlist newbl;
-      bletag.splice(0, bletag.length() - 1, &newbl);
-      bletag = std::move(newbl);
-    }
-  }
-
-  iter = s->attrset.find(RGW_ATTR_COMPRESSION);
-  const bool compressed = (iter != s->attrset.end());
-  if (compressed) {
-    // use uncompressed size for accounted_size
-    try {
-      RGWCompressionInfo info;
-      auto p = iter->second.cbegin();
-      decode(info, p);
-      s->accounted_size = info.orig_size; 
-    } catch (buffer::error&) {
-      ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl;
-      return -EIO;
-    }
-  }
-
-  iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
-  if (iter != s->attrset.end()) {
-    bufferlist bl = iter->second;
-    bufferlist::iterator it = bl.begin();
-    it.copy(bl.length(), s->shadow_obj);
-    s->shadow_obj[bl.length()] = '\0';
-  }
-  s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
-  auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
-  if (ttiter != s->attrset.end()) {
-    s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
-  }
-
-  bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
-  if (manifest_bl.length()) {
-    auto miter = manifest_bl.cbegin();
-    try {
-      sm->manifest.emplace();
-      decode(*sm->manifest, miter);
-      sm->manifest->set_head(bucket_info.placement_rule, obj->get_obj(), s->size); /* patch manifest to reflect the head we just read, some manifests might be
-                                             broken due to old bugs */
-      s->size = sm->manifest->get_obj_size();
-      if (!compressed)
-        s->accounted_size = s->size;
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
-      return -EIO;
-    }
-    *manifest = &(*sm->manifest);
-    ldpp_dout(dpp, 10) << "manifest: total_size = " << sm->manifest->get_obj_size() << dendl;
-    if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
-       sm->manifest->has_explicit_objs()) {
-      RGWObjManifest::obj_iterator mi;
-      for (mi = sm->manifest->obj_begin(dpp); mi != sm->manifest->obj_end(dpp); ++mi) {
-        ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(driver) << dendl;
-      }
-    }
-
-    if (!s->obj_tag.length()) {
-      /*
-       * Uh oh, something's wrong, object with manifest should have tag. Let's
-       * create one out of the manifest, would be unique
-       */
-      generate_fake_tag(dpp, driver, s->attrset, *sm->manifest, manifest_bl, s->obj_tag);
-      s->fake_tag = true;
-    }
-  }
-  map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
-  if (aiter != s->attrset.end()) {
-    bufferlist& pg_ver_bl = aiter->second;
-    if (pg_ver_bl.length()) {
-      auto pgbl = pg_ver_bl.cbegin();
-      try {
-        decode(s->pg_ver, pgbl);
-      } catch (buffer::error& err) {
-        ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
-      }
-    }
-  }
-  aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
-  if (aiter != s->attrset.end()) {
-    bufferlist& zone_short_id_bl = aiter->second;
-    if (zone_short_id_bl.length()) {
-      auto zbl = zone_short_id_bl.cbegin();
-      try {
-        decode(s->zone_short_id, zbl);
-      } catch (buffer::error& err) {
-        ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
-      }
-    }
-  }
-  if (s->obj_tag.length()) {
-    ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
-  } else {
-    ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
-  }
-
-  /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
-   * it exist, and not only if is_olh() returns true
-   */
-  iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
-  if (iter != s->attrset.end()) {
-    s->olh_tag = iter->second;
-  }
-
-  if (is_olh(s->attrset)) {
-    s->is_olh = true;
-
-    ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
-
-    if (need_follow_olh) {
-      return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
-    } else if (obj->get_obj().key.have_null_instance() && !sm->manifest) {
-      // read null version, and the head object only have olh info
-      s->exists = false;
-      return -ENOENT;
-    }
-  }
-
-  return 0;
-}
-
-int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
-                            bool follow_olh, optional_yield y, bool assume_noent)
-{
-  int ret;
-
-  do {
-    ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, manifest, follow_olh, y, assume_noent);
-  } while (ret == -EAGAIN);
-
-  return ret;
-}
-
-int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y)
-{
-  RGWObjState *astate;
-  int r = get_state(dpp, &astate, pmanifest, true, y);
-  if (r < 0) {
-    return r;
-  }
-
-  return 0;
-}
-
-int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y)
-{
-  RGWObjState *state;
-  RGWObjManifest *manifest = nullptr;
-  int r = source->get_state(dpp, &state, &manifest, true, y);
-  if (r < 0)
-    return r;
-  if (!state->exists)
-    return -ENOENT;
-  if (!state->get_attr(name, dest))
-    return -ENODATA;
-
-  return 0;
-}
-
-int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp)
-{
-  rgw::sal::Object* target = source->get_target(); 
-  rgw_obj obj = target->get_obj();
-  RGWRados *store = source->get_store();
-
-  result.obj = obj;
-  if (target->has_attrs()) {
-    state.ret = 0;
-    result.size = target->get_obj_size();
-    result.mtime = ceph::real_clock::to_timespec(target->get_mtime());
-    result.attrs = target->get_attrs();
-    //result.manifest = sm->manifest;
-    return 0;
-  }
-
-  string oid;
-  string loc;
-  get_obj_bucket_and_oid_loc(obj, oid, loc);
-
-  int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx);
-  if (r < 0) {
-    return r;
-  }
-
-  librados::ObjectReadOperation op;
-  op.stat2(&result.size, &result.mtime, NULL);
-  op.getxattrs(&result.attrs, NULL);
-  state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
-  state.io_ctx.locator_set_key(loc);
-  r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
-  if (r < 0) {
-    ldpp_dout(dpp, 5) << __func__
-                                                  << ": ERROR: aio_operate() returned ret=" << r
-                                                  << dendl;
-    return r;
-  }
-
-  return 0;
-}
-
-
-int RGWRados::Object::Stat::wait(const DoutPrefixProvider *dpp)
-{
-  if (!state.completion) {
-    return state.ret;
-  }
-
-  state.completion->wait_for_complete();
-  state.ret = state.completion->get_return_value();
-  state.completion->release();
-
-  if (state.ret != 0) {
-    return state.ret;
-  }
-
-  return finish(dpp);
-}
-
-int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp)
-{
-  map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
-  if (iter != result.attrs.end()) {
-    bufferlist& bl = iter->second;
-    auto biter = bl.cbegin();
-    try {
-      result.manifest.emplace();
-      decode(*result.manifest, biter);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": failed to decode manifest"  << dendl;
-      return -EIO;
-    }
-  }
-
-  return 0;
-}
-
-int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
-                                 RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
-                                 ObjectOperation& op, RGWObjState **pstate,
-                                RGWObjManifest** pmanifest, optional_yield y)
-{
-  int r = obj->get_obj_state(dpp, pstate, y, false);
-  if (r < 0)
-    return r;
-
-  return append_atomic_test(dpp, *pstate, op);
-}
-
-// Append a guard to `op` asserting the object's ID tag is unchanged, so a
-// racing overwrite of an atomic object makes the operation fail instead of
-// silently applying to the wrong generation. No-op for non-atomic state,
-// for empty tags, and for fake tags (backward compatibility with objects
-// written before tags existed).
-int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
-                                 const RGWObjState* state,
-                                 librados::ObjectOperation& op)
-{
-  if (!state->is_atomic) {
-    ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
-    return 0;
-  }
-
-  if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
-    op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
-  } else {
-    ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
-  }
-  return 0;
-}
-
-// Load the object's state and manifest pointers from the SAL object.
-// `follow_olh` is forwarded to get_obj_state; `assume_noent` is accepted
-// for interface compatibility but unused here.
-int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent)
-{
-  int r = obj->get_obj_state(dpp, pstate, y, follow_olh);
-  if (r < 0) {
-    return r;
-  }
-  *pmanifest = static_cast<rgw::sal::RadosObject*>(obj)->get_manifest();
-
-  return r;
-}
-
-// Drop the cached object state so the next access re-reads it from rados.
-void RGWRados::Object::invalidate_state()
-{
-  obj->invalidate();
-}
-
-// Prepare `op` for an atomic write/removal of this object:
-//  - guard against concurrent overwrites via an ID-tag cmpxattr,
-//  - evaluate HTTP-style if-match / if-nomatch ETag preconditions,
-//  - optionally reset (recreate) the head object,
-//  - for non-removal ops, generate and set a fresh write tag
-//    (RGW_ATTR_ID_TAG, and RGW_ATTR_TAIL_TAG when modify_tail).
-// Returns 0 on success, -ERR_PRECONDITION_FAILED when a condition fails,
-// or a negative error from state retrieval.
-int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp,
-                                                  ObjectWriteOperation& op, bool reset_obj, const string *ptag,
-                                                  const char *if_match, const char *if_nomatch, bool removal_op,
-                                                  bool modify_tail, optional_yield y)
-{
-  int r = get_state(dpp, &state, &manifest, false, y);
-  if (r < 0)
-    return r;
-
-  // a guard is needed when there is existing content to protect (manifest
-  // or a real obj tag) or when the caller supplied preconditions
-  bool need_guard = ((manifest) || (state->obj_tag.length() != 0) ||
-                     if_match != NULL || if_nomatch != NULL) &&
-                     (!state->fake_tag);
-
-  if (!state->is_atomic) {
-    ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
-
-    if (reset_obj) {
-      op.create(false);
-      store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
-    }
-
-    return 0;
-  }
-
-  if (need_guard) {
-    /* first verify that the object wasn't replaced under */
-    // skip the tag check for if-nomatch:"*" -- that case only cares about
-    // existence, which is tested below
-    if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
-      op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag); 
-      // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
-    }
-
-    if (if_match) {
-      if (strcmp(if_match, "*") == 0) {
-        // test the object is existing
-        if (!state->exists) {
-          return -ERR_PRECONDITION_FAILED;
-        }
-      } else {
-        bufferlist bl;
-        if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
-            strncmp(if_match, bl.c_str(), bl.length()) != 0) {
-          return -ERR_PRECONDITION_FAILED;
-        }
-      }
-    }
-
-    if (if_nomatch) {
-      if (strcmp(if_nomatch, "*") == 0) {
-        // test the object is NOT existing
-        if (state->exists) {
-          return -ERR_PRECONDITION_FAILED;
-        }
-      } else {
-        bufferlist bl;
-        if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
-            strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
-          return -ERR_PRECONDITION_FAILED;
-        }
-      }
-    }
-  }
-
-  if (reset_obj) {
-    if (state->exists) {
-      op.create(false);
-      store->remove_rgw_head_obj(op);
-    } else {
-      op.create(true);
-    }
-  }
-
-  if (removal_op) {
-    /* the object is being removed, no need to update its tag */
-    return 0;
-  }
-
-  // pick the caller's tag or generate a random 32-char one
-  if (ptag) {
-    state->write_tag = *ptag;
-  } else {
-    append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
-  }
-  bufferlist bl;
-  // +1 keeps the trailing NUL in the stored tag
-  bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
-
-  ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl;
-
-  op.setxattr(RGW_ATTR_ID_TAG, bl);
-  if (modify_tail) {
-    op.setxattr(RGW_ATTR_TAIL_TAG, bl);
-  }
-
-  return 0;
-}
-
-/**
- * Set an attr on an object.
- * bucket: name of the bucket holding the object
- * obj: name of the object to set the attr on
- * name: the attr to set
- * bl: the contents of the attr
- * Returns: 0 on success, -ERR# otherwise.
- *
- * Convenience wrapper: delegates to set_attrs() with a single-entry map,
- * no removals, and null_yield (blocking).
- */
-int RGWRados::set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl)
-{
-  map<string, bufferlist> attrs;
-  attrs[name] = bl;
-  return set_attrs(dpp, rctx, bucket_info, obj, attrs, NULL, null_yield);
-}
-
-// Set/remove xattrs on an object's head, keeping the bucket index in sync.
-// Flow: build one ObjectWriteOperation carrying the rm/set xattrs plus an
-// atomicity guard; for atomic objects, wrap it in an index prepare/complete
-// (or cancel on failure) transaction so listings stay consistent; finally
-// mirror the new attrs into the cached RGWObjState.
-// A RGW_ATTR_DELETE_AT attr additionally registers an expiration hint.
-int RGWRados::set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* src_obj,
-                        map<string, bufferlist>& attrs,
-                        map<string, bufferlist>* rmattrs,
-                        optional_yield y)
-{
-  // work on a clone; a "null" version instance maps to the plain object
-  std::unique_ptr<rgw::sal::Object> obj = src_obj->clone();
-  if (obj->get_instance() == "null") {
-    obj->clear_instance();
-  }
-
-  rgw_rados_ref ref;
-  int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  ObjectWriteOperation op;
-  RGWObjState *state = NULL;
-  RGWObjManifest *manifest = nullptr;
-
-  r = append_atomic_test(dpp, bucket_info, obj.get(), op, &state, &manifest, y);
-  if (r < 0)
-    return r;
-
-  // ensure null version object exist
-  if (src_obj->get_instance() == "null" && !manifest) {
-    return -ENOENT;
-  }
-
-  map<string, bufferlist>::iterator iter;
-  if (rmattrs) {
-    for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
-      const string& name = iter->first;
-      op.rmxattr(name.c_str());
-    }
-  }
-
-  const rgw_bucket& bucket = obj->get_bucket()->get_key();
-
-  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
-    const string& name = iter->first;
-    bufferlist& bl = iter->second;
-
-    // empty values are skipped, not stored as empty xattrs
-    if (!bl.length())
-      continue;
-
-    op.setxattr(name.c_str(), bl);
-
-    // delete-at attr also feeds the object expirer's hint list
-    if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
-      real_time ts;
-      try {
-        decode(ts, bl);
-
-        rgw_obj_index_key obj_key;
-        obj->get_key().get_index_key(&obj_key);
-
-        obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
-      } catch (buffer::error& err) {
-       ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
-      }
-    }
-  }
-
-  // nothing to change (all values empty, no removals) -- succeed as a no-op
-  if (!op.size())
-    return 0;
-
-  bufferlist bl;
-  RGWRados::Bucket bop(this, bucket_info);
-  RGWRados::Bucket::UpdateIndex index_op(&bop, obj->get_obj());
-
-  // atomic object: open an index transaction with a fresh write tag
-  if (state) {
-    string tag;
-    append_rand_alpha(cct, tag, tag, 32);
-    state->write_tag = tag;
-    r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
-
-    if (r < 0)
-      return r;
-
-    bl.append(tag.c_str(), tag.size() + 1);
-    op.setxattr(RGW_ATTR_ID_TAG,  bl);
-  }
-
-
-  real_time mtime = real_clock::now();
-  struct timespec mtime_ts = real_clock::to_timespec(mtime);
-  op.mtime2(&mtime_ts);
-  auto& ioctx = ref.pool.ioctx();
-  r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
-  if (state) {
-    // close the index transaction: complete on success, cancel on failure
-    if (r >= 0) {
-      bufferlist acl_bl = attrs[RGW_ATTR_ACL];
-      bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
-      bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
-      string etag = rgw_bl_str(etag_bl);
-      string content_type = rgw_bl_str(content_type_bl);
-      string storage_class;
-      auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
-      if (iter != attrs.end()) {
-        storage_class = rgw_bl_str(iter->second);
-      }
-      uint64_t epoch = ioctx.get_last_version();
-      int64_t poolid = ioctx.get_id();
-      r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size,
-                            mtime, etag, content_type, storage_class, &acl_bl,
-                            RGWObjCategory::Main, NULL);
-    } else {
-      int ret = index_op.cancel(dpp, nullptr);
-      if (ret < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
-      }
-    }
-  }
-  if (r < 0)
-    return r;
-
-  // mirror the applied changes into the cached state
-  if (state) {
-    state->obj_tag.swap(bl);
-    if (rmattrs) {
-      for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
-        state->attrset.erase(iter->first);
-      }
-    }
-
-    for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
-      state->attrset[iter->first] = iter->second;
-    }
-
-    auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
-    if (iter != state->attrset.end()) {
-      iter->second = state->obj_tag;
-    }
-  }
-
-  return 0;
-}
-
-// Prepare a read: resolve the object's state (following OLH), locate the
-// head object and its io context, export requested metadata (target obj,
-// attrs, size, mtime), and evaluate the conditional-read parameters
-// (If-(Un)Modified-Since via obj_time_weight, If-(No)Match via ETag).
-// Returns -ENOENT for missing objects, -ERR_NOT_MODIFIED /
-// -ERR_PRECONDITION_FAILED for failed conditions, 0 otherwise.
-int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp)
-{
-  RGWRados *store = source->get_store();
-  CephContext *cct = store->ctx();
-
-  bufferlist etag;
-
-  map<string, bufferlist>::iterator iter;
-
-  RGWObjState *astate;
-  RGWObjManifest *manifest = nullptr;
-  int r = source->get_state(dpp, &astate, &manifest, true, y);
-  if (r < 0)
-    return r;
-
-  if (!astate->exists) {
-    return -ENOENT;
-  }
-
-  const RGWBucketInfo& bucket_info = source->get_bucket_info();
-
-  state.obj = astate->obj;
-  store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
-
-  state.cur_pool = state.head_obj.pool;
-  state.cur_ioctx = &state.io_ctxs[state.cur_pool];
-
-  r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx);
-  if (r < 0) {
-    return r;
-  }
-  if (params.target_obj) {
-    *params.target_obj = state.obj;
-  }
-  if (params.attrs) {
-    *params.attrs = astate->attrset;
-    if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
-      for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
-        ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
-      }
-    }
-  }
-
-  /* Convert all times go GMT to make them compatible */
-  if (conds.mod_ptr || conds.unmod_ptr) {
-    obj_time_weight src_weight;
-    src_weight.init(astate);
-    src_weight.high_precision = conds.high_precision_time;
-
-    obj_time_weight dest_weight;
-    dest_weight.high_precision = conds.high_precision_time;
-
-    // if-nomatch takes precedence over if-modified-since
-    if (conds.mod_ptr && !conds.if_nomatch) {
-      dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
-      ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
-      if (!(dest_weight < src_weight)) {
-        return -ERR_NOT_MODIFIED;
-      }
-    }
-
-    // if-match takes precedence over if-unmodified-since
-    if (conds.unmod_ptr && !conds.if_match) {
-      dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
-      ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
-      if (dest_weight < src_weight) {
-        return -ERR_PRECONDITION_FAILED;
-      }
-    }
-  }
-  if (conds.if_match || conds.if_nomatch) {
-    r = get_attr(dpp, RGW_ATTR_ETAG, etag, y);
-    if (r < 0)
-      return r;
-
-    if (conds.if_match) {
-      string if_match_str = rgw_string_unquote(conds.if_match);
-      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
-      if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
-        return -ERR_PRECONDITION_FAILED;
-      }
-    }
-
-    if (conds.if_nomatch) {
-      string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
-      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
-      if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
-        return -ERR_NOT_MODIFIED;
-      }
-    }
-  }
-
-  if (params.obj_size)
-    *params.obj_size = astate->size;
-  if (params.lastmod)
-    *params.lastmod = astate->mtime;
-
-  return 0;
-}
-
-// Normalize an HTTP-style byte range against the object size:
-//  - negative ofs means a suffix range ("last -ofs bytes"), clamped to 0
-//    and extending to the end of the object;
-//  - negative end means "through end of object";
-//  - a start at/after the object size is -ERANGE; an end past it is clamped.
-// ofs/end are updated in place; returns 0 or -ERANGE.
-int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
-{
-  if (ofs < 0) {
-    ofs += obj_size;
-    if (ofs < 0)
-      ofs = 0;
-    end = obj_size - 1;
-  } else if (end < 0) {
-    end = obj_size - 1;
-  }
-
-  if (obj_size > 0) {
-    if (ofs >= (off_t)obj_size) {
-      return -ERANGE;
-    }
-    if (end >= (off_t)obj_size) {
-      end = obj_size - 1;
-    }
-  }
-  return 0;
-}
-
-// Run `call` against this object's bucket index shard, transparently
-// retrying across bucket resharding: when the index op returns
-// -ERR_BUSY_RESHARDING, wait via block_while_resharding(), invalidate the
-// cached shard, and retry with a reset attempt counter. On success,
-// optionally hand the resolved shard back through *pbs.
-int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call)
-{
-  RGWRados *store = target->get_store();
-  BucketShard *bs = nullptr;
-  int r;
-
-#define NUM_RESHARD_RETRIES 10
-  for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
-    int ret = get_bucket_shard(&bs, dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to get BucketShard object. obj=" << 
-        obj_instance.key << ". ret=" << ret << dendl;
-      return ret;
-    }
-
-    r = call(bs);
-    if (r != -ERR_BUSY_RESHARDING) {
-      break;
-    }
-
-    ldpp_dout(dpp, 10) <<
-      "NOTICE: resharding operation on bucket index detected, blocking. obj=" << 
-      obj_instance.key << dendl;
-
-    r = store->block_while_resharding(bs, obj_instance, target->bucket_info, null_yield, dpp);
-    if (r == -ERR_BUSY_RESHARDING) {
-      ldpp_dout(dpp, 10) << __func__ <<
-       " NOTICE: block_while_resharding() still busy. obj=" <<
-        obj_instance.key << dendl;
-      continue;
-    } else if (r < 0) {
-      ldpp_dout(dpp, 0) << __func__ <<
-       " ERROR: block_while_resharding() failed. obj=" <<
-        obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-
-    ldpp_dout(dpp, 20) << "reshard completion identified. obj=" << obj_instance.key << dendl;
-    i = 0; /* resharding is finished, make sure we can retry */
-    invalidate_bs();
-  } // for loop
-
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" << 
-      obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  if (pbs) {
-    *pbs = bs;
-  }
-
-  return 0;
-}
-
-// Phase 1 of the two-phase bucket-index update: record a pending `op`
-// (keyed by an operation tag) on the object's index shard, guarded
-// against resharding. A caller-supplied write tag is reused as the op
-// tag; otherwise a random one is generated once and kept for the later
-// complete()/cancel(). No-op for blind (indexless) buckets.
-int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y)
-{
-  if (blind) {
-    return 0;
-  }
-  RGWRados *store = target->get_store();
-
-  if (write_tag && write_tag->length()) {
-    optag = string(write_tag->c_str(), write_tag->length());
-  } else {
-    if (optag.empty()) {
-      append_rand_alpha(store->ctx(), optag, optag, 32);
-    }
-  }
-
-  int r = guard_reshard(dpp, obj, nullptr, [&](BucketShard *bs) -> int {
-                                  return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace);
-                                });
-
-  if (r < 0) {
-    return r;
-  }
-  prepared = true;
-
-  return 0;
-}
-
-// Phase 2 (success path) of the index update: build the directory entry
-// (size, mtime, etag, owner from the decoded ACL, etc.) and mark the
-// pending op identified by `optag` complete on the shard, then record a
-// datalog entry for multisite sync. No-op for blind buckets.
-// Note: unlike prepare(), this path does not retry across resharding.
-int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch,
-                                            uint64_t size, uint64_t accounted_size,
-                                            ceph::real_time& ut, const string& etag,
-                                            const string& content_type, const string& storage_class,
-                                            bufferlist *acl_bl,
-                                            RGWObjCategory category,
-                                            list<rgw_obj_index_key> *remove_objs, const string *user_data,
-                                            bool appendable)
-{
-  if (blind) {
-    return 0;
-  }
-  RGWRados *store = target->get_store();
-  BucketShard *bs = nullptr;
-
-  int ret = get_bucket_shard(&bs, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
-    return ret;
-  }
-
-  rgw_bucket_dir_entry ent;
-  obj.key.get_index_key(&ent.key);
-  ent.meta.size = size;
-  ent.meta.accounted_size = accounted_size;
-  ent.meta.mtime = ut;
-  ent.meta.etag = etag;
-  ent.meta.storage_class = storage_class;
-  if (user_data)
-    ent.meta.user_data = *user_data;
-
-  ACLOwner owner;
-  // a bad ACL only logs a warning; the entry keeps default owner fields
-  if (acl_bl && acl_bl->length()) {
-    int ret = store->decode_policy(dpp, *acl_bl, &owner);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl;
-    }
-  }
-  ent.meta.owner = owner.get_id().to_str();
-  ent.meta.owner_display_name = owner.get_display_name();
-  ent.meta.content_type = content_type;
-  ent.meta.appendable = appendable;
-
-  ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
-
-  add_datalog_entry(dpp, store->svc.datalog_rados,
-                    target->bucket_info, bs->shard_id);
-
-  return ret;
-}
-
-// Phase 2 (deletion path) of the index update: mark the pending op
-// identified by `optag` as a completed delete on the shard and record a
-// datalog entry. No-op for blind buckets.
-int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp,
-                                                int64_t poolid, uint64_t epoch,
-                                                real_time& removed_mtime,
-                                                list<rgw_obj_index_key> *remove_objs)
-{
-  if (blind) {
-    return 0;
-  }
-  RGWRados *store = target->get_store();
-  BucketShard *bs = nullptr;
-
-  int ret = get_bucket_shard(&bs, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
-    return ret;
-  }
-
-  ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
-
-  add_datalog_entry(dpp, store->svc.datalog_rados,
-                    target->bucket_info, bs->shard_id);
-
-  return ret;
-}
-
-
-int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp,
-                                          list<rgw_obj_index_key> *remove_objs)
-{
-  if (blind) {
-    return 0;
-  }
-  RGWRados *store = target->get_store();
-  BucketShard *bs;
-
-  int ret = guard_reshard(dpp, obj, &bs, [&](BucketShard *bs) -> int {
-                                return store->cls_obj_complete_cancel(*bs, optag, obj, remove_objs, bilog_flags, zones_trace);
-                              });
-
-  /*
-   * need to update data log anyhow, so that whoever follows needs to update its internal markers
-   * for following the specific bucket shard log. Otherwise they end up staying behind, and users
-   * have no way to tell that they're all caught up
-   */
-  add_datalog_entry(dpp, store->svc.datalog_rados,
-                    target->bucket_info, bs->shard_id);
-
-  return ret;
-}
-
-/*
- * Read up through index `end` inclusive. Number of bytes read is up
- * to `end - ofs + 1`.
- *
- * Resolves which raw object (head or a manifest stripe) backs the
- * requested range, clamps the request to the stripe and to the pool's
- * max chunk size, serves any prefix available from prefetched head data,
- * and issues a synchronous rados read for the remainder (appending an
- * atomic-test guard when reading from the head object).
- * Returns the number of bytes appended to `bl`, or a negative error.
- */
-int RGWRados::Object::Read::read(int64_t ofs, int64_t end,
-                                bufferlist& bl, optional_yield y,
-                                const DoutPrefixProvider *dpp)
-{
-  RGWRados *store = source->get_store();
-
-  rgw_raw_obj read_obj;
-  uint64_t read_ofs = ofs;
-  uint64_t len, read_len;
-  bool reading_from_head = true;
-  ObjectReadOperation op;
-
-  bool merge_bl = false;
-  bufferlist *pbl = &bl;
-  bufferlist read_bl;
-  uint64_t max_chunk_size;
-
-  RGWObjState *astate;
-  RGWObjManifest *manifest = nullptr;
-  int r = source->get_state(dpp, &astate, &manifest, true, y);
-  if (r < 0)
-    return r;
-
-  // clamp the requested range to the object's actual size
-  if (astate->size == 0) {
-    end = 0;
-  } else if (end >= (int64_t)astate->size) {
-    end = astate->size - 1;
-  }
-
-  if (end < 0)
-    len = 0;
-  else
-    len = end - ofs + 1;
-
-  if (manifest && manifest->has_tail()) {
-    /* now get the relevant object part */
-    RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
-
-    uint64_t stripe_ofs = iter.get_stripe_ofs();
-    read_obj = iter.get_location().get_raw_obj(store->driver);
-    // don't read past the current stripe
-    len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
-    read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
-    reading_from_head = (read_obj == state.head_obj);
-  } else {
-    read_obj = state.head_obj;
-  }
-
-  r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
-    return r;
-  }
-
-  if (len > max_chunk_size)
-    len = max_chunk_size;
-
-
-  read_len = len;
-
-  if (reading_from_head) {
-    /* only when reading from the head object do we need to do the atomic test */
-    std::unique_ptr<rgw::sal::Object> obj = source->bucket->get_object(state.obj.key);
-    r = store->append_atomic_test(dpp, source->get_bucket_info(), obj.get(), op, &astate, &manifest, y);
-    if (r < 0)
-      return r;
-
-    // serve what we can from data prefetched into the obj state
-    if (astate && astate->prefetch_data) {
-      if (!ofs && astate->data.length() >= len) {
-        bl = astate->data;
-        return bl.length();
-      }
-
-      if (ofs < astate->data.length()) {
-        unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
-        astate->data.begin(ofs).copy(copy_len, bl);
-        read_len -= copy_len;
-        read_ofs += copy_len;
-        if (!read_len)
-         return bl.length();
-
-        // remainder goes to a side buffer, appended after the rados read
-        merge_bl = true;
-        pbl = &read_bl;
-      }
-    }
-  }
-
-  ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
-  op.read(read_ofs, read_len, pbl, NULL);
-
-  // switch io context when the stripe lives in a different pool,
-  // caching one ioctx per pool
-  if (state.cur_pool != read_obj.pool) {
-    auto iter = state.io_ctxs.find(read_obj.pool);
-    if (iter == state.io_ctxs.end()) {
-      state.cur_ioctx = &state.io_ctxs[read_obj.pool];
-      r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false);
-      if (r < 0) {
-        ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
-        return r;
-      }
-    } else {
-      state.cur_ioctx = &iter->second;
-    }
-    state.cur_pool = read_obj.pool;
-  }
-
-  state.cur_ioctx->locator_set_key(read_obj.loc);
-
-  r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
-  ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
-
-  if (r < 0) {
-    return r;
-  }
-
-  if (merge_bl) {
-    bl.append(read_bl);
-  }
-
-  return bl.length();
-}
-
-// Deliver completed AIO read results to the client callback in object
-// order: merge the new completions (sorted by logical offset id) into the
-// pending list, then drain every entry that is contiguous with the current
-// offset, handing its data to client_cb and optionally writing it into the
-// D3N data cache. Returns the first error from the results or callback.
-int get_obj_data::flush(rgw::AioResultList&& results) {
-  int r = rgw::check_for_errors(results);
-  if (r < 0) {
-    return r;
-  }
-  std::list<bufferlist> bl_list;
-
-  auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
-  results.sort(cmp); // merge() requires results to be sorted first
-  completed.merge(results, cmp); // merge results in sorted order
-
-  // only flush entries contiguous with what was already delivered
-  while (!completed.empty() && completed.front().id == offset) {
-    auto bl = std::move(completed.front().data);
-
-    bl_list.push_back(bl);
-    offset += bl.length();
-    int r = client_cb->handle_data(bl, 0, bl.length());
-    if (r < 0) {
-      return r;
-    }
-
-    if (rgwrados->get_use_datacache()) {
-      const std::lock_guard l(d3n_get_data.d3n_lock);
-      auto oid = completed.front().obj.get_ref().obj.oid;
-      if (bl.length() <= g_conf()->rgw_get_obj_max_req_size && !d3n_bypass_cache_write) {
-        lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl.length() << dendl;
-        rgwrados->d3n_data_cache->put(bl, bl.length(), oid);
-      } else {
-        lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write << dendl;
-      }
-    }
-    completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
-  }
-  return 0;
-}
-
-// Trampoline for iterate_obj(): recovers the get_obj_data context from the
-// void* arg and forwards to RGWRados::get_obj_iterate_cb().
-static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
-                               const rgw_raw_obj& read_obj, off_t obj_ofs,
-                               off_t read_ofs, off_t len, bool is_head_obj,
-                               RGWObjState *astate, void *arg)
-{
-  struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
-  return d->rgwrados->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len,
-                                      is_head_obj, astate, arg);
-}
-
-// Per-extent callback for streaming GETs: for head reads, add the atomic
-// test and serve any bytes already present in the prefetched state data;
-// then submit an async rados read for the rest through the throttled Aio,
-// using the logical object offset as the sort id, and flush whatever
-// completions are ready back to the client.
-int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
-                                 const rgw_raw_obj& read_obj, off_t obj_ofs,
-                                 off_t read_ofs, off_t len, bool is_head_obj,
-                                 RGWObjState *astate, void *arg)
-{
-  ObjectReadOperation op;
-  struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
-  string oid, key;
-
-  if (is_head_obj) {
-    /* only when reading from the head object do we need to do the atomic test */
-    int r = append_atomic_test(dpp, astate, op);
-    if (r < 0)
-      return r;
-
-    // hand prefetched head bytes straight to the client callback
-    if (astate &&
-        obj_ofs < astate->data.length()) {
-      unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
-
-      r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
-      if (r < 0)
-        return r;
-
-      len -= chunk_len;
-      d->offset += chunk_len;
-      read_ofs += chunk_len;
-      obj_ofs += chunk_len;
-      if (!len)
-         return 0;
-    }
-  }
-
-  auto obj = d->rgwrados->svc.rados->obj(read_obj);
-  int r = obj.open(dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl;
-    return r;
-  }
-
-  ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
-  op.read(read_ofs, len, nullptr, nullptr);
-
-  const uint64_t cost = len;
-  const uint64_t id = obj_ofs; // use logical object offset for sorting replies
-
-  auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
-
-  return d->flush(std::move(completed));
-}
-
-// Stream the byte range [ofs, end] to `cb`: set up a throttled AIO window
-// (rgw_get_obj_window_size) and walk the object in rgw_get_obj_max_req_size
-// chunks via iterate_obj(), draining remaining completions at the end.
-// On error, pending completions are cancelled without reaching the client.
-int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb,
-                                    optional_yield y)
-{
-  RGWRados *store = source->get_store();
-  CephContext *cct = store->ctx();
-  const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
-  const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
-
-  auto aio = rgw::make_throttle(window_size, y);
-  get_obj_data data(store, cb, &*aio, ofs, y);
-
-  int r = store->iterate_obj(dpp, source->get_ctx(), source->get_bucket_info(),
-                            source->get_target(),
-                             ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
-    data.cancel(); // drain completions without writing back to client
-    return r;
-  }
-
-  return data.drain();
-}
-
-// Walk the byte range [ofs, end] of an object and invoke `cb` once per
-// readable extent of at most max_chunk_size bytes. With a manifest, the
-// range is mapped stripe by stripe to the backing raw objects (flagging
-// head reads); without one, the whole range is read from the head object.
-// Stops and returns the first negative value from `cb`.
-int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
-                          RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
-                          off_t ofs, off_t end, uint64_t max_chunk_size,
-                          iterate_obj_cb cb, void *arg, optional_yield y)
-{
-  rgw_raw_obj head_obj;
-  rgw_raw_obj read_obj;
-  uint64_t read_ofs = ofs;
-  uint64_t len;
-  bool reading_from_head = true;
-  RGWObjState *astate = NULL;
-  RGWObjManifest *manifest = nullptr;
-
-  obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &head_obj);
-
-  int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, &manifest, false, y);
-  if (r < 0) {
-    return r;
-  }
-
-  if (end < 0)
-    len = 0;
-  else
-    len = end - ofs + 1;
-
-  if (manifest) {
-    /* now get the relevant object stripe */
-    RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
-
-    RGWObjManifest::obj_iterator obj_end = manifest->obj_end(dpp);
-
-    for (; iter != obj_end && ofs <= end; ++iter) {
-      off_t stripe_ofs = iter.get_stripe_ofs();
-      off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
-
-      // emit one or more chunks covering the part of this stripe in range
-      while (ofs < next_stripe_ofs && ofs <= end) {
-        read_obj = iter.get_location().get_raw_obj(driver);
-        uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
-        read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
-
-        if (read_len > max_chunk_size) {
-          read_len = max_chunk_size;
-        }
-
-        reading_from_head = (read_obj == head_obj);
-        r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
-       if (r < 0) {
-         return r;
-        }
-
-       len -= read_len;
-        ofs += read_len;
-      }
-    }
-  } else {
-    // no manifest: everything lives in the head object
-    while (ofs <= end) {
-      read_obj = head_obj;
-      uint64_t read_len = std::min(len, max_chunk_size);
-
-      r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
-      if (r < 0) {
-       return r;
-      }
-
-      len -= read_len;
-      ofs += read_len;
-    }
-  }
-
-  return 0;
-}
-
-// Execute a write operation synchronously against the object's head,
-// resolving the rados ref (pool ioctx + oid) first.
-int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
-{
-  rgw_rados_ref ref;
-  int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield);
-}
-
-// Execute a read operation synchronously against the object's head.
-// The output bufferlist is local and discarded; callers wanting data
-// must attach their own out-params to the individual ops in `op`.
-int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
-{
-  rgw_rados_ref ref;
-  int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  bufferlist outbl;
-
-  return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
-}
-
-// Begin a modification of an OLH (object logical head): create the olh
-// object if absent, convert a plain object into an olh (fresh obj/olh id
-// tags + empty RGW_ATTR_OLH_VER) when it doesn't carry an olh tag yet,
-// guard racing writers via the existing olh tag otherwise, and register a
-// pending-operation xattr whose time-prefixed name is returned in *op_tag.
-// The cached `state` is updated to mirror the applied xattrs.
-int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
-{
-  ObjectWriteOperation op;
-
-  // the olh is always the instance-less object
-  ceph_assert(olh_obj.key.instance.empty());
-
-  bool has_tag = (state.exists && has_olh_tag(state.attrset));
-
-  if (!state.exists) {
-    op.create(true);
-  } else {
-    op.assert_exists();
-    struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
-    op.mtime2(&mtime_ts);
-  }
-
-  /*
-   * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
-   * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
-   * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
-   * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
-   * log will reflect that.
-   *
-   * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
-   * is used for object data instance, olh_tag for olh instance.
-   */
-  if (has_tag) {
-    /* guard against racing writes */
-    bucket_index_guard_olh_op(dpp, state, op);
-  }
-
-  if (!has_tag) {
-    /* obj tag */
-    string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
-
-    bufferlist bl;
-    bl.append(obj_tag.c_str(), obj_tag.size());
-    op.setxattr(RGW_ATTR_ID_TAG, bl);
-
-    state.attrset[RGW_ATTR_ID_TAG] = bl;
-    state.obj_tag = bl;
-
-    /* olh tag */
-    string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
-
-    bufferlist olh_bl;
-    olh_bl.append(olh_tag.c_str(), olh_tag.size());
-    op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
-
-    state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
-    state.olh_tag = olh_bl;
-    state.is_olh = true;
-
-    bufferlist verbl;
-    op.setxattr(RGW_ATTR_OLH_VER, verbl);
-  }
-
-  bufferlist bl;
-  RGWOLHPendingInfo pending_info;
-  pending_info.time = real_clock::now();
-  encode(pending_info, bl);
-
-#define OLH_PENDING_TAG_LEN 32
-  /* tag will start with current time epoch, this so that entries are sorted by time */
-  char buf[32];
-  utime_t ut(pending_info.time);
-  snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
-  *op_tag = buf;
-
-  // pad with random chars up to OLH_PENDING_TAG_LEN
-  string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
-
-  op_tag->append(s);
-
-  string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
-  attr_name.append(*op_tag);
-
-  op.setxattr(attr_name.c_str(), bl);
-
-  int ret = obj_operate(dpp, bucket_info, olh_obj, &op);
-  if (ret < 0) {
-    return ret;
-  }
-
-  state.exists = true;
-  state.attrset[attr_name] = bl;
-
-  return 0;
-}
-
-int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
-{
-  int ret;
-
-  ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag);
-  if (ret == -EEXIST) {
-    ret = -ECANCELED;
-  }
-
-  return ret;
-}
-
/*
 * Run a bucket-index operation (the `call` callback) against the index
 * shard holding obj_instance, transparently retrying around bucket
 * resharding.  Each attempt re-resolves the shard via bs->init(),
 * since a completed reshard may have changed the shard layout.  When
 * the callback reports -ERR_BUSY_RESHARDING we wait in
 * block_while_resharding(); once the reshard is seen to complete the
 * retry counter is reset, giving the operation a fresh budget of
 * NUM_RESHARD_RETRIES attempts.  Returns 0 on success, or a negative
 * error from shard init, the wait, or the callback itself.
 */
int RGWRados::guard_reshard(const DoutPrefixProvider *dpp,
                            BucketShard *bs,
			   const rgw_obj& obj_instance,
			   RGWBucketInfo& bucket_info,
			   std::function<int(BucketShard *)> call)
{
  rgw_obj obj;
  const rgw_obj *pobj = &obj_instance;
  int r;

  for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
    // (re)resolve the index shard each attempt; the mapping may have
    // changed if a reshard completed since the previous try
    r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp);
    if (r < 0) {
      ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl;
      return r;
    }

    r = call(bs);
    if (r != -ERR_BUSY_RESHARDING) {
      break;
    }

    ldpp_dout(dpp, 10) <<
      "NOTICE: resharding operation on bucket index detected, blocking. obj=" <<
      obj_instance.key << dendl;

    r = block_while_resharding(bs, obj_instance, bucket_info, null_yield, dpp);
    if (r == -ERR_BUSY_RESHARDING) {
      // still resharding; burn one retry and loop again
      ldpp_dout(dpp, 10) << __func__ <<
	" NOTICE: block_while_resharding() still busy. obj=" <<
        obj_instance.key << dendl;
      continue;
    } else if (r < 0) {
      ldpp_dout(dpp, 0) << __func__ <<
	" ERROR: block_while_resharding() failed. obj=" <<
        obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
      return r;
    }

    ldpp_dout(dpp, 20) << "reshard completion identified" << dendl;
    i = 0; /* resharding is finished, make sure we can retry */
  } // for loop

  // r holds the callback's final result when the loop exits via break,
  // or -ERR_BUSY_RESHARDING if all retries were exhausted
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" << 
      obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
    return r;
  }

  return 0;
}
-
-
/*
 * Wait for an in-progress reshard of bs's bucket to finish, refreshing
 * bucket_info (and bs) when it does.  Polls the bucket-instance
 * resharding state up to num_retries times; between polls it attempts
 * to recover from a stale "resharding" flag (e.g. left by a crash) by
 * taking the reshard lock and clearing the flag, then sleeps via
 * reshard_wait.  Returns 0 once resharding is no longer in progress,
 * -ERR_BUSY_RESHARDING if retries are exhausted, or another negative
 * error on failure.
 */
int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
                                     const rgw_obj& obj_instance,
                                     RGWBucketInfo& bucket_info,
                                     optional_yield y,
                                     const DoutPrefixProvider *dpp)
{
  int ret = 0;
  cls_rgw_bucket_instance_entry entry;

  // gets loaded by fetch_new_bucket_info; can be used by
  // clear_resharding
  std::map<std::string, bufferlist> bucket_attrs;

  // since we want to run this recovery code from two distinct places,
  // let's just put it in a lambda so we can easily re-use; if the
  // lambda successfully fetches a new bucket id, it sets
  // new_bucket_id and returns 0, otherwise it returns a negative
  // error code
  auto fetch_new_bucket_info =
    [this, bs, &obj_instance, &bucket_info, &bucket_attrs, &y, dpp](const std::string& log_tag) -> int {
    int ret = get_bucket_info(&svc, bs->bucket.tenant, bs->bucket.name,
			     bucket_info, nullptr, y, dpp, &bucket_attrs);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << __func__ <<
	" ERROR: failed to refresh bucket info after reshard at " <<
	log_tag << ": " << cpp_strerror(-ret) << dendl;
      return ret;
    }

    // re-initialize the shard against the refreshed bucket info, so the
    // caller operates on the new generation's shard layout
    ret = bs->init(dpp, bucket_info, obj_instance);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << __func__ <<
	" ERROR: failed to refresh bucket shard generation after reshard at " <<
	log_tag << ": " << cpp_strerror(-ret) << dendl;
      return ret;
    }

    const auto gen = bucket_info.layout.logs.empty() ? -1 : bucket_info.layout.logs.back().gen;
    ldpp_dout(dpp, 20) << __func__ <<
      " INFO: refreshed bucket info after reshard at " <<
      log_tag << ". new shard_id=" << bs->shard_id << ". gen=" << gen << dendl;

    return 0;
  }; // lambda fetch_new_bucket_info

  constexpr int num_retries = 10;
  for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
    auto& ref = bs->bucket_obj.get_ref();
    ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
    if (ret == -ENOENT) {
      // shard object is gone -- presumably replaced by the reshard;
      // refresh to pick up the new layout
      ret = fetch_new_bucket_info("get_bucket_resharding_failed");
      if (ret < 0) {
	ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
	  " failed to refresh bucket info after reshard when get bucket "
	  "resharding failed, error: " << cpp_strerror(-ret) << dendl;
	return ret;
      }
    } else if (ret < 0) {
      ldpp_dout(dpp, 0) << __func__ <<
	" ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
	dendl;
      return ret;
    }

    if (!entry.resharding_in_progress()) {
      ret = fetch_new_bucket_info("get_bucket_resharding_succeeded");
      if (ret < 0) {
	ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
	  " failed to refresh bucket info after reshard when get bucket "
	  "resharding succeeded, error: " << cpp_strerror(-ret) << dendl;
	return ret;
      }
    }
    // NOTE(review): when resharding is no longer in progress the loop
    // does not return here; it falls through to the recovery/wait code
    // below and exits via the final -ERR_BUSY_RESHARDING only after
    // exhausting retries -- confirm this fall-through is intended

    ldpp_dout(dpp, 20) << __func__ << " NOTICE: reshard still in progress; " <<
      (i < num_retries ? "retrying" : "too many retries") << dendl;

    if (i == num_retries) {
      break;
    }

    // If bucket is erroneously marked as resharding (e.g., crash or
    // other error) then fix it. If we can take the bucket reshard
    // lock then it means no other resharding should be taking place,
    // and we're free to clear the flags.
    {
      // since we expect to do this rarely, we'll do our work in a
      // block and erase our work after each try

      RGWObjectCtx obj_ctx(this->driver);
      const rgw_bucket& b = bs->bucket;
      std::string bucket_id = b.get_key();
      RGWBucketReshardLock reshard_lock(this->driver, bucket_info, true);
      ret = reshard_lock.lock(dpp);
      if (ret == -ENOENT) {
	continue;
      } else if (ret < 0) {
	ldpp_dout(dpp, 20) << __func__ <<
	  " ERROR: failed to take reshard lock for bucket " <<
	  bucket_id << "; expected if resharding underway" << dendl;
      } else {
	ldpp_dout(dpp, 10) << __func__ <<
	  " INFO: was able to take reshard lock for bucket " <<
	  bucket_id << dendl;
        // the reshard may have finished, so call clear_resharding()
        // with its current bucket info; ALSO this will load
        // bucket_attrs for call to clear_resharding below
        ret = fetch_new_bucket_info("trying_to_clear_resharding");
        if (ret < 0) {
	  reshard_lock.unlock();
	  ldpp_dout(dpp, 0) << __func__ <<
	    " ERROR: failed to update bucket info before clear resharding for bucket " <<
	    bucket_id << dendl;
          continue; // try again
        }

	ret = RGWBucketReshard::clear_resharding(this->driver, bucket_info, bucket_attrs, dpp);
	reshard_lock.unlock();
	if (ret == -ENOENT) {
	  ldpp_dout(dpp, 5) << __func__ <<
	    " INFO: no need to reset reshard flags; old shards apparently"
	    " removed after successful resharding of bucket " <<
	    bucket_id << dendl;
	  continue; // immediately test again
	} else if (ret < 0) {
	  ldpp_dout(dpp, 0) << __func__ <<
	    " ERROR: failed to clear resharding flags for bucket " <<
	    bucket_id << ", " << cpp_strerror(-ret) << dendl;
	  // wait and then test again
	} else {
	  ldpp_dout(dpp, 5) << __func__ <<
	    " INFO: apparently successfully cleared resharding flags for "
	    "bucket " << bucket_id << dendl;
	  continue; // if we apparently succeed immediately test again
	} // if clear resharding succeeded
      } // if taking of lock succeeded
    } // block to encapsulate recovery from incomplete reshard

    // back off before the next poll
    ret = reshard_wait->wait(y);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << __func__ <<
	" ERROR: bucket is still resharding, please retry" << dendl;
      return ret;
    }
  } // for loop

  // retries exhausted
  ldpp_dout(dpp, 0) << __func__ <<
    " ERROR: bucket is still resharding, please retry" << dendl;
  return -ERR_BUSY_RESHARDING;
}
-
/*
 * Link an object instance (or a delete marker) to its olh entry in the
 * bucket index via cls_rgw_bucket_link_olh, retrying around reshard
 * through guard_reshard(), then record the change in the data log.
 * olh_state.olh_tag is passed so the class method can verify the olh
 * identity; op_tag names the pending operation set up by
 * olh_init_modification().  Returns 0 on success or a negative error.
 */
int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
                                    RGWObjState& olh_state, const rgw_obj& obj_instance,
                                    bool delete_marker, const string& op_tag,
                                    struct rgw_bucket_dir_entry_meta *meta,
                                    uint64_t olh_epoch,
                                    real_time unmod_since, bool high_precision_time,
                                    rgw_zone_set *_zones_trace, bool log_data_change)
{
  rgw_rados_ref ref;
  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
  if (r < 0) {
    return r;
  }

  // add the local zone to the trace set -- presumably so multisite sync
  // does not replay this change back to us; confirm against sync logic
  rgw_zone_set zones_trace;
  if (_zones_trace) {
    zones_trace = *_zones_trace;
  }
  zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());

  BucketShard bs(this);

  // bs is initialized inside guard_reshard() on each attempt
  r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
		    [&](BucketShard *bs) -> int {
		      cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
		      auto& ref = bs->bucket_obj.get_ref();
		      librados::ObjectWriteOperation op;
		      op.assert_exists(); // bucket index shard must exist
		      cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
		      cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
                                              delete_marker, op_tag, meta, olh_epoch,
					      unmod_since, high_precision_time,
					      svc.zone->get_zone().log_data, zones_trace);
                      return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
                    });
  if (r < 0) {
    ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
    return r;
  }

  // log the shard change so peers can pick it up
  add_datalog_entry(dpp, svc.datalog_rados, bucket_info, bs.shard_id);

  return 0;
}
-
-void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op)
-{
-  ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
-  op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
-}
-
-int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
-                                           RGWBucketInfo& bucket_info,
-                                           const rgw_obj& obj_instance,
-                                           const string& op_tag, const string& olh_tag,
-                                           uint64_t olh_epoch, rgw_zone_set *_zones_trace)
-{
-  rgw_rados_ref ref;
-  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  rgw_zone_set zones_trace;
-  if (_zones_trace) {
-    zones_trace = *_zones_trace;
-  }
-  zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
-
-  BucketShard bs(this);
-
-  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
-  r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
-                   [&](BucketShard *bs) -> int {
-                     auto& ref = bs->bucket_obj.get_ref();
-                     librados::ObjectWriteOperation op;
-                     op.assert_exists(); // bucket index shard must exist
-                     cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
-                     cls_rgw_bucket_unlink_instance(op, key, op_tag,
-                                                    olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
-                      return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
-                    });
-  if (r < 0) {
-    ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
-    return r;
-  }
-
-  return 0;
-}
-
/*
 * Read a page of the olh log for obj_instance from its bucket index
 * shard, starting after ver_marker.  On success fills *log (keyed by
 * log entry version) and *is_truncated (more pages remain).  The olh
 * tag cached in `state` is passed so the class method only returns the
 * log of the matching olh instance.
 */
int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
                                        RGWBucketInfo& bucket_info, RGWObjState& state,
                                        const rgw_obj& obj_instance, uint64_t ver_marker,
                                        std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log,
                                        bool *is_truncated)
{
  // NOTE(review): `ref` itself is never used below (the op runs on the
  // index shard's ref); this lookup appears to serve only as a
  // validity/error check on the head object -- confirm
  rgw_rados_ref ref;
  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
  if (r < 0) {
    return r;
  }

  BucketShard bs(this);
  int ret =
    bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
  if (ret < 0) {
    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
    return ret;
  }

  string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());

  // olh entries are indexed under the bare name (no instance)
  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());

  auto& shard_ref = bs.bucket_obj.get_ref();
  ObjectReadOperation op;

  // op_ret receives the class method's own result, distinct from the
  // rados operation's result r
  rgw_cls_read_olh_log_ret log_ret;
  int op_ret = 0;
  cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret); 
  bufferlist outbl;
  r =  rgw_rados_operate(dpp, shard_ref.pool.ioctx(), shard_ref.obj.oid, &op, &outbl, null_yield);
  if (r < 0) {
    return r;
  }
  if (op_ret < 0) {
    ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned op_ret=" << op_ret << dendl;
    return op_ret;
  }

  *log = std::move(log_ret.log);
  *is_truncated = log_ret.is_truncated;

  return 0;
}
-
// a multisite sync bug resulted in the OLH head attributes being overwritten by
// the attributes from another zone, causing link_olh() to fail endlessly due to
// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
//
// Reads the authoritative olh entry from the bucket index and, if its
// tag differs from the head object's cached tag, rewrites the head's
// RGW_ATTR_OLH_ID_TAG and RGW_ATTR_OLH_INFO xattrs to match, guarded by
// a cmpxattr on the old tag so a concurrent repair/modification causes
// the write to fail rather than clobber newer state.
int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
                         const rgw_obj& obj)
{
  // fetch the current olh entry from the bucket index
  rgw_bucket_olh_entry olh;
  int r = bi_get_olh(dpp, bucket_info, obj, &olh);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
    return r;
  }
  if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
    return 0;
  }

  ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag
      << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;

  // rewrite OLH_ID_TAG and OLH_INFO from current olh
  ObjectWriteOperation op;
  // assert this is the same olh tag we think we're fixing
  bucket_index_guard_olh_op(dpp, *state, op);
  // preserve existing mtime
  struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
  op.mtime2(&mtime_ts);
  {
    bufferlist bl;
    bl.append(olh.tag.c_str(), olh.tag.size());
    op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
  }
  {
    // rebuild OLH_INFO (current target + delete-marker flag) from the
    // index entry
    RGWOLHInfo info;
    info.target = rgw_obj(bucket_info.bucket, olh.key);
    info.removed = olh.delete_marker;
    bufferlist bl;
    encode(info, bl);
    op.setxattr(RGW_ATTR_OLH_INFO, bl);
  }
  rgw_rados_ref ref;
  r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
  if (r < 0) {
    return r;
  }
  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with "
        << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}
-
-int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp,
-                                        RGWBucketInfo& bucket_info,
-                                        RGWObjState& state,
-                                        const rgw_obj& obj_instance, uint64_t ver)
-{
-  rgw_rados_ref ref;
-  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  BucketShard bs(this);
-  int ret =
-    bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
-
-  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
-
-  ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
-                     [&](BucketShard *pbs) -> int {
-                       ObjectWriteOperation op;
-                       op.assert_exists(); // bucket index shard must exist
-                       cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
-                       cls_rgw_trim_olh_log(op, key, ver, olh_tag);
-                        return pbs->bucket_obj.operate(dpp, &op, null_yield);
-                      });
-  if (ret < 0) {
-    ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp,
-                                     RGWBucketInfo& bucket_info,
-                                     RGWObjState& state,
-                                     const rgw_obj& obj_instance)
-{
-  rgw_rados_ref ref;
-  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  BucketShard bs(this);
-
-  string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
-
-  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
-
-  int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
-                         [&](BucketShard *pbs) -> int {
-                           ObjectWriteOperation op;
-                           op.assert_exists(); // bucket index shard must exist
-                           auto& ref = pbs->bucket_obj.get_ref();
-                           cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
-                           cls_rgw_clear_olh(op, key, olh_tag);
-                            return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
-                          });
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-static int decode_olh_info(const DoutPrefixProvider *dpp, CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
-{
-  try {
-    auto biter = bl.cbegin();
-    decode(*olh, biter);
-    return 0;
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to decode olh info" << dendl;
-    return -EIO;
-  }
-}
-
/*
 * Apply a batch of olh log entries (as read by
 * bucket_index_read_olh_log) to the olh head object: replay
 * link/unlink/remove-instance records in version order to compute the
 * final target, write the updated olh xattrs (guarded by olh tag and a
 * monotonic version check), delete removed instances, trim the applied
 * log entries, and -- if the final state is "unlinked" -- remove the
 * head object and clear the bucket-index olh entry.  *plast_ver
 * receives the highest version applied, so the caller can resume.
 */
int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp,
			   RGWObjState& state,
			   RGWBucketInfo& bucket_info,
			   const rgw::sal::Object* obj,
			   bufferlist& olh_tag,
			   std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
			   uint64_t *plast_ver,
			   rgw_zone_set* zones_trace)
{
  if (log.empty()) {
    return 0;
  }

  librados::ObjectWriteOperation op;

  // log is keyed by version, so rbegin() is the newest entry
  uint64_t last_ver = log.rbegin()->first;
  *plast_ver = last_ver;

  map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();

  // guard: olh identity unchanged, and head version not already past us
  op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
  op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);

  bufferlist ver_bl;
  string last_ver_s = to_string(last_ver);
  ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
  op.setxattr(RGW_ATTR_OLH_VER, ver_bl);

  // preserve the head object's mtime across the xattr rewrite
  struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
  op.mtime2(&mtime_ts);

  bool need_to_link = false;
  uint64_t link_epoch = 0;
  cls_rgw_obj_key key;
  bool delete_marker = false;
  list<cls_rgw_obj_key> remove_instances;
  bool need_to_remove = false;

  // decode current epoch and instance
  auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
  if (olh_ver != state.attrset.end()) {
    std::string str = olh_ver->second.to_str();
    std::string err;
    link_epoch = strict_strtoll(str.c_str(), 10, &err);
  }
  auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
  if (olh_info != state.attrset.end()) {
    RGWOLHInfo info;
    int r = decode_olh_info(dpp, cct, olh_info->second, &info);
    if (r < 0) {
      return r;
    }
    info.target.key.get_index_key(&key);
    delete_marker = info.removed;
  }

  // replay log entries in ascending version order; the last applicable
  // link/unlink wins, remove-instance entries accumulate
  for (iter = log.begin(); iter != log.end(); ++iter) {
    vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
    for (; viter != iter->second.end(); ++viter) {
      rgw_bucket_olh_log_entry& entry = *viter;

      ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
		     << " key=" << entry.key.name << "[" << entry.key.instance << "] "
		     << (entry.delete_marker ? "(delete)" : "") << dendl;
      switch (entry.op) {
      case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
        remove_instances.push_back(entry.key);
        break;
      case CLS_RGW_OLH_OP_LINK_OLH:
        // only overwrite a link of the same epoch if its key sorts before
        if (link_epoch < iter->first || key.instance.empty() ||
            key.instance > entry.key.instance) {
          ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
              << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
          need_to_link = true;
          need_to_remove = false;
          key = entry.key;
          delete_marker = entry.delete_marker;
        } else {
          ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
              << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
        }
        break;
      case CLS_RGW_OLH_OP_UNLINK_OLH:
        need_to_remove = true;
        need_to_link = false;
        break;
      default:
        ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
        return -EIO;
      }
      // each applied entry clears its corresponding pending xattr
      string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
      attr_name.append(entry.op_tag);
      op.rmxattr(attr_name.c_str());
    }
  }

  rgw_rados_ref ref;
  int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
  if (r < 0) {
    return r;
  }

  rgw::sal::Bucket* bucket = obj->get_bucket();

  if (need_to_link) {
    rgw_obj target(bucket->get_key(), key);
    RGWOLHInfo info;
    info.target = target;
    info.removed = delete_marker;
    bufferlist bl;
    encode(info, bl);
    op.setxattr(RGW_ATTR_OLH_INFO, bl);
  }

  /* first remove object instances */
  for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
       liter != remove_instances.end(); ++liter) {
    cls_rgw_obj_key& key = *liter;
    std::unique_ptr<rgw::sal::Object> obj_instance = bucket->get_object(key);
    // -ENOENT is fine: the instance may already be gone
    int ret = delete_obj(dpp, bucket_info, obj_instance.get(), 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
    if (ret < 0 && ret != -ENOENT) {
      ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
      return ret;
    }
  }

  /* update olh object */
  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
    return r;
  }

  // applied entries can now be trimmed from the index-side log
  r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj->get_obj(), last_ver);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
    return r;
  }

  if (need_to_remove) {
    // the olh ended up unlinked: remove the head object, but only if
    // nothing changed meanwhile (tag, version, no new pending entries)
    ObjectWriteOperation rm_op;

    rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
    rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
    cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
    rm_op.remove();

    r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
    if (r == -ECANCELED) {
      return 0; /* someone else won this race */
    } else {
      // NOTE(review): this else branch also runs when the remove failed
      // with an error other than -ECANCELED, yet the comment below says
      // "only clear if was successful" -- confirm whether other errors
      // should instead be returned before clearing the index entry
      /* 
       * only clear if was successful, otherwise we might clobber pending operations on this object
       */
      r = bucket_index_clear_olh(dpp, bucket_info, state, obj->get_obj());
      if (r < 0) {
        ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
        return r;
      }
    }
  }

  return 0;
}
-
-/*
- * read olh log and apply it
- */
-int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, rgw_zone_set *zones_trace)
-{
-  map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
-  bool is_truncated;
-  uint64_t ver_marker = 0;
-
-  do {
-    int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj->get_obj(), ver_marker, &log, &is_truncated);
-    if (ret < 0) {
-      return ret;
-    }
-    ret = apply_olh_log(dpp, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
-    if (ret < 0) {
-      return ret;
-    }
-  } while (is_truncated);
-
-  return 0;
-}
-
/*
 * Point the olh (object logical head) of target_obj's name at this
 * specific instance (or install a delete marker).  Retries the whole
 * init-modification / bucket-index-link sequence up to
 * MAX_ECANCELED_RETRY times on -ECANCELED (lost race with a concurrent
 * writer), attempting repair_olh() when link_olh is rejected for an
 * olh tag mismatch, then applies the resulting olh log.
 */
int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
		      RGWBucketInfo& bucket_info,
		      rgw::sal::Object* target_obj, bool delete_marker,
		      rgw_bucket_dir_entry_meta *meta,
                      uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
                      optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
{
  string op_tag;

  // the olh object is the same name with no instance
  std::unique_ptr<rgw::sal::Object> olh_obj = target_obj->clone();
  olh_obj->clear_instance();

  RGWObjState *state = NULL;
  RGWObjManifest *manifest = nullptr;

  int ret = 0;
  int i;

#define MAX_ECANCELED_RETRY 100
  for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
    if (ret == -ECANCELED) {
      // drop cached state so the retry re-reads the current olh
      olh_obj->invalidate();
    }

    ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj.get(), &state, &manifest, false, y); /* don't follow olh */
    if (ret < 0) {
      return ret;
    }

    ret = olh_init_modification(dpp, bucket_info, *state, olh_obj->get_obj(), &op_tag);
    if (ret < 0) {
      ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
      if (ret == -ECANCELED) {
        continue;
      }
      return ret;
    }
    ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj->get_obj(),
				delete_marker, op_tag, meta, olh_epoch, unmod_since,
				high_precision_time, zones_trace, log_data_change);
    if (ret < 0) {
      ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
      if (ret == -ECANCELED) {
        // the bucket index rejected the link_olh() due to olh tag mismatch;
        // attempt to reconstruct olh head attributes based on the bucket index
        int r2 = repair_olh(dpp, state, bucket_info, olh_obj->get_obj());
        if (r2 < 0 && r2 != -ECANCELED) {
          return r2;
        }
        continue;
      }
      return ret;
    }
    break;
  }

  if (i == MAX_ECANCELED_RETRY) {
    ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
    return -EIO;
  }

  // NOTE(review): zones_trace is not forwarded here, unlike the
  // otherwise-parallel call in unlink_obj_instance() -- confirm whether
  // this omission is intentional
  ret = update_olh(dpp, state, bucket_info, olh_obj.get());
  if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
    ret = 0;
  }
  if (ret < 0) {
    ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
    return ret;
  }

  return 0;
}
-
/*
 * Unlink target_obj's specific instance from its olh in the bucket
 * index, then apply the resulting olh log.  Mirrors set_olh()'s retry
 * structure: the init-modification / index-unlink sequence is retried
 * up to MAX_ECANCELED_RETRY times on -ECANCELED (lost race with a
 * concurrent writer).
 */
int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj,
                                  uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
{
  string op_tag;

  // the olh object is the same name with no instance
  std::unique_ptr<rgw::sal::Object> olh_obj = target_obj->clone();
  olh_obj->clear_instance();

  RGWObjState *state = NULL;

  int ret = 0;
  int i;

  for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
    if (ret == -ECANCELED) {
      // drop cached state so the retry re-reads the current olh
      olh_obj->invalidate();
    }

    ret = olh_obj->get_obj_state(dpp, &state, y, false); /* don't follow olh */
    if (ret < 0)
      return ret;

    ret = olh_init_modification(dpp, bucket_info, *state, olh_obj->get_obj(), &op_tag);
    if (ret < 0) {
      ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
      if (ret == -ECANCELED) {
        continue;
      }
      return ret;
    }

    string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());

    ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj->get_obj(), op_tag, olh_tag, olh_epoch, zones_trace);
    if (ret < 0) {
      ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
      if (ret == -ECANCELED) {
        continue;
      }
      return ret;
    }
    break;
  }

  if (i == MAX_ECANCELED_RETRY) {
    ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
    return -EIO;
  }

  ret = update_olh(dpp, state, bucket_info, olh_obj.get(), zones_trace);
  if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
    return 0;
  }
  if (ret < 0) {
    ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
    return ret;
  }

  return 0;
}
-
-void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
-{
-#define OBJ_INSTANCE_LEN 32
-  char buf[OBJ_INSTANCE_LEN + 1];
-
-  gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
-                                                                      no underscore for instance name due to the way we encode the raw keys */
-
-  target_key->set_instance(buf);
-}
-
// Convenience overload: generate a random instance id directly on an
// rgw_obj by delegating to the rgw_obj_key variant.
void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
{
  gen_rand_obj_instance_name(&target_obj->key);
}
-
-int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
-{
-  map<string, bufferlist> attrset;
-
-  ObjectReadOperation op;
-  op.getxattrs(&attrset, NULL);
-
-  int r = obj_operate(dpp, bucket_info, obj, &op);
-  if (r < 0) {
-    return r;
-  }
-
-  auto iter = attrset.find(RGW_ATTR_OLH_INFO);
-  if (iter == attrset.end()) { /* not an olh */
-    return -EINVAL;
-  }
-
-  return decode_olh_info(dpp, cct, iter->second, olh);
-}
-
// Partition the OLH pending-operation attrs into "old enough to retire"
// and "still in flight". Entries whose timestamp is at least
// rgw_olh_pending_timeout_sec in the past are moved into
// *rm_pending_entries (and erased from pending_entries); newer entries
// stop the scan. Undecodable entries are skipped but kept in place.
void RGWRados::check_pending_olh_entries(const DoutPrefixProvider *dpp,
                                        map<string, bufferlist>& pending_entries,
                                         map<string, bufferlist> *rm_pending_entries)
{
  map<string, bufferlist>::iterator iter = pending_entries.begin();

  real_time now = real_clock::now();

  while (iter != pending_entries.end()) {
    auto biter = iter->second.cbegin();
    RGWOLHPendingInfo pending_info;
    try {
      decode(pending_info, biter);
    } catch (buffer::error& err) {
      /* skipping bad entry, we could remove it but it might hide a bug */
      ldpp_dout(dpp, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
      ++iter;
      continue;
    }

    // advance the loop iterator before a possible erase so it stays valid
    map<string, bufferlist>::iterator cur_iter = iter;
    ++iter;
    if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
      (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
      pending_entries.erase(cur_iter);
    } else {
      /* entries names are sorted by time (rounded to a second) */
      // so the first not-yet-expired entry means all remaining ones are newer
      break;
    }
  }
}
-
// Remove retired OLH pending-op xattrs from the OLH head object.
// Removals are batched (max_entries per osd op) and every op is guarded
// with the OLH tag, so a racing OLH rewrite cancels us (-ECANCELED)
// rather than corrupting state; ENOENT/ECANCELED are treated as success.
int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
{
  rgw_rados_ref ref;
  int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
  if (r < 0) {
    return r;
  }

  // trim no more than 1000 entries per osd op
  constexpr int max_entries = 1000;

  auto i = pending_attrs.begin();
  while (i != pending_attrs.end()) {
    ObjectWriteOperation op;
    // guard with the OLH tag so we lose cleanly to concurrent updates
    bucket_index_guard_olh_op(dpp, state, op);

    for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
      op.rmxattr(i->first.c_str());
    }

    r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
    if (r == -ENOENT || r == -ECANCELED) {
      /* raced with some other change, shouldn't sweat about it */
      return 0;
    }
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
      return r;
    }
  }
  return 0;
}
-
// Resolve an OLH object to its current target object.
// Retires aged-out pending entries, applies any remaining pending log
// entries via update_olh(), then decodes RGW_ATTR_OLH_INFO to find the
// target. Returns -ENOENT if the OLH is marked removed (or was removed
// underneath a racing update), -EINVAL if the OLH info attr is missing.
int RGWRados::follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target)
{
  // pending-op entries live as prefixed xattrs on the OLH head
  map<string, bufferlist> pending_entries;
  rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);

  map<string, bufferlist> rm_pending_entries;
  check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries);

  if (!rm_pending_entries.empty()) {
    int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj->get_obj(), rm_pending_entries);
    if (ret < 0) {
      ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
      return ret;
    }
  }
  if (!pending_entries.empty()) {
    ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj->get_bucket() << dendl;

    int ret = update_olh(dpp, state, bucket_info, olh_obj);
    if (ret < 0) {
      if (ret == -ECANCELED) {
        // In this context, ECANCELED means that the OLH tag changed in either the bucket index entry or the OLH object.
        // If the OLH tag changed, it indicates that a previous OLH entry was removed since this request started. We
        // return ENOENT to indicate that the OLH object was removed.
        ret = -ENOENT;
      }
      return ret;
    }
  }

  auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
  if (iter == state->attrset.end()) {
    // OLH object without OLH info attr: malformed
    return -EINVAL;
  }

  RGWOLHInfo olh;
  int ret = decode_olh_info(dpp, cct, iter->second, &olh);
  if (ret < 0) {
    return ret;
  }

  if (olh.removed) {
    return -ENOENT;
  }

  *target = olh.target;

  return 0;
}
-
// Stat a raw rados object in one round trip, filling only the outputs
// the caller asked for (size/mtime, xattrs filtered to the RGW_ATTR_
// prefix, first chunk of data, objv check). Note: *epoch is populated
// from the ioctx even when the operate call failed, before the error
// check below.
int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
                           rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
                           map<string, bufferlist> *attrs, bufferlist *first_chunk,
                           RGWObjVersionTracker *objv_tracker, optional_yield y)
{
  rgw_rados_ref ref;
  int r = get_raw_obj_ref(dpp, obj, &ref);
  if (r < 0) {
    return r;
  }

  map<string, bufferlist> unfiltered_attrset;
  uint64_t size = 0;
  struct timespec mtime_ts;

  // assemble a single compound read op from the requested outputs
  ObjectReadOperation op;
  if (objv_tracker) {
    objv_tracker->prepare_op_for_read(&op);
  }
  if (attrs) {
    op.getxattrs(&unfiltered_attrset, NULL);
  }
  if (psize || pmtime) {
    op.stat2(&size, &mtime_ts, NULL);
  }
  if (first_chunk) {
    op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
  }
  bufferlist outbl;
  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, y);

  if (epoch) {
    *epoch = ref.pool.ioctx().get_last_version();
  }

  if (r < 0)
    return r;

  if (psize)
    *psize = size;
  if (pmtime)
    *pmtime = ceph::real_clock::from_timespec(mtime_ts);
  if (attrs) {
    // only expose rgw-owned xattrs to the caller
    rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
  }

  return 0;
}
-
// Aggregate per-category storage stats across the bucket's index shards.
// Also reports the per-shard version, master version and max markers as
// encoded shard-manager strings. With a specific shard_id, max_marker is
// that shard's marker; with shard_id < 0 it is the encoded set of all
// shard markers.
int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp,
                              RGWBucketInfo& bucket_info,
                              const rgw::bucket_index_layout_generation& idx_layout,
                              int shard_id, string *bucket_ver, string *master_ver,
                              map<RGWObjCategory, RGWStorageStats>& stats,
                              string *max_marker, bool *syncstopped)
{
  vector<rgw_bucket_dir_header> headers;
  map<int, string> bucket_instance_ids;
  int r = cls_bucket_head(dpp, bucket_info, idx_layout, shard_id, headers, &bucket_instance_ids);
  if (r < 0) {
    return r;
  }

  // headers and instance ids are parallel collections, walked in lockstep
  ceph_assert(headers.size() == bucket_instance_ids.size());

  auto iter = headers.begin();
  map<int, string>::iterator viter = bucket_instance_ids.begin();
  BucketIndexShardsManager ver_mgr;
  BucketIndexShardsManager master_ver_mgr;
  BucketIndexShardsManager marker_mgr;
  char buf[64];
  for(; iter != headers.end(); ++iter, ++viter) {
    accumulate_raw_stats(*iter, stats);
    snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
    ver_mgr.add(viter->first, string(buf));
    snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
    master_ver_mgr.add(viter->first, string(buf));
    if (shard_id >= 0) {
      // single-shard query: report that shard's marker directly
      *max_marker = iter->max_marker;
    } else {
      marker_mgr.add(viter->first, iter->max_marker);
    }
    if (syncstopped != NULL)
      *syncstopped = iter->syncstopped;
  }
  ver_mgr.to_string(bucket_ver);
  master_ver_mgr.to_string(master_ver);
  if (shard_id < 0) {
    marker_mgr.to_string(max_marker);
  }
  return 0;
}
-
// Fan-in helper for async bucket-stats: collects one header callback per
// index shard, accumulates stats under a mutex, and fires the user
// callback exactly once when the last pending response arrives (or never,
// if unset_cb() was called after a dispatch failure).
class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
  RGWGetBucketStats_CB *cb;       // user callback, released after final response
  uint32_t pendings;              // responses still outstanding
  map<RGWObjCategory, RGWStorageStats> stats;
  int ret_code;                   // first error seen, if any
  bool should_cb;                 // cleared by unset_cb() to suppress delivery
  ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");

public:
  RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
    : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
  {}

  // Called once per shard with that shard's dir header (or an error).
  void handle_response(int r, rgw_bucket_dir_header& header) override {
    std::lock_guard l{lock};
    if (should_cb) {
      if ( r >= 0) {
        accumulate_raw_stats(header, stats);
      } else {
        ret_code = r;
      }

      // Are we all done?
      if (--pendings == 0) {
        if (!ret_code) {
          cb->set_response(&stats);
        }
        cb->handle_response(ret_code);
        cb->put();
      }
    }
  }

  // Suppress the user callback (used when dispatch partially failed and
  // the caller already released its reference).
  void unset_cb() {
    std::lock_guard l{lock};
    should_cb = false;
  }
};
-
// Kick off async stats collection across the bucket's index shards.
// Reference handling is delicate: on dispatch failure the user callback
// ref is dropped here and, if some aios already went out, their
// completions are muted via unset_cb(); our own ref on get_ctx is always
// dropped before returning.
int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *ctx)
{
  int num_aio = 0;
  // one pending slot per index shard (unsharded buckets count as 1)
  RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1);
  ceph_assert(get_ctx);
  int r = cls_bucket_head_async(dpp, bucket_info, idx_layout, shard_id, get_ctx, &num_aio);
  if (r < 0) {
    ctx->put();
    if (num_aio) {
      // in-flight aios may still complete; keep them from touching the
      // user callback we just released
      get_ctx->unset_cb();
    }
  }
  get_ctx->put();
  return r;
}
-
-int RGWRados::get_bucket_instance_info(const string& meta_key,
-                                      RGWBucketInfo& info,
-                                       real_time *pmtime,
-                                      map<string, bufferlist> *pattrs,
-                                      optional_yield y,
-                                       const DoutPrefixProvider *dpp)
-{
-  rgw_bucket bucket;
-  rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
-
-  return get_bucket_instance_info(bucket, info, pmtime, pattrs, y, dpp);
-}
-
-int RGWRados::get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info,
-                                       real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y,
-                                       const DoutPrefixProvider *dpp)
-{
-  return ctl.bucket->read_bucket_instance_info(bucket, &info,
-                                              y,
-                                               dpp,
-                                              RGWBucketCtl::BucketInstance::GetParams()
-                                              .set_mtime(pmtime)
-                                              .set_attrs(pattrs));
-}
-
// Read bucket (entrypoint) info for tenant/bucket_name via the bucket
// controller, optionally returning mtime and attrs.
// NOTE(review): the svc parameter is unused in this implementation --
// presumably retained for interface compatibility; confirm with callers.
int RGWRados::get_bucket_info(RGWServices *svc,
                              const string& tenant, const string& bucket_name,
                              RGWBucketInfo& info,
                              real_time *pmtime,
                              optional_yield y,
                              const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs)
{
  rgw_bucket bucket;
  bucket.tenant = tenant;
  bucket.name = bucket_name;
  return ctl.bucket->read_bucket_info(bucket, &info, y, dpp,
                                     RGWBucketCtl::BucketInstance::GetParams()
                                     .set_mtime(pmtime)
                                     .set_attrs(pattrs));
}
-
-int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
-                                      ceph::real_time *pmtime,
-                                      const DoutPrefixProvider *dpp,
-                                      map<string, bufferlist> *pattrs)
-{
-  rgw_bucket bucket = info.bucket;
-  bucket.bucket_id.clear();
-
-  auto rv = info.objv_tracker.read_version;
-
-  return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp,
-                                     RGWBucketCtl::BucketInstance::GetParams()
-                                     .set_mtime(pmtime)
-                                     .set_attrs(pattrs)
-                                     .set_refresh_version(rv));
-}
-
-int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
-                              real_time mtime, map<string, bufferlist> *pattrs,
-                              const DoutPrefixProvider *dpp)
-{
-  return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield, dpp,
-                                               RGWBucketCtl::BucketInstance::PutParams()
-                                               .set_exclusive(exclusive)
-                                               .set_mtime(mtime)
-                                               .set_attrs(pattrs));
-}
-
// Store a bucket instance record and, when needed, the entrypoint that
// links to it. The entrypoint is (re)created when the info has no
// instance object yet or the caller explicitly asks for it. The
// entrypoint's object version is taken from *pep_objv if provided (and
// tagged), otherwise a fresh one is generated and reported back through
// pep_objv.
int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
                                     map<string, bufferlist> *pattrs, bool create_entry_point,
                                     const DoutPrefixProvider *dpp)
{
  bool create_head = !info.has_instance_obj || create_entry_point;

  int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp);
  if (ret < 0) {
    return ret;
  }

  if (!create_head)
    return 0; /* done! */

  // entrypoint mirrors the identifying fields of the instance record
  RGWBucketEntryPoint entry_point;
  entry_point.bucket = info.bucket;
  entry_point.owner = info.owner;
  entry_point.creation_time = info.creation_time;
  entry_point.linked = true;
  RGWObjVersionTracker ot;
  if (pep_objv && !pep_objv->tag.empty()) {
    // caller pinned an explicit write version
    ot.write_version = *pep_objv;
  } else {
    ot.generate_new_write_ver(cct);
    if (pep_objv) {
      *pep_objv = ot.write_version;
    }
  }
  ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, dpp, RGWBucketCtl::Bucket::PutParams()
                                                                         .set_exclusive(exclusive)
                                                                         .set_objv_tracker(&ot)
                                                                         .set_mtime(mtime));
  if (ret < 0)
    return ret;

  return 0;
}
-
// Refresh count/size stats for each bucket entry in m by reading that
// bucket's index headers and summing the main-category stats. Also
// copies the bucket's placement rule into the entry. Returns the number
// of entries processed (m.size()) on success, or a negative error.
int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp)
{
  map<string, RGWBucketEnt>::iterator iter;
  for (iter = m.begin(); iter != m.end(); ++iter) {
    RGWBucketEnt& ent = iter->second;
    rgw_bucket& bucket = ent.bucket;
    // reset before re-accumulating from the index headers
    ent.count = 0;
    ent.size = 0;
    ent.size_rounded = 0;

    vector<rgw_bucket_dir_header> headers;

    RGWBucketInfo bucket_info;
    int ret = get_bucket_instance_info(bucket, bucket_info, NULL, NULL, null_yield, dpp);
    if (ret < 0) {
      return ret;
    }

    int r = cls_bucket_head(dpp, bucket_info, bucket_info.layout.current_index, RGW_NO_SHARD, headers);
    if (r < 0)
      return r;

    // sum only the main category across all shard headers
    auto hiter = headers.begin();
    for (; hiter != headers.end(); ++hiter) {
      RGWObjCategory category = main_category;
      auto iter = (hiter->stats).find(category);
      if (iter != hiter->stats.end()) {
        struct rgw_bucket_category_stats& stats = iter->second;
        ent.count += stats.num_entries;
        ent.size += stats.total_size;
        ent.size_rounded += stats.total_size_rounded;
      }
    }

    // fill in placement_rule from the bucket instance for use in swift's
    // per-storage policy statistics
    ent.placement_rule = std::move(bucket_info.placement_rule);
  }

  return m.size();
}
-
-int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl)
-{
-  rgw_rados_ref ref;
-  int r = get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-  librados::Rados *rad = get_rados_handle();
-  librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
-
-  r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
-  completion->release();
-  return r;
-}
-
-int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx)
-{
-  librados::IoCtx& io_ctx = ctx.io_ctx;
-  librados::NObjectIterator& iter = ctx.iter;
-
-  int r = open_pool_ctx(dpp, pool, io_ctx, false);
-  if (r < 0)
-    return r;
-
-  iter = io_ctx.nobjects_begin();
-
-  return 0;
-}
-
// Start iterating a pool from a serialized cursor. Returns -EINVAL when
// the cursor string does not parse; librados exceptions from
// nobjects_begin() are translated to negative errno (or -EIO, i.e. -5,
// for non-system exceptions).
int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
{
  librados::IoCtx& io_ctx = ctx.io_ctx;
  librados::NObjectIterator& iter = ctx.iter;

  int r = open_pool_ctx(dpp, pool, io_ctx, false);
  if (r < 0)
    return r;

  librados::ObjectCursor oc;
  if (!oc.from_str(cursor)) {
    ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl;
    return -EINVAL;
  }

  try {
    iter = io_ctx.nobjects_begin(oc);
    return 0;
  } catch (const std::system_error& e) {
    // propagate the underlying errno
    r = -e.code().value();
    ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
       << ", returning " << r << dendl;
    return r;
  } catch (const std::exception& e) {
    ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
       << ", returning -5" << dendl;
    return -EIO;
  }
}
-
-string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
-{
-  return ctx.iter.get_cursor().to_str();
-}
-
// Pull up to num entries from an in-progress pool iteration into objs,
// applying the optional name filter. Returns the number of entries
// appended, or -ENOENT once the iteration is exhausted; *is_truncated
// reports whether more objects remain.
static int do_pool_iterate(const DoutPrefixProvider *dpp, CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
                           vector<rgw_bucket_dir_entry>& objs,
                           bool *is_truncated, RGWAccessListFilter *filter)
{
  librados::IoCtx& io_ctx = ctx.io_ctx;
  librados::NObjectIterator& iter = ctx.iter;

  if (iter == io_ctx.nobjects_end())
    return -ENOENT;

  uint32_t i;

  for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
    rgw_bucket_dir_entry e;

    string oid = iter->get_oid();
    ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;

    // fill it in with initial values; we may correct later
    if (filter && !filter->filter(oid, oid))
      continue;

    e.key = oid;
    objs.push_back(e);
  }

  if (is_truncated)
    *is_truncated = (iter != io_ctx.nobjects_end());

  return objs.size();
}
-
// Exception-safe wrapper around do_pool_iterate(): converts librados
// iterator exceptions into negative errno (or -EIO, i.e. -5, for
// non-system exceptions).
int RGWRados::pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
                           bool *is_truncated, RGWAccessListFilter *filter)
{
  // catch exceptions from NObjectIterator::operator++()
  try {
    return do_pool_iterate(dpp, cct, ctx, num, objs, is_truncated, filter);
  } catch (const std::system_error& e) {
    int r = -e.code().value();
    ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
       << ", returning " << r << dendl;
    return r;
  } catch (const std::exception& e) {
    ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
       << ", returning -5" << dendl;
    return -EIO;
  }
}
-
-int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
-{
-  if (!ctx->initialized) {
-    int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx);
-    if (r < 0) {
-      ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
-      return r;
-    }
-    ctx->initialized = true;
-  }
-  return 0;
-}
-
// Fetch the next batch of raw object names matching prefix_filter from
// an initialized listing context. Returns the number of oids appended,
// -EINVAL if the context was never initialized, or the pool-iterate
// error (including -ENOENT at end of iteration).
int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max,
                                    RGWListRawObjsCtx& ctx, list<string>& oids,
                                    bool *is_truncated)
{
  if (!ctx.initialized) {
    return -EINVAL;
  }
  RGWAccessListFilterPrefix filter(prefix_filter);
  vector<rgw_bucket_dir_entry> objs;
  int r = pool_iterate(dpp, ctx.iter_ctx, max, objs, is_truncated, &filter);
  if (r < 0) {
    // -ENOENT just means the iteration is exhausted; don't log it
    if(r != -ENOENT)
      ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
    return r;
  }

  vector<rgw_bucket_dir_entry>::iterator iter;
  for (iter = objs.begin(); iter != objs.end(); ++iter) {
    oids.push_back(iter->key.name);
  }

  return oids.size();
}
-
-int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter,
-                              int max, RGWListRawObjsCtx& ctx, list<string>& oids,
-                              bool *is_truncated)
-{
-  if (!ctx.initialized) {
-    int r = list_raw_objects_init(dpp, pool, string(), &ctx);
-    if (r < 0) {
-      return r;
-    }
-  }
-
-  return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated);
-}
-
-string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
-{
-  return pool_iterate_get_cursor(ctx.iter_ctx);
-}
-
-int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
-                              rgw_bucket_dir_entry *dirent)
-{
-  rgw_cls_bi_entry bi_entry;
-  int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry);
-  if (r < 0 && r != -ENOENT) {
-    ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
-  }
-  if (r < 0) {
-    return r;
-  }
-  auto iter = bi_entry.data.cbegin();
-  try {
-    decode(*dirent, iter);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
-    return -EIO;
-  }
-
-  return 0;
-}
-
-int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
-                         rgw_bucket_olh_entry *olh)
-{
-  rgw_cls_bi_entry bi_entry;
-  int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry);
-  if (r < 0 && r != -ENOENT) {
-    ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
-  }
-  if (r < 0) {
-    return r;
-  }
-  auto iter = bi_entry.data.cbegin();
-  try {
-    decode(*olh, iter);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
-    return -EIO;
-  }
-
-  return 0;
-}
-
-int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
-                     BIIndexType index_type, rgw_cls_bi_entry *entry)
-{
-  BucketShard bs(this);
-  int ret = bs.init(dpp, bucket_info, obj);
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
-
-  auto& ref = bs.bucket_obj.get_ref();
-  
-  return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
-}
-
-void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
-{
-  auto& ref = bs.bucket_obj.get_ref();
-  cls_rgw_bi_put(op, ref.obj.oid, entry);
-}
-
-int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
-{
-  auto& ref = bs.bucket_obj.get_ref();
-  int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
// Store a bucket-index entry by bucket+object, resolving the shard
// ourselves (no RGWBucketInfo available on this path).
int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
{
  // make sure incomplete multipart uploads are hashed correctly
  if (obj.key.ns == RGW_OBJ_NS_MULTIPART) {
    RGWMPObj mp;
    mp.from_meta(obj.key.name);
    // shard by the upload's key so all parts land on the same shard
    obj.index_hash_source = mp.get_key();
  }
  BucketShard bs(this);

  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
  if (ret < 0) {
    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
    return ret;
  }

  return bi_put(bs, entry);
}
-
// List raw bucket-index entries for the shard that obj_name_filter hashes
// to. On -ENOENT the truncation flag is cleared, but note the error is
// still returned to the caller.
int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket,
                     const string& obj_name_filter, const string& marker, uint32_t max,
                     list<rgw_cls_bi_entry> *entries, bool *is_truncated)
{
  // the filter name doubles as the shard-selection key
  rgw_obj obj(bucket, obj_name_filter);
  BucketShard bs(this);
  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
  if (ret < 0) {
    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
    return ret;
  }

  auto& ref = bs.bucket_obj.get_ref();
  ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
  if (ret == -ENOENT) {
    *is_truncated = false;
  }
  if (ret < 0)
    return ret;

  return 0;
}
-
-int RGWRados::bi_list(BucketShard& bs, const string& obj_name_filter, const string& marker, uint32_t max,
-                     list<rgw_cls_bi_entry> *entries, bool *is_truncated)
-{
-  auto& ref = bs.bucket_obj.get_ref();
-  int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWRados::bi_list(const DoutPrefixProvider *dpp,
-                     const RGWBucketInfo& bucket_info, int shard_id, const string& obj_name_filter, const string& marker, uint32_t max,
-                     list<rgw_cls_bi_entry> *entries, bool *is_truncated)
-{
-  BucketShard bs(this);
-  int ret = bs.init(dpp, bucket_info,
-                   bucket_info.layout.current_index,
-                   shard_id);
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  return bi_list(bs, obj_name_filter, marker, max, entries, is_truncated);
-}
-
-int RGWRados::bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs)
-{
-  auto& ref = bs.bucket_obj.get_ref();
-  int ret = ref.pool.ioctx().remove(ref.obj.oid);
-  if (ret == -ENOENT) {
-    ret = 0;
-  }
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op)
-{
-  return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield);
-}
-
-int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
-                             librados::ObjectWriteOperation *op)
-{
-  return gc_pool_ctx.aio_operate(oid, c, op);
-}
-
-int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
-{
-  return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield);
-}
-
-int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
-{
-  return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
-}
-
-int RGWRados::process_gc(bool expired_only)
-{
-  return gc->process(expired_only);
-}
-
-int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
-                              vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
-                              int& index)
-{
-  return lc->list_lc_progress(marker, max_entries, progress_map, index);
-}
-
// Run one inline lifecycle pass (admin/debug path), optionally limited
// to a single bucket. A throwaway RGWLC instance and worker are built
// just for this pass.
int RGWRados::process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket)
{
  RGWLC lc;
  lc.initialize(cct, this->driver);
  // the worker references the same lc instance twice (as LC and as its
  // dout prefix provider)
  RGWLC::LCWorker worker(&lc, cct, &lc, 0);
  auto ret = lc.process(&worker, optional_bucket, true /* once */);
  lc.stop_processor(); // sets down_flag, but returns immediately
  return ret;
}
-
-bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp)
-{
-  return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now());
-}
-
// Phase 1 of the two-phase bucket index update: record a pending
// transaction (tagged) on the object's index shard before the data op.
// The op asserts the shard object exists and is guarded against an
// in-progress reshard (-ERR_BUSY_RESHARDING). The local zone is added to
// the zone trace to prevent sync loops.
int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag,
                                 rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
{
  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs << " obj=" << obj << " tag=" << tag << " op=" << op << dendl_bitx;
  ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;

  rgw_zone_set zones_trace;
  if (_zones_trace) {
    zones_trace = *_zones_trace;
  }
  // mark ourselves in the trace so multisite doesn't replay this change here
  zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());

  ObjectWriteOperation o;
  o.assert_exists(); // bucket index shard must exist

  cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
  cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
  cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
  int ret = bs.bucket_obj.operate(dpp, &o, y);
  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
  return ret;
}
-
// Phase 2 of the two-phase bucket index update: asynchronously complete
// the transaction identified by tag on the object's index shard, writing
// the final dir entry (with the given category) and optionally removing
// other index keys. Completion bookkeeping is handed to the index
// completion manager; the aio completion it owns is released here.
int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
                                  int64_t pool, uint64_t epoch,
                                  rgw_bucket_dir_entry& ent, RGWObjCategory category,
                                 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
{
  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
  ldout_bitx_c(bitx, cct, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs <<
    " obj=" << obj << " tag=" << tag << " op=" << op <<
    ", remove_objs=" << (remove_objs ? *remove_objs : std::list<rgw_obj_index_key>()) << dendl_bitx;
  ldout_bitx_c(bitx, cct, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;

  ObjectWriteOperation o;
  o.assert_exists(); // bucket index shard must exist

  rgw_bucket_dir_entry_meta dir_meta;
  dir_meta = ent.meta;
  dir_meta.category = category;

  rgw_zone_set zones_trace;
  if (_zones_trace) {
    zones_trace = *_zones_trace;
  }
  // mark ourselves in the trace so multisite doesn't replay this change here
  zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());

  rgw_bucket_entry_ver ver;
  ver.pool = pool;
  ver.epoch = epoch;
  cls_rgw_obj_key key(ent.key.name, ent.key.instance);
  cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
  cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
                             svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
  complete_op_data *arg;
  index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
                                              svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
  librados::AioCompletion *completion = arg->rados_completion;
  int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
  completion->release(); /* can't reference arg here, as it might have already been released */

  ldout_bitx_c(bitx, cct, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
  return ret;
}
-
-int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
-                                   int64_t pool, uint64_t epoch,
-                                   rgw_bucket_dir_entry& ent, RGWObjCategory category,
-                                   list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
-{
-  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
-}
-
-int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
-                                   int64_t pool, uint64_t epoch,
-                                   rgw_obj& obj,
-                                   real_time& removed_mtime,
-                                   list<rgw_obj_index_key> *remove_objs,
-                                   uint16_t bilog_flags,
-                                   rgw_zone_set *zones_trace)
-{
-  rgw_bucket_dir_entry ent;
-  ent.meta.mtime = removed_mtime;
-  obj.key.get_index_key(&ent.key);
-  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
-                            ent, RGWObjCategory::None, remove_objs,
-                            bilog_flags, zones_trace);
-}
-
-int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj,
-                                      list<rgw_obj_index_key> *remove_objs,
-                                      uint16_t bilog_flags, rgw_zone_set *zones_trace)
-{
-  rgw_bucket_dir_entry ent;
-  obj.key.get_index_key(&ent.key);
-  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
-                            -1 /* pool id */, 0, ent,
-                            RGWObjCategory::None, remove_objs, bilog_flags,
-                            zones_trace);
-}
-
-int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout)
-{
-  RGWSI_RADOS::Pool index_pool;
-  map<int, string> bucket_objs;
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
-  if (r < 0)
-    return r;
-
-  return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
-}
-
-
-// returns 0 if there is an error in calculation
-// Compute how many entries to request from each of num_shards bucket
-// index shards so that, after the cross-shard merge, num_entries total
-// entries can likely be produced without grossly over-fetching.
-uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
-                                                     uint32_t num_shards)
-{
-  if (num_shards == 0) {
-    // we'll get a floating point exception since we divide by
-    // num_shards
-    return 0;
-  }
-
-  // We want to minimize the chances that when num_shards >>
-  // num_entries that we return much fewer than num_entries to the
-  // client. Given all the overhead of making a cls call to the osd,
-  // returning a few entries is not much more work than returning one
-  // entry. This minimum might be better tuned based on future
-  // experiments where num_shards >> num_entries. (Note: ">>" should
-  // be interpreted as "much greater than".)
-  constexpr uint32_t min_read = 8;
-
-  // The following is based on _"Balls into Bins" -- A Simple and
-  // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
-  // cases when num_shards >> num_entries (it almost serves as a
-  // ceiling calculation). We also assume alpha is 1.0 and extract it
-  // from the calculation. Future work could involve memoizing some of
-  // the transcendental functions to minimize repeatedly re-calling
-  // them with the same parameters, which we expect to be the case the
-  // majority of the time.
-  uint32_t calc_read =
-    1 +
-    static_cast<uint32_t>((num_entries / num_shards) +
-                         sqrt((2 * num_entries) *
-                              log(num_shards) / num_shards));
-
-  return std::max(min_read, calc_read);
-}
-
-
-// Ordered (lexical) bucket listing across index shards. Issues a list op
-// to every requested shard in parallel, then merge-sorts the per-shard
-// results (via a candidates multimap) until num_entries entries are
-// accumulated or a truncated shard is exhausted. Entries that have
-// uncommitted ops are reconciled with the head object through
-// check_disk_state, and index repairs are suggested back to the shards
-// as fire-and-forget aio ops. Outputs: m (merged entries), *is_truncated,
-// *cls_filtered (true only if every shard applied filtering server-side),
-// and *last_entry (resume marker), when non-null.
-int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
-                                      RGWBucketInfo& bucket_info,
-                                      const rgw::bucket_index_layout_generation& idx_layout,
-                                      const int shard_id,
-                                     const rgw_obj_index_key& start_after,
-                                     const std::string& prefix,
-                                     const std::string& delimiter,
-                                     const uint32_t num_entries,
-                                     const bool list_versions,
-                                     const uint16_t expansion_factor,
-                                     ent_map_t& m,
-                                     bool* is_truncated,
-                                     bool* cls_filtered,
-                                     rgw_obj_index_key* last_entry,
-                                      optional_yield y,
-                                     RGWBucketListNameFilter force_check_filter)
-{
-  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
-
-  /* expansion_factor allows the number of entries to read to grow
-   * exponentially; this is used when earlier reads are producing too
-   * few results, perhaps due to filtering or to a series of
-   * namespaced entries */
-
-  // NOTE(review): the log line below prints shard_id twice and the
-  // quote placement around num_entries is mismatched — cosmetic only.
-  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
-    " start_after=\"" << start_after.to_string() <<
-    "\", prefix=\"" << prefix <<
-    ", delimiter=\"" << delimiter <<
-    "\", shard_id=" << shard_id <<
-    "\", num_entries=" << num_entries <<
-    ", shard_id=" << shard_id <<
-    ", list_versions=" << list_versions <<
-    ", expansion_factor=" << expansion_factor <<
-    ", force_check_filter is " <<
-    (force_check_filter ? "set" : "unset") << dendl_bitx;
-  ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
-
-  m.clear();
-
-  RGWSI_RADOS::Pool index_pool;
-  // key   - oid (for different shards if there is any)
-  // value - list result for the corresponding oid (shard), it is filled by
-  //         the AIO callback
-  std::map<int, std::string> shard_oids;
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout,
-                                         &index_pool, &shard_oids,
-                                         nullptr);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << __func__ <<
-      ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl;
-    return r;
-  }
-
-  const uint32_t shard_count = shard_oids.size();
-  if (shard_count == 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-      ": the bucket index shard count appears to be 0, "
-      "which is an illegal value" << dendl;
-    return -ERR_INVALID_BUCKET_STATE;
-  }
-
-  // decide the per-shard read size: first pass uses the balls-into-bins
-  // estimate; retries scale it by 2^(expansion_factor-1), capped at the
-  // full num_entries
-  uint32_t num_entries_per_shard;
-  if (expansion_factor == 0) {
-    num_entries_per_shard =
-      calc_ordered_bucket_list_per_shard(num_entries, shard_count);
-  } else if (expansion_factor <= 11) {
-    // we'll max out the exponential multiplication factor at 1024 (2<<10)
-    num_entries_per_shard =
-      std::min(num_entries,
-              (uint32_t(1 << (expansion_factor - 1)) *
-               calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
-  } else {
-    num_entries_per_shard = num_entries;
-  }
-
-  if (num_entries_per_shard == 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-      ": unable to calculate the number of entries to read from each "
-      "bucket index shard" << dendl;
-    return -ERR_INVALID_BUCKET_STATE;
-  }
-
-  ldpp_dout(dpp, 10) << __func__ <<
-    ": request from each of " << shard_count <<
-    " shard(s) for " << num_entries_per_shard << " entries to get " <<
-    num_entries << " total entries" << dendl;
-
-  auto& ioctx = index_pool.ioctx();
-  // issue the list op to all shards in parallel and gather the results
-  std::map<int, rgw_cls_list_ret> shard_list_results;
-  cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
-  r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
-                           num_entries_per_shard,
-                           list_versions, shard_oids, shard_list_results,
-                           cct->_conf->rgw_bucket_index_max_aio)();
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << __func__ <<
-      ": CLSRGWIssueBucketList for " << bucket_info.bucket <<
-      " failed" << dendl;
-    return r;
-  }
-
-  // to manage the iterators through each shard's list results
-  struct ShardTracker {
-    const size_t shard_idx;
-    rgw_cls_list_ret& result;
-    const std::string& oid_name;
-    RGWRados::ent_map_t::iterator cursor;
-    RGWRados::ent_map_t::iterator end;
-
-    // manages an iterator through a shard and provides other
-    // accessors
-    ShardTracker(size_t _shard_idx,
-                rgw_cls_list_ret& _result,
-                const std::string& _oid_name):
-      shard_idx(_shard_idx),
-      result(_result),
-      oid_name(_oid_name),
-      cursor(_result.dir.m.begin()),
-      end(_result.dir.m.end())
-    {}
-
-    inline const std::string& entry_name() const {
-      return cursor->first;
-    }
-    rgw_bucket_dir_entry& dir_entry() const {
-      return cursor->second;
-    }
-    inline bool is_truncated() const {
-      return result.is_truncated;
-    }
-    inline ShardTracker& advance() {
-      ++cursor;
-      // return a self-reference to allow for chaining of calls, such
-      // as x.advance().at_end()
-      return *this;
-    }
-    inline bool at_end() const {
-      return cursor == end;
-    }
-  }; // ShardTracker
-
-  // add the next unique candidate, or return false if we reach the end
-  auto next_candidate = [] (CephContext *cct, ShardTracker& t,
-                            std::multimap<std::string, size_t>& candidates,
-                            size_t tracker_idx) {
-    if (!t.at_end()) {
-      candidates.emplace(t.entry_name(), tracker_idx);
-    }
-    return;
-  };
-
-  // one tracker per shard requested (may not be all shards)
-  std::vector<ShardTracker> results_trackers;
-  results_trackers.reserve(shard_list_results.size());
-  for (auto& r : shard_list_results) {
-    results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
-
-    // if any *one* shard's result is trucated, the entire result is
-    // truncated
-    *is_truncated = *is_truncated || r.second.is_truncated;
-
-    // unless *all* are shards are cls_filtered, the entire result is
-    // not filtered
-    *cls_filtered = *cls_filtered && r.second.cls_filtered;
-  }
-
-  // create a map to track the next candidate entry from ShardTracker
-  // (key=candidate, value=index into results_trackers); as we consume
-  // entries from shards, we replace them with the next entries in the
-  // shards until we run out
-  std::multimap<std::string, size_t> candidates;
-  size_t tracker_idx = 0;
-  std::vector<size_t> vidx;
-  vidx.reserve(shard_list_results.size());
-  for (auto& t : results_trackers) {
-    // it's important that the values in the map refer to the index
-    // into the results_trackers vector, which may not be the same
-    // as the shard number (i.e., when not all shards are requested)
-    next_candidate(cct, t, candidates, tracker_idx);
-    ++tracker_idx;
-  }
-
-  rgw_bucket_dir_entry*
-    last_entry_visited = nullptr; // to set last_entry (marker)
-  std::map<std::string, bufferlist> updates;
-  uint32_t count = 0;
-  // merge loop: repeatedly take the lexically-smallest candidate across
-  // all shard cursors until we have num_entries or run out of candidates
-  while (count < num_entries && !candidates.empty()) {
-    r = 0;
-    // select the next entry in lexical order (first key in map);
-    // again tracker_idx is not necessarily shard number, but is index
-    // into results_trackers vector
-    tracker_idx = candidates.begin()->second;
-    auto& tracker = results_trackers.at(tracker_idx);
-
-    const std::string& name = tracker.entry_name();
-    rgw_bucket_dir_entry& dirent = tracker.dir_entry();
-
-    ldpp_dout(dpp, 20) << __func__ << ": currently processing " <<
-      dirent.key << " from shard " << tracker.shard_idx << dendl;
-
-    const bool force_check =
-      force_check_filter && force_check_filter(dirent.key.name);
-
-    if ((!dirent.exists &&
-        !dirent.is_delete_marker() &&
-        !dirent.is_common_prefix()) ||
-        !dirent.pending_map.empty() ||
-        force_check) {
-      /* there are uncommitted ops. We need to check the current
-       * state, and if the tags are old we need to do clean-up as
-       * well. */
-      librados::IoCtx sub_ctx;
-      sub_ctx.dup(ioctx);
-      ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
-       " calling check_disk_state bucket=" << bucket_info.bucket <<
-       " entry=" << dirent.key << dendl_bitx;
-      r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
-                          updates[tracker.oid_name], y);
-      if (r < 0 && r != -ENOENT) {
-       ldpp_dout(dpp, 0) << __func__ <<
-         ": check_disk_state for \"" << dirent.key <<
-         "\" failed with r=" << r << dendl;
-       return r;
-      }
-    } else {
-      r = 0;
-    }
-
-    // at this point either r >= 0 or r == -ENOENT
-    if (r >= 0) { // i.e., if r != -ENOENT
-      ldpp_dout(dpp, 10) << __func__ << ": got " <<
-       dirent.key << dendl;
-
-      auto [it, inserted] = m.insert_or_assign(name, std::move(dirent));
-      last_entry_visited = &it->second;
-      if (inserted) {
-       ++count;
-      } else {
-       ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
-         " reassigned map value at \"" << name <<
-         "\", which should not happen" << dendl;
-      }
-    } else {
-      ldpp_dout(dpp, 10) << __func__ << ": skipping " <<
-       dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
-      last_entry_visited = &tracker.dir_entry();
-    }
-
-    // refresh the candidates map: every tracker that offered this same
-    // key advances past it and contributes its next entry (if any)
-    vidx.clear();
-    bool need_to_stop = false;
-    auto range = candidates.equal_range(name);
-    for (auto i = range.first; i != range.second; ++i) {
-      vidx.push_back(i->second);
-    } 
-    candidates.erase(range.first, range.second);
-    for (auto idx : vidx) {
-      auto& tracker_match = results_trackers.at(idx);
-      tracker_match.advance();
-      next_candidate(cct, tracker_match, candidates, idx);
-      if (tracker_match.at_end() && tracker_match.is_truncated()) {
-        need_to_stop = true;
-        break;
-      }
-    }
-    if (need_to_stop) {
-      // once we exhaust one shard that is truncated, we need to stop,
-      // as we cannot be certain that one of the next entries needs to
-      // come from that shard; S3 and swift protocols allow returning
-      // fewer than what was requested
-      ldpp_dout(dpp, 10) << __func__ <<
-       ": stopped accumulating results at count=" << count <<
-       ", dirent=\"" << dirent.key <<
-       "\", because its shard is truncated and exhausted" << dendl;
-      break;
-    }
-  } // while we haven't provided requested # of result entries
-
-  // suggest updates if there are any
-  for (auto& miter : updates) {
-    if (miter.second.length()) {
-      ObjectWriteOperation o;
-      cls_rgw_suggest_changes(o, miter.second);
-      // we don't care if we lose suggested updates, send them off blindly
-      AioCompletion *c =
-       librados::Rados::aio_create_completion(nullptr, nullptr);
-
-      ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
-       ": doing dir_suggest on " << miter.first << dendl_bitx;
-      ioctx.aio_operate(miter.first, c, &o);
-      c->release();
-    }
-  } // updates loop
-
-  // determine truncation by checking if all the returned entries are
-  // consumed or not
-  *is_truncated = false;
-  for (const auto& t : results_trackers) {
-    if (!t.at_end() || t.is_truncated()) {
-      *is_truncated = true;
-      break;
-    }
-  }
-
-  ldpp_dout(dpp, 20) << __func__ <<
-    ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
-    dendl;
-
-  if (*is_truncated && count < num_entries) {
-    ldpp_dout(dpp, 10) << __func__ <<
-      ": requested " << num_entries << " entries but returning " <<
-      count << ", which is truncated" << dendl;
-  }
-
-  if (last_entry_visited != nullptr && last_entry) {
-    *last_entry = last_entry_visited->key;
-    ldpp_dout(dpp, 20) << __func__ <<
-      ": returning, last_entry=" << *last_entry << dendl;
-  } else {
-    ldpp_dout(dpp, 20) << __func__ <<
-      ": returning, last_entry NOT SET" << dendl;
-  }
-
-  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
-  return 0;
-} // RGWRados::cls_bucket_list_ordered
-
-
-// A helper function to retrieve the hash source from an incomplete
-// multipart entry by removing everything from the second to last
-// period on.
-// Returns 0 and fills *index_hash_source on success; -EINVAL when the
-// oid does not contain two periods with at least one leading character.
-static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
-  // locate the last period; it must not be the first character
-  std::size_t found = oid_wo_ns.rfind('.');
-  if (found == std::string::npos || found < 1) {
-    return -EINVAL;
-  }
-  // locate the second-to-last period, again requiring a leading character
-  found = oid_wo_ns.rfind('.', found - 1);
-  if (found == std::string::npos || found < 1) {
-    return -EINVAL;
-  }
-  *index_hash_source = oid_wo_ns.substr(0, found);
-  return 0;
-}
-
-
-// Unordered bucket listing: walks index shards one at a time — starting
-// from the shard implied by the start_after marker (or pinned to
-// shard_id when >= 0) — and returns entries in per-shard order rather
-// than globally lexical order. Cheaper than the ordered variant since
-// no cross-shard merge is needed. Entries with uncommitted ops are
-// reconciled via check_disk_state and index repairs are suggested back
-// to the shards as fire-and-forget aio ops.
-int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
-                                        RGWBucketInfo& bucket_info,
-                                        const rgw::bucket_index_layout_generation& idx_layout,
-                                        int shard_id,
-                                       const rgw_obj_index_key& start_after,
-                                       const std::string& prefix,
-                                       uint32_t num_entries,
-                                       bool list_versions,
-                                       std::vector<rgw_bucket_dir_entry>& ent_list,
-                                       bool *is_truncated,
-                                       rgw_obj_index_key *last_entry,
-                                        optional_yield y,
-                                       RGWBucketListNameFilter force_check_filter) {
-  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
-
-  // NOTE(review): the log line below appears to be missing the
-  // ", force_check_filter is " label before the set/unset value, and
-  // the quote placement around num_entries is mismatched — cosmetic only.
-  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
-    " start_after=\"" << start_after <<
-    "\", prefix=\"" << prefix <<
-    "\", shard_id=" << shard_id <<
-    "\", num_entries=" << num_entries <<
-    ", list_versions=" << list_versions <<
-    (force_check_filter ? "set" : "unset") << dendl_bitx;
-  ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
-
-  ent_list.clear();
-  static MultipartMetaFilter multipart_meta_filter;
-
-  *is_truncated = false;
-  RGWSI_RADOS::Pool index_pool;
-
-  std::map<int, std::string> oids;
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, nullptr);
-  if (r < 0) {
-    return r;
-  }
-
-  auto& ioctx = index_pool.ioctx();
-
-  const uint32_t num_shards = oids.size();
-
-  // determine which shard to start reading from: pinned shard, shard 0,
-  // or the shard that the marker's key hashes to
-  rgw_obj_index_key marker = start_after;
-  uint32_t current_shard;
-  if (shard_id >= 0) {
-    current_shard = shard_id;
-  } else if (start_after.empty()) {
-    current_shard = 0u;
-  } else {
-    // at this point we have a marker (start_after) that has something
-    // in it, so we need to get to the bucket shard index, so we can
-    // start reading from there
-
-
-    // now convert the key (oid) to an rgw_obj_key since that will
-    // separate out the namespace, name, and instance
-    rgw_obj_key obj_key;
-    bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key);
-    if (!parsed) {
-      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-       " received an invalid start marker: \"" << start_after << "\"" <<
-       dendl;
-      return -EINVAL;
-    } else if (obj_key.name.empty()) {
-      // if the name is empty that means the object name came in with
-      // a namespace only, and therefore we need to start our scan at
-      // the first bucket index shard
-      current_shard = 0u;
-    } else {
-      // so now we have the key used to compute the bucket index shard
-      // and can extract the specific shard from it
-      if (obj_key.ns == RGW_OBJ_NS_MULTIPART) {
-        // Use obj_key.ns == RGW_OBJ_NS_MULTIPART instead of
-        // the implementation relying on MultipartMetaFilter
-        // because MultipartMetaFilter only checks .meta suffix, which may
-        // exclude data multiparts but include some regular objects with .meta suffix
-        // by mistake.
-        string index_hash_source;
-        r = parse_index_hash_source(obj_key.name, &index_hash_source);
-        if (r < 0) {
-         ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-           " parse_index_hash_source unable to parse \"" << obj_key.name <<
-           "\", r=" << r << dendl;
-          return r;
-        }
-        current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards);
-      } else {
-        current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
-      }
-    }
-  }
-
-  uint32_t count = 0u;
-  std::map<std::string, bufferlist> updates;
-  rgw_obj_index_key last_added_entry;
-  // shard loop: read each shard to completion before moving to the next;
-  // when shard_id is pinned we never advance past that single shard
-  while (count <= num_entries &&
-        ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
-         current_shard < num_shards)) {
-    const std::string& oid = oids[current_shard];
-    rgw_cls_list_ret result;
-
-    librados::ObjectReadOperation op;
-    const std::string empty_delimiter;
-    cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
-                          num_entries,
-                           list_versions, &result);
-    r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-       ": error in rgw_rados_operate (bucket list op), r=" << r << dendl;
-      return r;
-    }
-
-    for (auto& entry : result.dir.m) {
-      rgw_bucket_dir_entry& dirent = entry.second;
-
-      bool force_check = force_check_filter &&
-       force_check_filter(dirent.key.name);
-      if ((!dirent.exists && !dirent.is_delete_marker()) ||
-         !dirent.pending_map.empty() ||
-         force_check) {
-       /* there are uncommitted ops. We need to check the current state,
-        * and if the tags are old we need to do cleanup as well. */
-       librados::IoCtx sub_ctx;
-       sub_ctx.dup(ioctx);
-       ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
-         ": calling check_disk_state bucket=" << bucket_info.bucket <<
-         " entry=" << dirent.key << dendl_bitx;
-       r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
-       if (r < 0 && r != -ENOENT) {
-         ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-           ": error in check_disk_state, r=" << r << dendl;
-         return r;
-       }
-      } else {
-        r = 0;
-      }
-
-      // at this point either r >= 0 or r == -ENOENT
-      if (r >= 0) { // i.e., if r != -ENOENT
-       ldpp_dout(dpp, 10) << __func__ << ": got " <<
-         dirent.key << dendl;
-
-       if (count < num_entries) {
-         marker = last_added_entry = dirent.key; // double assign
-         ent_list.emplace_back(std::move(dirent));
-         ++count;
-       } else {
-         last_added_entry = dirent.key;
-         *is_truncated = true;
-         ldpp_dout(dpp, 10) << "INFO: " << __func__ <<
-           ": reached max entries (" << num_entries << ") to return at \"" <<
-           dirent.key << "\"" << dendl;
-         goto check_updates;
-       }
-      } else { // r == -ENOENT
-       // in the case of -ENOENT, make sure we're advancing marker
-       // for possible next call to CLSRGWIssueBucketList
-       marker = dirent.key;
-      }
-    } // entry for loop
-
-    if (!result.is_truncated) {
-      // if we reached the end of the shard read next shard
-      ++current_shard;
-      marker = rgw_obj_index_key();
-    }
-  } // shard loop
-
-check_updates:
-
-  // suggest updates if there is any
-  std::map<std::string, bufferlist>::iterator miter = updates.begin();
-  for (; miter != updates.end(); ++miter) {
-    if (miter->second.length()) {
-      ObjectWriteOperation o;
-      cls_rgw_suggest_changes(o, miter->second);
-      // we don't care if we lose suggested updates, send them off blindly
-      AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
-
-      ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
-       " doing dir_suggest on " << miter->first << dendl_bitx;
-      ioctx.aio_operate(miter->first, c, &o);
-      c->release();
-    }
-  }
-
-  if (last_entry && !ent_list.empty()) {
-    *last_entry = last_added_entry;
-  }
-
-  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
-  return 0;
-} // RGWRados::cls_bucket_list_unordered
-
-
-// Append usage-log records (info) to the usage log object `oid` in the
-// zone's usage_log_pool via the cls_rgw usage-log class.
-int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid,
-                                   rgw_usage_log_info& info)
-{
-  rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
-
-  rgw_rados_ref ref;
-  int r = get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  ObjectWriteOperation op;
-  cls_rgw_usage_log_add(op, info);
-
-  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
-  return r;
-}
-
-// Read up to max_entries usage records for user/bucket within
-// [start_epoch, end_epoch) from the usage log object `oid`. read_iter
-// is an opaque resume cursor updated by the cls call; *is_truncated is
-// set when more records remain.
-int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
-                                     uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
-                                     string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
-                                    bool *is_truncated)
-{
-  rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
-
-  rgw_rados_ref ref;
-  int r = get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  *is_truncated = false;
-
-  r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
-                            max_entries, read_iter, usage, is_truncated);
-
-  return r;
-}
-
-// Repeatedly issue usage-log trim ops against `ref` until the cls call
-// returns -ENODATA (nothing left in the requested range); each op trims
-// a bounded batch server-side. Returns 0 on completion or the first
-// other negative error.
-static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
-{
-  bool done = false;
-  do {
-    librados::ObjectWriteOperation op;
-    cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
-    int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
-    if (r == -ENODATA)
-      done = true;
-    else if (r < 0)
-      return r;
-  } while (!done);
-
-  return 0;
-}
-
-// Trim usage records for user/bucket within [start_epoch, end_epoch)
-// from the usage log object `oid`, looping until the range is empty.
-int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
-                                    uint64_t start_epoch, uint64_t end_epoch)
-{
-  rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
-
-  rgw_rados_ref ref;
-  int r = get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch);
-  return r;
-}
-
-// Remove all usage records from the usage log object `oid` in a single
-// cls operation (unlike trim, no epoch range or batching is involved).
-int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid)
-{
-  rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
-
-  rgw_rados_ref ref;
-  int r = get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-  librados::ObjectWriteOperation op;
-  cls_rgw_usage_log_clear(op);
-  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
-  return r;
-}
-
-
-// note: this removes entries from the rados bucket index objects
-// without going through CLS; this is known to be called from
-// "radosgw-admin unlink" and "radosgw-admin bucket check --fix"
-// Keys are hashed to their shard with the same function the index
-// uses, then removed per-shard with raw omap_rm_keys. Returns -EINVAL
-// for indexless buckets, else 0 or the first error encountered.
-int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp,
-                                    RGWBucketInfo& bucket_info,
-                                    const std::list<rgw_obj_index_key>& entry_key_list)
-{
-  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
-  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << bucket_info.bucket <<
-    " entry_key_list.size()=" << entry_key_list.size() << dendl_bitx;
-  ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
-
-  // indexless buckets have no index entries to remove
-  const auto& current_index = bucket_info.get_current_index();
-  if (is_layout_indexless(current_index)) {
-    return -EINVAL;
-  }
-  const uint32_t num_shards = current_index.layout.normal.num_shards;
-
-  RGWSI_RADOS::Pool index_pool;
-  std::map<int, std::string> index_oids;
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt,
-                                         bucket_info.layout.current_index,
-                                         &index_pool, &index_oids, nullptr);
-  if (r < 0) {
-    ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
-      " open_bucket_index returned " << r << dendl_bitx;
-    return r;
-  }
-
-  // split up removals by shard
-  std::map<int, std::set<std::string>> sharded_removals;
-  for (const auto& entry_key : entry_key_list) {
-    const rgw_obj_key obj_key(entry_key);
-    const uint32_t shard =
-      RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards);
-
-    // entry_key already combines namespace and name, so we first have
-    // to break that apart before we can then combine with instance
-    std::string name;
-    std::string ns; // namespace
-    rgw_obj_key::parse_index_key(entry_key.name, &name, &ns);
-    rgw_obj_key full_key(name, entry_key.instance, ns);
-    std::string combined_key = full_key.get_oid();
-
-    sharded_removals[shard].insert(combined_key);
-
-    ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
-      ": removal from bucket index, bucket=" << bucket_info.bucket <<
-      " key=" << combined_key << " designated for shard " << shard <<
-      dendl_bitx;
-  }
-
-  // perform one omap_rm_keys call per shard that has removals
-  for (const auto& removals : sharded_removals) {
-    const int shard = removals.first;
-    const std::string& oid = index_oids[shard];
-
-    ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
-      ": removal from bucket index, bucket=" << bucket_info.bucket <<
-      ", shard=" << shard << ", oid=" << oid << ", num_keys=" <<
-      removals.second.size() << dendl_bitx;
-
-    r = index_pool.ioctx().omap_rm_keys(oid, removals.second);
-    if (r < 0) {
-      ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
-       ": omap_rm_keys returned ret=" << r <<
-       dendl_bitx;
-      return r;
-    }
-  }
-
-  ldout_bitx(bitx, dpp, 5) <<
-    "EXITING " << __func__ << " and returning " << r << dendl_bitx;
-
-  return r;
-}
-
-int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
-                               librados::IoCtx io_ctx,
-                               RGWBucketInfo& bucket_info,
-                               rgw_bucket_dir_entry& list_state,
-                               rgw_bucket_dir_entry& object,
-                               bufferlist& suggested_updates,
-                               optional_yield y)
-{
-  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
-  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" <<
-    bucket_info.bucket << " dir_entry=" << list_state.key << dendl_bitx;
-
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  driver->get_bucket(nullptr, bucket_info, &bucket);
-  uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
-
-  std::string loc;
-
-  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(list_state.key);
-  MultipartMetaFilter multipart_meta_filter;
-  string temp_key;
-  if (multipart_meta_filter.filter(list_state.key.name, temp_key)) {
-    obj->set_in_extra_data(true);
-  }
-
-  string oid;
-  get_obj_bucket_and_oid_loc(obj->get_obj(), oid, loc);
-
-  if (loc != list_state.locator) {
-    ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
-  }
-
-  io_ctx.locator_set_key(list_state.locator);
-
-  RGWObjState *astate = NULL;
-  RGWObjManifest *manifest = nullptr;
-  RGWObjectCtx rctx(this->driver);
-  int r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y);
-  if (r < 0)
-    return r;
-
-  list_state.pending_map.clear(); // we don't need this and it inflates size
-  if (!list_state.is_delete_marker() && !astate->exists) {
-    ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": disk state exists" << dendl_bitx;
-      /* object doesn't exist right now -- hopefully because it's
-       * marked as !exists and got deleted */
-    if (list_state.exists) {
-      ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": index list state exists" << dendl_bitx;
-      /* FIXME: what should happen now? Work out if there are any
-       * non-bad ways this could happen (there probably are, but annoying
-       * to handle!) */
-    }
-
-    // encode a suggested removal of that key
-    list_state.ver.epoch = io_ctx.get_last_version();
-    list_state.ver.pool = io_ctx.get_id();
-    ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": encoding remove of " << list_state.key << " on suggested_updates" << dendl_bitx;
-    cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | suggest_flag, list_state, suggested_updates);
-    return -ENOENT;
-  }
-
-  string etag;
-  string content_type;
-  string storage_class;
-  ACLOwner owner;
-  bool appendable = false;
-
-  object.meta.size = astate->size;
-  object.meta.accounted_size = astate->accounted_size;
-  object.meta.mtime = astate->mtime;
-
-  map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
-  if (iter != astate->attrset.end()) {
-    etag = rgw_bl_str(iter->second);
-  }
-  iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
-  if (iter != astate->attrset.end()) {
-    content_type = rgw_bl_str(iter->second);
-  }
-  iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
-  if (iter != astate->attrset.end()) {
-    storage_class = rgw_bl_str(iter->second);
-  }
-  iter = astate->attrset.find(RGW_ATTR_ACL);
-  if (iter != astate->attrset.end()) {
-    r = decode_policy(dpp, iter->second, &owner);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl;
-    }
-  }
-  iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
-  if (iter != astate->attrset.end()) {
-    appendable = true;
-  }
-
-  if (manifest) {
-    RGWObjManifest::obj_iterator miter;
-    for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
-      const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(driver);
-      rgw_obj loc;
-      RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_obj().bucket, raw_loc, &loc);
-
-      if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
-       ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << " removing manifest part from index loc=" << loc << dendl_bitx;
-       r = delete_obj_index(loc, astate->mtime, dpp);
-       if (r < 0) {
-         ldout_bitx(bitx, dpp, 0) <<
-           "WARNING: " << __func__ << ": delete_obj_index returned r=" << r << dendl_bitx;
-       }
-      }
-    }
-  }
-
-  object.meta.etag = etag;
-  object.meta.content_type = content_type;
-  object.meta.storage_class = storage_class;
-  object.meta.owner = owner.get_id().to_str();
-  object.meta.owner_display_name = owner.get_display_name();
-  object.meta.appendable = appendable;
-
-  // encode suggested updates
-
-  list_state.meta.size = object.meta.size;
-  list_state.meta.accounted_size = object.meta.accounted_size;
-  list_state.meta.mtime = object.meta.mtime;
-  list_state.meta.category = main_category;
-  list_state.meta.etag = etag;
-  list_state.meta.appendable = appendable;
-  list_state.meta.content_type = content_type;
-  list_state.meta.storage_class = storage_class;
-
-  librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id
-  r = get_obj_head_ioctx(dpp, bucket_info, obj->get_obj(), &head_obj_ctx);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << __func__ <<
-      " WARNING: unable to find head object data pool for \"" <<
-      obj << "\", not updating version pool/epoch" << dendl;
-  } else {
-    list_state.ver.pool = head_obj_ctx.get_id();
-    list_state.ver.epoch = astate->epoch;
-  }
-
-  if (astate->obj_tag.length() > 0) {
-    list_state.tag = astate->obj_tag.c_str();
-  }
-
-  list_state.meta.owner = owner.get_id().to_str();
-  list_state.meta.owner_display_name = owner.get_display_name();
-
-  list_state.exists = true;
-
-  ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
-    ": encoding update of " << list_state.key << " on suggested_updates" << dendl_bitx;
-  cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
-
-  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
-  return 0;
-} // RGWRados::check_disk_state
-
-int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
-{
-  RGWSI_RADOS::Pool index_pool;
-  map<int, string> oids;
-  map<int, struct rgw_cls_list_ret> list_results;
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, bucket_instance_ids);
-  if (r < 0) {
-    ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned "
-                   << r << dendl;
-    return r;
-  }
-
-  r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
-  if (r < 0) {
-    ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
-                   << r << dendl;
-    return r;
-  }
-
-  map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
-  for(; iter != list_results.end(); ++iter) {
-    headers.push_back(std::move(iter->second.dir.header));
-  }
-  return 0;
-}
-
-int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
-{
-  RGWSI_RADOS::Pool index_pool;
-  map<int, string> bucket_objs;
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &bucket_objs, nullptr);
-  if (r < 0)
-    return r;
-
-  map<int, string>::iterator iter = bucket_objs.begin();
-  for (; iter != bucket_objs.end(); ++iter) {
-    r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
-    if (r < 0) {
-      ctx->put();
-      break;
-    } else {
-      (*num_aio)++;
-    }
-  }
-  return r;
-}
-
-int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
-                                 const rgw_bucket& bucket,
-                                 uint64_t num_objs,
-                                  const DoutPrefixProvider *dpp)
-{
-  if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
-      return 0;
-  }
-
-  bool need_resharding = false;
-  uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
-  const uint32_t max_dynamic_shards =
-    uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
-
-  if (num_source_shards >= max_dynamic_shards) {
-    return 0;
-  }
-
-  uint32_t suggested_num_shards = 0;
-  const uint64_t max_objs_per_shard =
-    cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
-
-  // TODO: consider per-bucket sync policy here?
-  const bool is_multisite = svc.zone->get_zone().log_data;
-
-  quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards,
-                                    num_objs, is_multisite, need_resharding,
-                                    &suggested_num_shards);
-  if (! need_resharding) {
-    return 0;
-  }
-
-  const uint32_t final_num_shards =
-    RGWBucketReshard::get_preferred_shards(suggested_num_shards,
-                                          max_dynamic_shards);
-  // final verification, so we don't reduce number of shards
-  if (final_num_shards <= num_source_shards) {
-    return 0;
-  }
-
-  ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
-    " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards <<
-    "; new num shards " << final_num_shards << " (suggested " <<
-    suggested_num_shards << ")" << dendl;
-
-  return add_bucket_to_reshard(dpp, bucket_info, final_num_shards);
-}
-
-int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
-{
-  RGWReshard reshard(this->driver, dpp);
-
-  uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
-
-  new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
-  if (new_num_shards <= num_source_shards) {
-    ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
-    return 0;
-  }
-
-  cls_rgw_reshard_entry entry;
-  entry.time = real_clock::now();
-  entry.tenant = bucket_info.owner.tenant;
-  entry.bucket_name = bucket_info.bucket.name;
-  entry.bucket_id = bucket_info.bucket.bucket_id;
-  entry.old_num_shards = num_source_shards;
-  entry.new_num_shards = new_num_shards;
-
-  return reshard.add(dpp, entry);
-}
-
-int RGWRados::check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
-                          RGWQuota& quota,
-                         uint64_t obj_size, optional_yield y,
-                         bool check_size_only)
-{
-  // if we only check size, then num_objs will set to 0
-  if(check_size_only)
-    return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 0, obj_size, y);
-
-  return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 1, obj_size, y);
-}
-
-int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key,
-                                  int *shard_id)
-{
-  int r = 0;
-  switch (layout.hash_type) {
-    case rgw::BucketHashType::Mod:
-      if (!layout.num_shards) {
-        if (shard_id) {
-          *shard_id = -1;
-        }
-      } else {
-        uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards);
-        if (shard_id) {
-          *shard_id = (int)sid;
-        }
-      }
-      break;
-    default:
-      r = -ENOTSUP;
-  }
-  return r;
-}
-
-uint64_t RGWRados::instance_id()
-{
-  return get_rados_handle()->get_instance_id();
-}
-
-uint64_t RGWRados::next_bucket_id()
-{
-  std::lock_guard l{bucket_id_lock};
-  return ++max_bucket_id;
-}
-
-librados::Rados* RGWRados::get_rados_handle()
-{
-  return &rados;
-}
-
-int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
-{
-  rgw_rados_ref ref;
-  int ret = get_raw_obj_ref(dpp, obj, &ref);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
-    return ret;
-  }
-
-  ObjectWriteOperation op;
-  list<string> prefixes;
-  cls_rgw_remove_obj(op, prefixes);
-
-  AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
-  ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
-    c->release();
-    return ret;
-  }
-
-  handles.push_back(c);
-
-  return 0;
-}
-
-int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj,
-                             RGWBucketInfo& bucket_info, RGWObjState *astate,
-                             list<librados::AioCompletion *>& handles, bool keep_index_consistent,
-                             optional_yield y)
-{
-  rgw_rados_ref ref;
-  int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
-    return ret;
-  }
-
-  if (keep_index_consistent) {
-    RGWRados::Bucket bop(this, bucket_info);
-    RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
-
-    ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y);
-    if (ret < 0) {
-      ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
-      return ret;
-    }
-  }
-
-  ObjectWriteOperation op;
-  list<string> prefixes;
-  cls_rgw_remove_obj(op, prefixes);
-
-  AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
-  ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
-    c->release();
-    return ret;
-  }
-
-  handles.push_back(c);
-
-  if (keep_index_consistent) {
-    ret = delete_obj_index(obj, astate->mtime, dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
-      return ret;
-    }
-  }
-  return ret;
-}
-
-void objexp_hint_entry::generate_test_instances(list<objexp_hint_entry*>& o)
-{
-  auto it = new objexp_hint_entry;
-  it->tenant = "tenant1";
-  it->bucket_name = "bucket1";
-  it->bucket_id = "1234";
-  it->obj_key = rgw_obj_key("obj");
-  o.push_back(it);
-  o.push_back(new objexp_hint_entry);
-}
-
-void objexp_hint_entry::dump(Formatter *f) const
-{
-  f->open_object_section("objexp_hint_entry");
-  encode_json("tenant", tenant, f);
-  encode_json("bucket_name", bucket_name, f);
-  encode_json("bucket_id", bucket_id, f);
-  encode_json("rgw_obj_key", obj_key, f);
-  utime_t ut(exp_time);
-  encode_json("exp_time", ut, f);
-  f->close_section();
-}
-
-void RGWOLHInfo::generate_test_instances(list<RGWOLHInfo*> &o)
-{
-  RGWOLHInfo *olh = new RGWOLHInfo;
-  olh->removed = false;
-  o.push_back(olh);
-  o.push_back(new RGWOLHInfo);
-}
-
-void RGWOLHInfo::dump(Formatter *f) const
-{
-  encode_json("target", target, f);
-}
-
-void RGWOLHPendingInfo::dump(Formatter *f) const
-{
-  utime_t ut(time);
-  encode_json("time", ut, f);
-}
-
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
deleted file mode 100644 (file)
index 1b52951..0000000
+++ /dev/null
@@ -1,1636 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGWRADOS_H
-#define CEPH_RGWRADOS_H
-
-#include <iostream>
-#include <functional>
-#include <boost/container/flat_map.hpp>
-#include <boost/container/flat_set.hpp>
-
-#include "include/rados/librados.hpp"
-#include "include/Context.h"
-#include "include/random.h"
-#include "common/RefCountedObj.h"
-#include "common/ceph_time.h"
-#include "common/Timer.h"
-#include "rgw_common.h"
-#include "cls/rgw/cls_rgw_types.h"
-#include "cls/version/cls_version_types.h"
-#include "cls/log/cls_log_types.h"
-#include "cls/timeindex/cls_timeindex_types.h"
-#include "cls/otp/cls_otp_types.h"
-#include "rgw_quota.h"
-#include "rgw_log.h"
-#include "rgw_metadata.h"
-#include "rgw_meta_sync_status.h"
-#include "rgw_period_puller.h"
-#include "rgw_obj_manifest.h"
-#include "rgw_sync_module.h"
-#include "rgw_trim_bilog.h"
-#include "rgw_service.h"
-#include "rgw_sal.h"
-#include "rgw_aio.h"
-#include "rgw_d3n_cacherequest.h"
-
-#include "services/svc_rados.h"
-#include "services/svc_bi_rados.h"
-#include "common/Throttle.h"
-#include "common/ceph_mutex.h"
-#include "rgw_cache.h"
-#include "rgw_sal_fwd.h"
-
-struct D3nDataCache;
-
-class RGWWatcher;
-class ACLOwner;
-class RGWGC;
-class RGWMetaNotifier;
-class RGWDataNotifier;
-class RGWLC;
-class RGWObjectExpirer;
-class RGWMetaSyncProcessorThread;
-class RGWDataSyncProcessorThread;
-class RGWSyncLogTrimThread;
-class RGWSyncTraceManager;
-struct RGWZoneGroup;
-struct RGWZoneParams;
-class RGWReshard;
-class RGWReshardWait;
-
-struct get_obj_data;
-
-/* flags for put_obj_meta() */
-#define PUT_OBJ_CREATE      0x01
-#define PUT_OBJ_EXCL        0x02
-#define PUT_OBJ_CREATE_EXCL (PUT_OBJ_CREATE | PUT_OBJ_EXCL)
-
-static inline void prepend_bucket_marker(const rgw_bucket& bucket, const std::string& orig_oid, std::string& oid)
-{
-  if (bucket.marker.empty() || orig_oid.empty()) {
-    oid = orig_oid;
-  } else {
-    oid = bucket.marker;
-    oid.append("_");
-    oid.append(orig_oid);
-  }
-}
-
-static inline void get_obj_bucket_and_oid_loc(const rgw_obj& obj, std::string& oid, std::string& locator)
-{
-  const rgw_bucket& bucket = obj.bucket;
-  prepend_bucket_marker(bucket, obj.get_oid(), oid);
-  const std::string& loc = obj.key.get_loc();
-  if (!loc.empty()) {
-    prepend_bucket_marker(bucket, loc, locator);
-  } else {
-    locator.clear();
-  }
-}
-
-struct RGWOLHInfo {
-  rgw_obj target;
-  bool removed;
-
-  RGWOLHInfo() : removed(false) {}
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(target, bl);
-    encode(removed, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-     DECODE_START(1, bl);
-     decode(target, bl);
-     decode(removed, bl);
-     DECODE_FINISH(bl);
-  }
-  static void generate_test_instances(std::list<RGWOLHInfo*>& o);
-  void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(RGWOLHInfo)
-
-struct RGWOLHPendingInfo {
-  ceph::real_time time;
-
-  RGWOLHPendingInfo() {}
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(time, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-     DECODE_START(1, bl);
-     decode(time, bl);
-     DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(RGWOLHPendingInfo)
-
-struct RGWUsageBatch {
-  std::map<ceph::real_time, rgw_usage_log_entry> m;
-
-  void insert(ceph::real_time& t, rgw_usage_log_entry& entry, bool *account) {
-    bool exists = m.find(t) != m.end();
-    *account = !exists;
-    m[t].aggregate(entry);
-  }
-};
-
-struct RGWCloneRangeInfo {
-  rgw_obj src;
-  off_t src_ofs;
-  off_t dst_ofs;
-  uint64_t len;
-};
-
-class RGWFetchObjFilter {
-public:
-  virtual ~RGWFetchObjFilter() {}
-
-  virtual int filter(CephContext *cct,
-                     const rgw_obj_key& source_key,
-                     const RGWBucketInfo& dest_bucket_info,
-                     std::optional<rgw_placement_rule> dest_placement_rule,
-                     const std::map<std::string, bufferlist>& obj_attrs,
-                     std::optional<rgw_user> *poverride_owner,
-                     const rgw_placement_rule **prule) = 0;
-};
-
-class RGWFetchObjFilter_Default : public RGWFetchObjFilter {
-protected:
-  rgw_placement_rule dest_rule;
-public:
-  RGWFetchObjFilter_Default() {}
-
-  int filter(CephContext *cct,
-             const rgw_obj_key& source_key,
-             const RGWBucketInfo& dest_bucket_info,
-             std::optional<rgw_placement_rule> dest_placement_rule,
-             const std::map<std::string, bufferlist>& obj_attrs,
-             std::optional<rgw_user> *poverride_owner,
-             const rgw_placement_rule **prule) override;
-};
-
-struct RGWObjStateManifest {
-  RGWObjState state;
-  std::optional<RGWObjManifest> manifest;
-};
-
-class RGWObjectCtx {
-  rgw::sal::Driver* driver;
-  ceph::shared_mutex lock = ceph::make_shared_mutex("RGWObjectCtx");
-
-  std::map<rgw_obj, RGWObjStateManifest> objs_state;
-public:
-  explicit RGWObjectCtx(rgw::sal::Driver* _driver) : driver(_driver) {}
-  RGWObjectCtx(RGWObjectCtx& _o) {
-    std::unique_lock wl{lock};
-    this->driver = _o.driver;
-    this->objs_state = _o.objs_state;
-  }
-
-  rgw::sal::Driver* get_driver() {
-    return driver;
-  }
-
-  RGWObjStateManifest *get_state(const rgw_obj& obj);
-
-  void set_compressed(const rgw_obj& obj);
-  void set_atomic(rgw_obj& obj);
-  void set_prefetch_data(const rgw_obj& obj);
-  void invalidate(const rgw_obj& obj);
-};
-
-
-struct RGWRawObjState {
-  rgw_raw_obj obj;
-  bool has_attrs{false};
-  bool exists{false};
-  uint64_t size{0};
-  ceph::real_time mtime;
-  uint64_t epoch{0};
-  bufferlist obj_tag;
-  bool has_data{false};
-  bufferlist data;
-  bool prefetch_data{false};
-  uint64_t pg_ver{0};
-
-  /* important! don't forget to update copy constructor */
-
-  RGWObjVersionTracker objv_tracker;
-
-  std::map<std::string, bufferlist> attrset;
-  RGWRawObjState() {}
-  RGWRawObjState(const RGWRawObjState& rhs) : obj (rhs.obj) {
-    has_attrs = rhs.has_attrs;
-    exists = rhs.exists;
-    size = rhs.size;
-    mtime = rhs.mtime;
-    epoch = rhs.epoch;
-    if (rhs.obj_tag.length()) {
-      obj_tag = rhs.obj_tag;
-    }
-    has_data = rhs.has_data;
-    if (rhs.data.length()) {
-      data = rhs.data;
-    }
-    prefetch_data = rhs.prefetch_data;
-    pg_ver = rhs.pg_ver;
-    objv_tracker = rhs.objv_tracker;
-  }
-};
-
-struct RGWPoolIterCtx {
-  librados::IoCtx io_ctx;
-  librados::NObjectIterator iter;
-};
-
-struct RGWListRawObjsCtx {
-  bool initialized;
-  RGWPoolIterCtx iter_ctx;
-
-  RGWListRawObjsCtx() : initialized(false) {}
-};
-
-struct objexp_hint_entry {
-  std::string tenant;
-  std::string bucket_name;
-  std::string bucket_id;
-  rgw_obj_key obj_key;
-  ceph::real_time exp_time;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(2, 1, bl);
-    encode(bucket_name, bl);
-    encode(bucket_id, bl);
-    encode(obj_key, bl);
-    encode(exp_time, bl);
-    encode(tenant, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    // XXX Do we want DECODE_START_LEGACY_COMPAT_LEN(2, 1, 1, bl); ?
-    DECODE_START(2, bl);
-    decode(bucket_name, bl);
-    decode(bucket_id, bl);
-    decode(obj_key, bl);
-    decode(exp_time, bl);
-    if (struct_v >= 2) {
-      decode(tenant, bl);
-    } else {
-      tenant.clear();
-    }
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  static void generate_test_instances(std::list<objexp_hint_entry*>& o);
-};
-WRITE_CLASS_ENCODER(objexp_hint_entry)
-
-class RGWMetaSyncStatusManager;
-class RGWDataSyncStatusManager;
-class RGWCoroutinesManagerRegistry;
-
-class RGWGetDirHeader_CB;
-class RGWGetUserHeader_CB;
-namespace rgw { namespace sal {
-  class RadosStore;
-  class MPRadosSerializer;
-  class LCRadosSerializer;
-} }
-
-class RGWAsyncRadosProcessor;
-
-template <class T>
-class RGWChainedCacheImpl;
-
-struct bucket_info_entry {
-  RGWBucketInfo info;
-  real_time mtime;
-  std::map<std::string, bufferlist> attrs;
-};
-
-struct tombstone_entry;
-
-template <class K, class V>
-class lru_map;
-using tombstone_cache_t = lru_map<rgw_obj, tombstone_entry>;
-
-class RGWIndexCompletionManager;
-
-class RGWRados
-{
-  friend class RGWGC;
-  friend class RGWMetaNotifier;
-  friend class RGWDataNotifier;
-  friend class RGWObjectExpirer;
-  friend class RGWMetaSyncProcessorThread;
-  friend class RGWDataSyncProcessorThread;
-  friend class RGWReshard;
-  friend class RGWBucketReshard;
-  friend class RGWBucketReshardLock;
-  friend class BucketIndexLockGuard;
-  friend class rgw::sal::MPRadosSerializer;
-  friend class rgw::sal::LCRadosSerializer;
-  friend class rgw::sal::RadosStore;
-
-  /** Open the pool used as root for this gateway */
-  int open_root_pool_ctx(const DoutPrefixProvider *dpp);
-  int open_gc_pool_ctx(const DoutPrefixProvider *dpp);
-  int open_lc_pool_ctx(const DoutPrefixProvider *dpp);
-  int open_objexp_pool_ctx(const DoutPrefixProvider *dpp);
-  int open_reshard_pool_ctx(const DoutPrefixProvider *dpp);
-  int open_notif_pool_ctx(const DoutPrefixProvider *dpp);
-
-  int open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx&  io_ctx,
-                   bool mostly_omap);
-
-
-  ceph::mutex lock = ceph::make_mutex("rados_timer_lock");
-  SafeTimer *timer;
-
-  rgw::sal::RadosStore* driver = nullptr;
-  RGWGC *gc = nullptr;
-  RGWLC *lc;
-  RGWObjectExpirer *obj_expirer;
-  bool use_gc_thread;
-  bool use_lc_thread;
-  bool quota_threads;
-  bool run_sync_thread;
-  bool run_reshard_thread;
-
-  RGWMetaNotifier *meta_notifier;
-  RGWDataNotifier *data_notifier;
-  RGWMetaSyncProcessorThread *meta_sync_processor_thread;
-  RGWSyncTraceManager *sync_tracer = nullptr;
-  std::map<rgw_zone_id, RGWDataSyncProcessorThread *> data_sync_processor_threads;
-
-  boost::optional<rgw::BucketTrimManager> bucket_trim;
-  RGWSyncLogTrimThread *sync_log_trimmer{nullptr};
-
-  ceph::mutex meta_sync_thread_lock = ceph::make_mutex("meta_sync_thread_lock");
-  ceph::mutex data_sync_thread_lock = ceph::make_mutex("data_sync_thread_lock");
-
-  librados::IoCtx root_pool_ctx;      // .rgw
-
-  double inject_notify_timeout_probability = 0;
-  unsigned max_notify_retries = 0;
-
-  friend class RGWWatcher;
-
-  ceph::mutex bucket_id_lock = ceph::make_mutex("rados_bucket_id");
-
-  // This field represents the number of bucket index object shards
-  uint32_t bucket_index_max_shards;
-
-  std::string get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y);
-
-  int get_obj_head_ref(const DoutPrefixProvider *dpp, const rgw_placement_rule& target_placement_rule, const rgw_obj& obj, rgw_rados_ref *ref);
-  int get_obj_head_ref(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref);
-  int get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
-  uint64_t max_bucket_id;
-
-  int get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& rctx,
-                          RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
-                          RGWObjState *olh_state, RGWObjState **target_state,
-                          RGWObjManifest **target_manifest, optional_yield y);
-  int get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
-                         bool follow_olh, optional_yield y, bool assume_noent = false);
-  int append_atomic_test(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
-                         librados::ObjectOperation& op, RGWObjState **state,
-                        RGWObjManifest** pmanifest, optional_yield y);
-  
-  int update_placement_map();
-  int store_bucket_info(RGWBucketInfo& info, std::map<std::string, bufferlist> *pattrs, RGWObjVersionTracker *objv_tracker, bool exclusive);
-
-  void remove_rgw_head_obj(librados::ObjectWriteOperation& op);
-  void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const std::string& prefix, bool fail_if_exist);
-  void cls_obj_check_mtime(librados::ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type);
-protected:
-  CephContext *cct;
-
-  librados::Rados rados;
-
-  using RGWChainedCacheImpl_bucket_info_entry = RGWChainedCacheImpl<bucket_info_entry>;
-  RGWChainedCacheImpl_bucket_info_entry *binfo_cache;
-
-  tombstone_cache_t *obj_tombstone_cache;
-
-  librados::IoCtx gc_pool_ctx;        // .rgw.gc
-  librados::IoCtx lc_pool_ctx;        // .rgw.lc
-  librados::IoCtx objexp_pool_ctx;
-  librados::IoCtx reshard_pool_ctx;
-  librados::IoCtx notif_pool_ctx;     // .rgw.notif
-
-  bool pools_initialized;
-
-  RGWQuotaHandler *quota_handler;
-
-  RGWCoroutinesManagerRegistry *cr_registry;
-
-  RGWSyncModuleInstanceRef sync_module;
-  bool writeable_zone{false};
-
-  RGWIndexCompletionManager *index_completion_manager{nullptr};
-
-  bool use_cache{false};
-  bool use_gc{true};
-  bool use_datacache{false};
-
-  int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx);
-public:
-  RGWRados(): timer(NULL),
-               gc(NULL), lc(NULL), obj_expirer(NULL), use_gc_thread(false), use_lc_thread(false), quota_threads(false),
-               run_sync_thread(false), run_reshard_thread(false), meta_notifier(NULL),
-               data_notifier(NULL), meta_sync_processor_thread(NULL),
-               bucket_index_max_shards(0),
-               max_bucket_id(0), cct(NULL),
-               binfo_cache(NULL), obj_tombstone_cache(nullptr),
-               pools_initialized(false),
-               quota_handler(NULL),
-               cr_registry(NULL),
-               pctl(&ctl),
-               reshard(NULL) {}
-
-  RGWRados& set_use_cache(bool status) {
-    use_cache = status;
-    return *this;
-  }
-
-  RGWRados& set_use_gc(bool status) {
-    use_gc = status;
-    return *this;
-  }
-
-  RGWRados& set_use_datacache(bool status) {
-    use_datacache = status;
-    return *this;
-  }
-
-  bool get_use_datacache() {
-    return use_datacache;
-  }
-
-  RGWLC *get_lc() {
-    return lc;
-  }
-
-  RGWGC *get_gc() {
-    return gc;
-  }
-
-  RGWRados& set_run_gc_thread(bool _use_gc_thread) {
-    use_gc_thread = _use_gc_thread;
-    return *this;
-  }
-
-  RGWRados& set_run_lc_thread(bool _use_lc_thread) {
-    use_lc_thread = _use_lc_thread;
-    return *this;
-  }
-
-  RGWRados& set_run_quota_threads(bool _run_quota_threads) {
-    quota_threads = _run_quota_threads;
-    return *this;
-  }
-
-  RGWRados& set_run_sync_thread(bool _run_sync_thread) {
-    run_sync_thread = _run_sync_thread;
-    return *this;
-  }
-
-  RGWRados& set_run_reshard_thread(bool _run_reshard_thread) {
-    run_reshard_thread = _run_reshard_thread;
-    return *this;
-  }
-
-  librados::IoCtx* get_lc_pool_ctx() {
-    return &lc_pool_ctx;
-  }
-
-  librados::IoCtx& get_notif_pool_ctx() {
-    return notif_pool_ctx;
-  }
-
-  void set_context(CephContext *_cct) {
-    cct = _cct;
-  }
-  void set_store(rgw::sal::RadosStore* _driver) {
-    driver = _driver;
-  }
-
-  RGWServices svc;
-  RGWCtl ctl;
-
-  RGWCtl *pctl{nullptr};
-
-  /**
-   * AmazonS3 errors contain a HostId string, but is an opaque base64 blob; we
-   * try to be more transparent. This has a wrapper so we can update it when zonegroup/zone are changed.
-   */
-  std::string host_id;
-
-  RGWReshard *reshard;
-  std::shared_ptr<RGWReshardWait> reshard_wait;
-
-  virtual ~RGWRados() = default;
-
-  tombstone_cache_t *get_tombstone_cache() {
-    return obj_tombstone_cache;
-  }
-  const RGWSyncModuleInstanceRef& get_sync_module() {
-    return sync_module;
-  }
-  RGWSyncTraceManager *get_sync_tracer() {
-    return sync_tracer;
-  }
-
-  int get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment);
-  void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size);
-  int get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
-  int get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
-
-  uint32_t get_max_bucket_shards() {
-    return RGWSI_BucketIndex_RADOS::shards_max();
-  }
-
-
-  int get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
-
-  int list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& marker, RGWListRawObjsCtx *ctx);
-  int list_raw_objects_next(const DoutPrefixProvider *dpp, const std::string& prefix_filter, int max,
-                            RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
-                            bool *is_truncated);
-  int list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& prefix_filter, int max,
-                       RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
-                       bool *is_truncated);
-  std::string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx);
-
-  CephContext *ctx() { return cct; }
-  /** do all necessary setup of the storage device */
-  int init_begin(CephContext *_cct, const DoutPrefixProvider *dpp) {
-    set_context(_cct);
-    return init_begin(dpp);
-  }
-  /** Initialize the RADOS instance and prepare to do other ops */
-  int init_svc(bool raw, const DoutPrefixProvider *dpp);
-  int init_ctl(const DoutPrefixProvider *dpp);
-  virtual int init_rados();
-  int init_begin(const DoutPrefixProvider *dpp);
-  int init_complete(const DoutPrefixProvider *dpp);
-  void finalize();
-
-  int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, const std::map<std::string, std::string>& meta);
-  int update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status);
-
-  /// list logs
-  int log_list_init(const DoutPrefixProvider *dpp, const std::string& prefix, RGWAccessHandle *handle);
-  int log_list_next(RGWAccessHandle handle, std::string *name);
-
-  /// remove log
  /// Remove the named ops-log object. Returns 0 on success, -ERR# otherwise.
  int log_remove(const DoutPrefixProvider *dpp, const std::string& name);

  /// show log
  int log_show_init(const DoutPrefixProvider *dpp, const std::string& name, RGWAccessHandle *handle);
  int log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry);

  // log bandwidth info
  /// Persist a batch of per-user/bucket usage records.
  int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info);
  /// Read usage records for 'user'/'bucket_name' within [start_epoch, end_epoch);
  /// paginated via 'read_iter', at most 'max_entries' per call.
  int read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
                 uint32_t max_entries, bool *is_truncated, RGWUsageIter& read_iter, std::map<rgw_user_bucket,
                rgw_usage_log_entry>& usage);
  /// Trim usage records in the given epoch range; clear_usage() removes all.
  int trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch);
  int clear_usage(const DoutPrefixProvider *dpp);

  /// Create a RADOS pool. NOTE(review): presumably tolerant of the pool
  /// already existing -- confirm against the implementation.
  int create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool);

  /// Generate a new unique bucket id into *bucket_id.
  void create_bucket_id(std::string *bucket_id);

  /// Map an object to the data pool selected by its placement rule.
  bool get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool);
  /// Translate a logical rgw_obj into the raw RADOS object that holds it.
  bool obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj);

  /// Create a new bucket; fills 'bucket_info' and the version trackers.
  /// With 'exclusive' (default) creation fails if the bucket already exists.
  int create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
                   const std::string& zonegroup_id,
                   const rgw_placement_rule& placement_rule,
                   const std::string& swift_ver_location,
                   const RGWQuotaInfo * pquota_info,
                   std::map<std::string,bufferlist>& attrs,
                   RGWBucketInfo& bucket_info,
                   obj_version *pobjv,
                   obj_version *pep_objv,
                   ceph::real_time creation_time,
                   rgw_bucket *master_bucket,
                   uint32_t *master_num_shards,
                   optional_yield y,
                    const DoutPrefixProvider *dpp,
                   bool exclusive = true);

  // Accessor for the store's coroutine-manager registry (non-owning).
  RGWCoroutinesManagerRegistry *get_cr_registry() { return cr_registry; }
-
-  struct BucketShard {
-    RGWRados *store;
-    rgw_bucket bucket;
-    int shard_id;
-    RGWSI_RADOS::Obj bucket_obj;
-
-    explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {}
-    int init(const rgw_bucket& _bucket, const rgw_obj& obj,
-             RGWBucketInfo* out, const DoutPrefixProvider *dpp);
-    int init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj);
-    int init(const DoutPrefixProvider *dpp,
-            const RGWBucketInfo& bucket_info,
-            const rgw::bucket_index_layout_generation& index, int sid);
-
-    friend std::ostream& operator<<(std::ostream& out, const BucketShard& bs) {
-      out << "BucketShard:{ bucket=" << bs.bucket <<
-       ", shard_id=" << bs.shard_id <<
-       ", bucket_ojb=" << bs.bucket_obj << "}";
-      return out;
-    }
-  };
-
  // Per-object operation context: wraps one rgw::sal::Object and exposes the
  // nested Read/Write/Delete/Stat operation classes against it.
  class Object {
    RGWRados *store;           // non-owning backpointer to the parent store
    rgw::sal::Bucket* bucket;  // bucket containing the object (non-owning)
    RGWObjectCtx& ctx;         // caller's object context
    rgw::sal::Object* obj;     // target object (non-owning)

    BucketShard bs;            // lazily initialized via get_bucket_shard()

    RGWObjState *state;
    RGWObjManifest *manifest;

    bool versioning_disabled;  // when true, overrides the bucket's versioning flag

    bool bs_initialized;       // whether 'bs' has been init()'d

    const rgw_placement_rule *pmeta_placement_rule;  // optional override; see get_meta_placement_rule()

  protected:
    // Load (or reuse) object state and manifest. 'follow_olh' resolves the
    // OLH indirection for versioned objects; 'assume_noent' is used when the
    // caller expects the object not to exist.
    int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent = false);
    void invalidate_state();

    // Set up / finish the guards for an atomic RADOS write, honoring the
    // optional ifmatch/ifnomatch preconditions.
    int prepare_atomic_modification(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation& op, bool reset_obj, const std::string *ptag,
                                    const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail, optional_yield y);
    int complete_atomic_modification(const DoutPrefixProvider *dpp);

  public:
    Object(RGWRados *_store, rgw::sal::Bucket* _bucket, RGWObjectCtx& _ctx, rgw::sal::Object* _obj) : store(_store), bucket(_bucket),
                                                                                               ctx(_ctx), obj(_obj), bs(store),
                                                                                               state(NULL), manifest(nullptr), versioning_disabled(false),
                                                                                               bs_initialized(false),
                                                                                               pmeta_placement_rule(nullptr) {}

    RGWRados *get_store() { return store; }
    rgw_obj get_obj() { return obj->get_obj(); }
    RGWObjectCtx& get_ctx() { return ctx; }
    RGWBucketInfo& get_bucket_info() { return bucket->get_info(); }
    const std::string& get_instance() { return obj->get_instance(); }
    rgw::sal::Object* get_target() { return obj; }
    int get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y);

    // Resolve (once) and return the bucket index shard for this object.
    int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
      if (!bs_initialized) {
        int r =
         bs.init(bucket->get_key(), obj->get_obj(), nullptr /* no RGWBucketInfo */, dpp);
        if (r < 0) {
          return r;
        }
        bs_initialized = true;
      }
      *pbs = &bs;
      return 0;
    }

    void set_versioning_disabled(bool status) {
      versioning_disabled = status;
    }

    // Effective versioning: the bucket's setting unless explicitly disabled here.
    bool versioning_enabled() {
      return (!versioning_disabled && bucket->versioning_enabled());
    }

    void set_meta_placement_rule(const rgw_placement_rule *p) {
        pmeta_placement_rule = p;
    }

    // Placement-rule override if one was set, otherwise the bucket's rule.
    const rgw_placement_rule& get_meta_placement_rule() {
        return pmeta_placement_rule ? *pmeta_placement_rule : bucket->get_placement_rule();
    }

    // Read operation: call prepare(), then read()/iterate()/get_attr().
    struct Read {
      RGWRados::Object *source;

      struct GetObjState {
        std::map<rgw_pool, librados::IoCtx> io_ctxs;
        rgw_pool cur_pool;
        librados::IoCtx *cur_ioctx{nullptr};
        rgw_obj obj;
        rgw_raw_obj head_obj;
      } state;
      
      // Conditional-read parameters (HTTP-style preconditions).
      struct ConditionParams {
        const ceph::real_time *mod_ptr;
        const ceph::real_time *unmod_ptr;
        bool high_precision_time;
        uint32_t mod_zone_id;
        uint64_t mod_pg_ver;
        const char *if_match;
        const char *if_nomatch;
        
        ConditionParams() : 
                 mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0),
                 if_match(NULL), if_nomatch(NULL) {}
      } conds;

      // Optional outputs filled in during prepare().
      struct Params {
        ceph::real_time *lastmod;
        uint64_t *obj_size;
        std::map<std::string, bufferlist> *attrs;
        rgw_obj *target_obj;

        Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr),
                target_obj(nullptr) {}
      } params;

      explicit Read(RGWRados::Object *_source) : source(_source) {}

      int prepare(optional_yield y, const DoutPrefixProvider *dpp);
      // Normalize a byte range against obj_size (in/out 'ofs'/'end').
      static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end);
      int read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp);
      int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, optional_yield y);
      int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y);
    };

    // Write operation: fill 'meta', then call write_meta().
    struct Write {
      RGWRados::Object *target;
      
      struct MetaParams {
        ceph::real_time *mtime;   // out: resulting mtime, when non-null
        std::map<std::string, bufferlist>* rmattrs;  // attrs to remove
        const bufferlist *data;
        RGWObjManifest *manifest;
        const std::string *ptag;
        std::list<rgw_obj_index_key> *remove_objs;
        ceph::real_time set_mtime;
        rgw_user owner;
        RGWObjCategory category;
        int flags;
        const char *if_match;
        const char *if_nomatch;
        std::optional<uint64_t> olh_epoch;
        ceph::real_time delete_at;
        bool canceled;            // out: set when the write was canceled/raced
        const std::string *user_data;
        rgw_zone_set *zones_trace;
        bool modify_tail;
        bool completeMultipart;
        bool appendable;

        MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
                 remove_objs(NULL), category(RGWObjCategory::Main), flags(0),
                 if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr),
                 modify_tail(false),  completeMultipart(false), appendable(false) {}
      } meta;

      explicit Write(RGWRados::Object *_target) : target(_target) {}

      int _do_write_meta(const DoutPrefixProvider *dpp,
                     uint64_t size, uint64_t accounted_size,
                     std::map<std::string, bufferlist>& attrs,
                     bool modify_tail, bool assume_noent,
                     void *index_op, optional_yield y);
      int write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
                     std::map<std::string, bufferlist>& attrs, optional_yield y);
      int write_data(const char *data, uint64_t ofs, uint64_t len, bool exclusive);
      const req_state* get_req_state() {
        return nullptr;  /* XXX dang Only used by LTTng, and it handles null anyway */
      }
    };

    // Delete operation: fill 'params', call delete_obj(); outputs in 'result'.
    struct Delete {
      RGWRados::Object *target;

      struct DeleteParams {
        rgw_user bucket_owner;
        int versioning_status; // versioning flags defined in enum RGWBucketFlags
        ACLOwner obj_owner;    // needed for creation of deletion marker
        uint64_t olh_epoch;
        std::string marker_version_id;
        uint32_t bilog_flags;
        std::list<rgw_obj_index_key> *remove_objs;
        ceph::real_time expiration_time;
        ceph::real_time unmod_since;
        ceph::real_time mtime; /* for setting delete marker mtime */
        bool high_precision_time;
        rgw_zone_set *zones_trace;
       bool abortmp;
       uint64_t parts_accounted_size;

        DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {}
      } params;

      struct DeleteResult {
        bool delete_marker;    // out: true if a delete marker was created
        std::string version_id;

        DeleteResult() : delete_marker(false) {}
      } result;
      
      explicit Delete(RGWRados::Object *_target) : target(_target) {}

      int delete_obj(optional_yield y, const DoutPrefixProvider *dpp);
    };

    // Asynchronous stat: stat_async() then wait(); outputs land in 'result'.
    struct Stat {
      RGWRados::Object *source;

      struct Result {
        rgw_obj obj;
       std::optional<RGWObjManifest> manifest;
        uint64_t size{0};
       struct timespec mtime {};
        std::map<std::string, bufferlist> attrs;
      } result;

      struct State {
        librados::IoCtx io_ctx;
        librados::AioCompletion *completion;  // non-owning? NOTE(review): confirm release in finish()
        int ret;

        State() : completion(NULL), ret(0) {}
      } state;


      explicit Stat(RGWRados::Object *_source) : source(_source) {}

      int stat_async(const DoutPrefixProvider *dpp);
      int wait(const DoutPrefixProvider *dpp);
      int stat();
    private:
      int finish(const DoutPrefixProvider *dpp);
    };
  };
-
  // Per-bucket operation context; exposes the UpdateIndex and List helpers.
  class Bucket {
    RGWRados *store;            // non-owning backpointer to the parent store
    RGWBucketInfo bucket_info;  // copied at construction
    rgw_bucket& bucket;         // alias of bucket_info.bucket
    int shard_id;               // RGW_NO_SHARD until set_shard_id()

  public:
    Bucket(RGWRados *_store, const RGWBucketInfo& _bucket_info) : store(_store), bucket_info(_bucket_info), bucket(bucket_info.bucket),
                                                            shard_id(RGW_NO_SHARD) {}
    RGWRados *get_store() { return store; }
    rgw_bucket& get_bucket() { return bucket; }
    RGWBucketInfo& get_bucket_info() { return bucket_info; }

    int update_bucket_id(const std::string& new_bucket_id, const DoutPrefixProvider *dpp);

    int get_shard_id() { return shard_id; }
    void set_shard_id(int id) {
      shard_id = id;
    }

    // Bucket-index updater for a single object operation: prepare() before
    // the RADOS write, then complete()/complete_del()/cancel() afterwards.
    class UpdateIndex {
      RGWRados::Bucket *target;
      std::string optag;
      rgw_obj obj;
      uint16_t bilog_flags{0};
      BucketShard bs;
      bool bs_initialized{false};
      bool blind;   // true for indexless buckets (NOTE(review): presumably makes index updates no-ops -- confirm)
      bool prepared{false};
      rgw_zone_set *zones_trace{nullptr};

      int init_bs(const DoutPrefixProvider *dpp) {
        int r =
         bs.init(target->get_bucket(), obj, &target->bucket_info, dpp);
        if (r < 0) {
          return r;
        }
        bs_initialized = true;
        return 0;
      }

      void invalidate_bs() {
        bs_initialized = false;
      }

      // Re-resolve the shard and retry 'call' when the bucket is resharding.
      int guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call);
    public:

      UpdateIndex(RGWRados::Bucket *_target, const rgw_obj& _obj) : target(_target), obj(_obj),
                                                              bs(target->get_store()) {
                                                                blind = (target->get_bucket_info().layout.current_index.layout.type == rgw::BucketIndexType::Indexless);
                                                              }

      // Resolve (once) and return the index shard for 'obj'.
      int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
        if (!bs_initialized) {
          int r = init_bs(dpp);
          if (r < 0) {
            return r;
          }
        }
        *pbs = &bs;
        return 0;
      }

      void set_bilog_flags(uint16_t flags) {
        bilog_flags = flags;
      }
      
      void set_zones_trace(rgw_zone_set *_zones_trace) {
        zones_trace = _zones_trace;
      }

      int prepare(const DoutPrefixProvider *dpp, RGWModifyOp, const std::string *write_tag, optional_yield y);
      int complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, uint64_t size,
                   uint64_t accounted_size, ceph::real_time& ut,
                   const std::string& etag, const std::string& content_type,
                   const std::string& storage_class,
                   bufferlist *acl_bl, RGWObjCategory category,
                  std::list<rgw_obj_index_key> *remove_objs, const std::string *user_data = nullptr, bool appendable = false);
      int complete_del(const DoutPrefixProvider *dpp,
                       int64_t poolid, uint64_t epoch,
                       ceph::real_time& removed_mtime, /* mtime of removed object */
                       std::list<rgw_obj_index_key> *remove_objs);
      int cancel(const DoutPrefixProvider *dpp,
                 std::list<rgw_obj_index_key> *remove_objs);

      const std::string *get_optag() { return &optag; }

      bool is_prepared() { return prepared; }
    }; // class UpdateIndex

    // Object listing over this bucket, ordered or unordered.
    class List {
    protected:
      // absolute maximum number of objects that
      // list_objects_(un)ordered can return
      static constexpr int64_t bucket_list_objects_absolute_max = 25000;

      RGWRados::Bucket *target;
      rgw_obj_key next_marker;  // resume point for the next page

      int list_objects_ordered(const DoutPrefixProvider *dpp,
                               int64_t max,
                              std::vector<rgw_bucket_dir_entry> *result,
                              std::map<std::string, bool> *common_prefixes,
                              bool *is_truncated,
                               optional_yield y);
      int list_objects_unordered(const DoutPrefixProvider *dpp,
                                 int64_t max,
                                std::vector<rgw_bucket_dir_entry> *result,
                                std::map<std::string, bool> *common_prefixes,
                                bool *is_truncated,
                                 optional_yield y);

    public:

      struct Params {
        std::string prefix;
        std::string delim;
        rgw_obj_key marker;      // start listing after this key
        rgw_obj_key end_marker;
        std::string ns;
        bool enforce_ns;
        RGWAccessListFilter* access_list_filter;
       RGWBucketListNameFilter force_check_filter;
        bool list_versions;
       bool allow_unordered;

        Params() :
         enforce_ns(true),
         access_list_filter(nullptr),
         list_versions(false),
         allow_unordered(false)
       {}
      } params;

      explicit List(RGWRados::Bucket *_target) : target(_target) {}

      // Dispatch to the ordered or unordered implementation based on
      // params.allow_unordered.
      int list_objects(const DoutPrefixProvider *dpp, int64_t max,
                      std::vector<rgw_bucket_dir_entry> *result,
                      std::map<std::string, bool> *common_prefixes,
                      bool *is_truncated,
                       optional_yield y) {
       if (params.allow_unordered) {
         return list_objects_unordered(dpp, max, result, common_prefixes,
                                       is_truncated, y);
       } else {
         return list_objects_ordered(dpp, max, result, common_prefixes,
                                     is_truncated, y);
       }
      }
      rgw_obj_key& get_next_marker() {
        return next_marker;
      }
    }; // class List
  }; // class Bucket
-
  /// NOTE(review): presumably invokes 'handler' on the last entry of a
  /// prefix/delimiter-bounded listing -- confirm against the implementation.
  int on_last_entry_in_listing(const DoutPrefixProvider *dpp,
                               RGWBucketInfo& bucket_info,
                               const std::string& obj_prefix,
                               const std::string& obj_delim,
                               std::function<int(const rgw_bucket_dir_entry&)> handler);

  /// Whether Swift object versioning applies to this bucket.
  bool swift_versioning_enabled(rgw::sal::Bucket* bucket) const;

  /// Swift versioning: archive the current object before it is overwritten.
  int swift_versioning_copy(RGWObjectCtx& obj_ctx,              /* in/out */
                            const rgw_user& user,               /* in */
                            rgw::sal::Bucket* bucket,        /* in */
                            rgw::sal::Object* obj,           /* in */
                            const DoutPrefixProvider *dpp,      /* in/out */ 
                            optional_yield y);                  /* in */                
  /// Swift versioning: restore the archived version of 'obj'; 'restored'
  /// reports whether anything was restored.
  int swift_versioning_restore(RGWObjectCtx& obj_ctx,           /* in/out */
                               const rgw_user& user,            /* in */
                               rgw::sal::Bucket* bucket,     /* in */
                               rgw::sal::Object* obj,        /* in */
                               bool& restored,                 /* out */
                               const DoutPrefixProvider *dpp);     /* in/out */                
  /// Stream an object (via 'read_op') to a destination in a remote zone.
  int copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
                              RGWObjState *astate,
                              std::map<std::string, bufferlist>& src_attrs,
                              RGWRados::Object::Read& read_op,
                              const rgw_user& user_id,
                              rgw::sal::Object* dest_obj,
                              ceph::real_time *mtime);

  // Attribute handling mode for copy/fetch; semantics are documented on
  // copy_obj() below.
  enum AttrsMod {
    ATTRSMOD_NONE    = 0,
    ATTRSMOD_REPLACE = 1,
    ATTRSMOD_MERGE   = 2
  };

  D3nDataCache* d3n_data_cache{nullptr};  // D3N data cache; null when disabled

  int rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y);

  /// Stat an object living in a remote zone (metadata only, no data copy).
  int stat_remote_obj(const DoutPrefixProvider *dpp,
               RGWObjectCtx& obj_ctx,
               const rgw_user& user_id,
               req_info *info,
               const rgw_zone_id& source_zone,
               rgw::sal::Object* src_obj,
               const RGWBucketInfo *src_bucket_info,
               real_time *src_mtime,
               uint64_t *psize,
               const real_time *mod_ptr,
               const real_time *unmod_ptr,
               bool high_precision_time,
               const char *if_match,
               const char *if_nomatch,
               std::map<std::string, bufferlist> *pattrs,
               std::map<std::string, std::string> *pheaders,
               std::string *version_id,
               std::string *ptag,
               std::string *petag);
-
-  int fetch_remote_obj(RGWObjectCtx& obj_ctx,
-                       const rgw_user& user_id,
-                       req_info *info,
-                       const rgw_zone_id& source_zone,
-                       rgw::sal::Object* dest_obj,
-                       rgw::sal::Object* src_obj,
-                      rgw::sal::Bucket* dest_bucket,
-                      rgw::sal::Bucket* src_bucket,
-                      std::optional<rgw_placement_rule> dest_placement,
-                       ceph::real_time *src_mtime,
-                       ceph::real_time *mtime,
-                       const ceph::real_time *mod_ptr,
-                       const ceph::real_time *unmod_ptr,
-                       bool high_precision_time,
-                       const char *if_match,
-                       const char *if_nomatch,
-                       AttrsMod attrs_mod,
-                       bool copy_if_newer,
-                       rgw::sal::Attrs& attrs,
-                       RGWObjCategory category,
-                       std::optional<uint64_t> olh_epoch,
-                      ceph::real_time delete_at,
-                       std::string *ptag,
-                       std::string *petag,
-                       void (*progress_cb)(off_t, void *),
-                       void *progress_data,
-                       const DoutPrefixProvider *dpp,
-                       RGWFetchObjFilter *filter,
-                       rgw_zone_set *zones_trace= nullptr,
-                       std::optional<uint64_t>* bytes_transferred = 0);
  /**
   * Copy an object.
   * dest_obj: the object to copy into
   * src_obj: the object to copy from
   * attrs: usage depends on attrs_mod parameter
   * attrs_mod: the modification mode of the attrs, may have the following values:
   *            ATTRSMOD_NONE - the attributes of the source object will be
   *                            copied without modifications, attrs parameter is ignored;
   *            ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
   *                               parameter, source object attributes are not copied;
   *            ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
   *                             are overwritten by values contained in attrs parameter.
   * Returns: 0 on success, -ERR# otherwise.
   */
  int copy_obj(RGWObjectCtx& obj_ctx,
               const rgw_user& user_id,
               req_info *info,
               const rgw_zone_id& source_zone,
               rgw::sal::Object* dest_obj,
               rgw::sal::Object* src_obj,
               rgw::sal::Bucket* dest_bucket,
               rgw::sal::Bucket* src_bucket,
               const rgw_placement_rule& dest_placement,
               ceph::real_time *src_mtime,
               ceph::real_time *mtime,
               const ceph::real_time *mod_ptr,
               const ceph::real_time *unmod_ptr,
               bool high_precision_time,
               const char *if_match,
               const char *if_nomatch,
               AttrsMod attrs_mod,
               bool copy_if_newer,
               std::map<std::string, bufferlist>& attrs,
               RGWObjCategory category,
               uint64_t olh_epoch,
              ceph::real_time delete_at,
               std::string *version_id,
               std::string *ptag,
               std::string *petag,
               void (*progress_cb)(off_t, void *),
               void *progress_data,
               const DoutPrefixProvider *dpp,
               optional_yield y);

  /// Copy object data streamed through 'read_op' (up to offset 'end') into
  /// 'dest_obj', writing the given attrs.
  int copy_obj_data(RGWObjectCtx& obj_ctx,
               rgw::sal::Bucket* bucket,
               const rgw_placement_rule& dest_placement,
              RGWRados::Object::Read& read_op, off_t end,
               rgw::sal::Object* dest_obj,
              ceph::real_time *mtime,
              ceph::real_time set_mtime,
               std::map<std::string, bufferlist>& attrs,
               uint64_t olh_epoch,
              ceph::real_time delete_at,
               std::string *petag,
               const DoutPrefixProvider *dpp,
               optional_yield y);
  
  /// Move an object to a different placement rule (NOTE(review): presumably
  /// the lifecycle storage-class transition path -- confirm).
  int transition_obj(RGWObjectCtx& obj_ctx,
                     rgw::sal::Bucket* bucket,
                     rgw::sal::Object& obj,
                     const rgw_placement_rule& placement_rule,
                     const real_time& mtime,
                     uint64_t olh_epoch,
                     const DoutPrefixProvider *dpp,
                     optional_yield y);

  /// Check whether the bucket holds no objects; non-zero return means
  /// non-empty or error.
  int check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y);

  /**
   * Delete a bucket.
   * bucket: the name of the bucket to delete
   * Returns 0 on success, -ERR# otherwise.
   */
  int delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty = true);
-
  /// Notify metadata sync that the given shards have new work.
  void wakeup_meta_sync_shards(std::set<int>& shard_ids);

  /// Notify data sync (per source zone) of new entries on the given shards.
  void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries);

  RGWMetaSyncStatusManager* get_meta_sync_manager();
  RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone);

  int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp);
  int set_buckets_enabled(std::vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp);
  int bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended);

  /** Delete an object.*/
  int delete_obj(rgw::sal::Driver* driver,
                const DoutPrefixProvider *dpp,
                const RGWBucketInfo& bucket_owner,
                const rgw_obj& src_obj,
                int versioning_status,  // versioning flags defined in enum RGWBucketFlags
                uint16_t bilog_flags = 0,
                const ceph::real_time& expiration_time = ceph::real_time(),
                rgw_zone_set *zones_trace = nullptr);
  int delete_obj(const DoutPrefixProvider *dpp,
                const RGWBucketInfo& bucket_owner,
                rgw::sal::Object* src_obj,
                int versioning_status,  // versioning flags defined in enum RGWBucketFlags
                uint16_t bilog_flags = 0,
                const ceph::real_time& expiration_time = ceph::real_time(),
                rgw_zone_set *zones_trace = nullptr);

  /// Delete a raw RADOS object directly (no bucket-index bookkeeping).
  int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj);

  /** Remove an object from the bucket index */
  int delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp);

  /**
   * Set an attr on an object.
   * bucket: name of the bucket holding the object
   * obj: name of the object to set the attr on
   * name: the attr to set
   * bl: the contents of the attr
   * Returns: 0 on success, -ERR# otherwise.
   */
  int set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl);

  /// Set/remove multiple attrs in one operation.
  int set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
                        std::map<std::string, bufferlist>& attrs,
                        std::map<std::string, bufferlist>* rmattrs,
                        optional_yield y);

  /// Load object state/manifest; 'follow_olh' resolves the versioned-object
  /// (OLH) indirection.
  int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
                    bool follow_olh, optional_yield y, bool assume_noent = false);
  // Convenience overload: always follows the OLH.
  int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, optional_yield y) {
    return get_obj_state(dpp, rctx, bucket_info, obj, state, manifest, true, y);
  }

  // Callback invoked per raw chunk by iterate_obj().
  using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const rgw_raw_obj&, off_t, off_t,
                                 off_t, bool, RGWObjState*, void*);

  /// Walk an object's raw chunks in [ofs, end], invoking 'cb' per chunk.
  int iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& ctx, RGWBucketInfo& bucket_info,
                  rgw::sal::Object* obj, off_t ofs, off_t end,
                  uint64_t max_chunk_size, iterate_obj_cb cb, void *arg,
                  optional_yield y);

  /// Append the atomicity guard (based on 'astate') to a RADOS op.
  int append_atomic_test(const DoutPrefixProvider *dpp, const RGWObjState* astate, librados::ObjectOperation& op);

  virtual int get_obj_iterate_cb(const DoutPrefixProvider *dpp,
                         const rgw_raw_obj& read_obj, off_t obj_ofs,
                         off_t read_ofs, off_t len, bool is_head_obj,
                         RGWObjState *astate, void *arg);

  /**
   * a simple object read without keeping state
   */

  int raw_obj_stat(const DoutPrefixProvider *dpp,
                   rgw_raw_obj& obj, uint64_t *psize, ceph::real_time *pmtime, uint64_t *epoch,
                   std::map<std::string, bufferlist> *attrs, bufferlist *first_chunk,
                   RGWObjVersionTracker *objv_tracker, optional_yield y);

  int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op);
  int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op);

  /// Run 'call' against the shard, retrying if the bucket is resharding.
  int guard_reshard(const DoutPrefixProvider *dpp,
                    BucketShard *bs,
                   const rgw_obj& obj_instance,
                   RGWBucketInfo& bucket_info,
                   std::function<int(BucketShard *)> call);
  /// Wait until an in-progress reshard of this bucket completes.
  int block_while_resharding(RGWRados::BucketShard *bs,
                             const rgw_obj& obj_instance,
                            RGWBucketInfo& bucket_info,
                             optional_yield y,
                             const DoutPrefixProvider *dpp);

  // --- OLH (versioned-object head) operations ---
  void bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, librados::ObjectOperation& op);
  int olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
  int olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
  int bucket_index_link_olh(const DoutPrefixProvider *dpp,
                            RGWBucketInfo& bucket_info, RGWObjState& olh_state,
                            const rgw_obj& obj_instance, bool delete_marker,
                            const std::string& op_tag, struct rgw_bucket_dir_entry_meta *meta,
                            uint64_t olh_epoch,
                            ceph::real_time unmod_since, bool high_precision_time,
                            rgw_zone_set *zones_trace = nullptr,
                            bool log_data_change = false);
  int bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
                                   RGWBucketInfo& bucket_info,
                                   const rgw_obj& obj_instance,
                                   const std::string& op_tag, const std::string& olh_tag,
                                   uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr);
  int bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
                                RGWBucketInfo& bucket_info, RGWObjState& state,
                                const rgw_obj& obj_instance, uint64_t ver_marker,
                                std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log, bool *is_truncated);
  int bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& obj_state, const rgw_obj& obj_instance, uint64_t ver);
  int bucket_index_clear_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance);
  int apply_olh_log(const DoutPrefixProvider *dpp, RGWObjState& obj_state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj,
                    bufferlist& obj_tag, std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
                    uint64_t *plast_ver, rgw_zone_set *zones_trace = nullptr);
  int update_olh(const DoutPrefixProvider *dpp, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, rgw_zone_set *zones_trace = nullptr);
  int set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
              uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time,
              optional_yield y, rgw_zone_set *zones_trace = nullptr, bool log_data_change = false);
  int repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
                 const rgw_obj& obj);
  int unlink_obj_instance(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj,
                          uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace = nullptr);

  void check_pending_olh_entries(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist>& pending_entries, std::map<std::string, bufferlist> *rm_pending_entries);
  int remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::map<std::string, bufferlist>& pending_attrs);
  int follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target);
  int get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh);

  /// Generate a random version-instance id for the key / object.
  void gen_rand_obj_instance_name(rgw_obj_key *target_key);
  void gen_rand_obj_instance_name(rgw_obj *target);

  int update_containers_stats(std::map<std::string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp);
  /// Asynchronously append 'bl' to a raw object.
  int append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl);
-
-public:
-  void set_atomic(void *ctx, rgw_obj& obj) {
-    RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
-    rctx->set_atomic(obj);
-  }
-  void set_prefetch_data(void *ctx, const rgw_obj& obj) {
-    RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
-    rctx->set_prefetch_data(obj);
-  }
-  void set_compressed(void *ctx, const rgw_obj& obj) {
-    RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
-    rctx->set_compressed(obj);
-  }
-  int decode_policy(const DoutPrefixProvider *dpp, bufferlist& bl, ACLOwner *owner);
-  int get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, std::string *bucket_ver, std::string *master_ver,
-      std::map<RGWObjCategory, RGWStorageStats>& stats, std::string *max_marker, bool* syncstopped = NULL);
-  int get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *cb);
-
-  int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, std::map<std::string, bufferlist> *pattrs, const DoutPrefixProvider *dpp);
-  /* xxx dang obj_ctx -> svc */
-  int get_bucket_instance_info(const std::string& meta_key, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
-  int get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
-
-  static void make_bucket_entry_name(const std::string& tenant_name, const std::string& bucket_name, std::string& bucket_entry);
-
-  int get_bucket_info(RGWServices *svc,
-                     const std::string& tenant_name, const std::string& bucket_name,
-                     RGWBucketInfo& info,
-                     ceph::real_time *pmtime, optional_yield y,
-                      const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *pattrs = NULL);
-
-  // Returns 0 on successful refresh. Returns error code if there was
-  // an error or the version stored on the OSD is the same as that
-  // presented in the BucketInfo structure.
-  //
-  int try_refresh_bucket_info(RGWBucketInfo& info,
-                             ceph::real_time *pmtime,
-                              const DoutPrefixProvider *dpp,
-                             std::map<std::string, bufferlist> *pattrs = nullptr);
-
-  int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv,
-                            std::map<std::string, bufferlist> *pattrs, bool create_entry_point,
-                             const DoutPrefixProvider *dpp);
-
-  int cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, std::string& tag, rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *zones_trace = nullptr);
-  int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, std::string& tag, int64_t pool, uint64_t epoch,
-                          rgw_bucket_dir_entry& ent, RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
-  int cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, std::string& tag, int64_t pool, uint64_t epoch, rgw_bucket_dir_entry& ent,
-                           RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
-  int cls_obj_complete_del(BucketShard& bs, std::string& tag, int64_t pool, uint64_t epoch, rgw_obj& obj,
-                           ceph::real_time& removed_mtime, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
-  int cls_obj_complete_cancel(BucketShard& bs, std::string& tag, rgw_obj& obj,
-                              std::list<rgw_obj_index_key> *remove_objs,
-                              uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
-  int cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout);
-
-  using ent_map_t =
-    boost::container::flat_map<std::string, rgw_bucket_dir_entry>;
-
-  int cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
-                              RGWBucketInfo& bucket_info,
-                              const rgw::bucket_index_layout_generation& idx_layout,
-                              const int shard_id,
-                             const rgw_obj_index_key& start_after,
-                             const std::string& prefix,
-                             const std::string& delimiter,
-                             const uint32_t num_entries,
-                             const bool list_versions,
-                             const uint16_t exp_factor, // 0 means ignore
-                             ent_map_t& m,
-                             bool* is_truncated,
-                             bool* cls_filtered,
-                             rgw_obj_index_key *last_entry,
-                              optional_yield y,
-                             RGWBucketListNameFilter force_check_filter = {});
-  int cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
-                                RGWBucketInfo& bucket_info,
-                                const rgw::bucket_index_layout_generation& idx_layout,
-                                int shard_id,
-                               const rgw_obj_index_key& start_after,
-                               const std::string& prefix,
-                               uint32_t num_entries,
-                               bool list_versions,
-                               std::vector<rgw_bucket_dir_entry>& ent_list,
-                               bool *is_truncated,
-                               rgw_obj_index_key *last_entry,
-                                optional_yield y,
-                               RGWBucketListNameFilter force_check_filter = {});
-  int cls_bucket_head(const DoutPrefixProvider *dpp,
-                     const RGWBucketInfo& bucket_info,
-                     const rgw::bucket_index_layout_generation& idx_layout,
-                     int shard_id, std::vector<rgw_bucket_dir_header>& headers,
-                     std::map<int, std::string> *bucket_instance_ids = NULL);
-  int cls_bucket_head_async(const DoutPrefixProvider *dpp,
-                           const RGWBucketInfo& bucket_info,
-                           const rgw::bucket_index_layout_generation& idx_layout,
-                           int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio);
-  int bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_dir_entry *dirent);
-  int bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_olh_entry *olh);
-  int bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry);
-  void bi_put(librados::ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry);
-  int bi_put(BucketShard& bs, rgw_cls_bi_entry& entry);
-  int bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry);
-  int bi_list(const DoutPrefixProvider *dpp,
-             const RGWBucketInfo& bucket_info,
-             int shard_id,
-             const std::string& filter_obj,
-             const std::string& marker,
-             uint32_t max,
-             std::list<rgw_cls_bi_entry> *entries,
-             bool *is_truncated);
-  int bi_list(BucketShard& bs, const std::string& filter_obj, const std::string& marker, uint32_t max, std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
-  int bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, const std::string& obj_name, const std::string& marker, uint32_t max,
-              std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
-  int bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs);
-
-  int cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const std::string& oid, rgw_usage_log_info& info);
-  int cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
-                             uint64_t end_epoch, uint32_t max_entries, std::string& read_iter,
-                            std::map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated);
-  int cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
-                             uint64_t end_epoch);
-  int cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, std::string& oid);
-
-  int get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const std::string& obj_key, int *shard_id);
-
-  int lock_exclusive(const rgw_pool& pool, const std::string& oid, ceph::timespan& duration, rgw_zone_id& zone_id, std::string& owner_id);
-  int unlock(const rgw_pool& pool, const std::string& oid, rgw_zone_id& zone_id, std::string& owner_id);
-
-  void update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain);
-  std::tuple<int, std::optional<cls_rgw_obj_chain>> send_chain_to_gc(cls_rgw_obj_chain& chain, const std::string& tag);
-  void delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const std::string& tag);
-  int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectWriteOperation *op);
-  int gc_aio_operate(const std::string& oid, librados::AioCompletion *c,
-                     librados::ObjectWriteOperation *op);
-  int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectReadOperation *op, bufferlist *pbl);
-
-  int list_gc_objs(int *index, std::string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue);
-  int process_gc(bool expired_only);
-  bool process_expire_objects(const DoutPrefixProvider *dpp);
-  int defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, optional_yield y);
-
-  int process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket);
-  int list_lc_progress(std::string& marker, uint32_t max_entries,
-                      std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
-                      int& index);
-
-  int bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
-                         std::map<RGWObjCategory, RGWStorageStats> *existing_stats,
-                         std::map<RGWObjCategory, RGWStorageStats> *calculated_stats);
-  int bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info);
-  int bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry);
-  int remove_objs_from_index(const DoutPrefixProvider *dpp,
-                            RGWBucketInfo& bucket_info,
-                            const std::list<rgw_obj_index_key>& oid_list);
-  int move_rados_obj(const DoutPrefixProvider *dpp,
-                     librados::IoCtx& src_ioctx,
-                    const std::string& src_oid, const std::string& src_locator,
-                    librados::IoCtx& dst_ioctx,
-                    const std::string& dst_oid, const std::string& dst_locator);
-  int fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key);
-  int fix_tail_obj_locator(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
-                           rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y);
-
-  int check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
-                  RGWQuota& quota, uint64_t obj_size,
-                 optional_yield y, bool check_size_only = false);
-
-  int check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
-                         uint64_t num_objs, const DoutPrefixProvider *dpp);
-
-  int add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards);
-
-  uint64_t instance_id();
-
-  librados::Rados* get_rados_handle();
-
-  int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, std::list<librados::AioCompletion *>& handles);
-  int delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, RGWBucketInfo& info, RGWObjState *astate,
-                     std::list<librados::AioCompletion *>& handles, bool keep_index_consistent,
-                     optional_yield y);
-
- private:
-  /**
-   * Check the actual on-disk state of the object specified
-   * by list_state, and fill in the time and size of object.
-   * Then append any changes to suggested_updates for
-   * the rgw class' dir_suggest_changes function.
-   *
-   * Note that this can maul list_state; don't use it afterwards. Also
-   * it expects object to already be filled in from list_state; it only
-   * sets the size and mtime.
-   *
-   * Returns 0 on success, -ENOENT if the object doesn't exist on disk,
-   * and -errno on other failures. (-ENOENT is not a failure, and it
-   * will encode that info as a suggested update.)
-   */
-  int check_disk_state(const DoutPrefixProvider *dpp,
-                       librados::IoCtx io_ctx,
-                       RGWBucketInfo& bucket_info,
-                       rgw_bucket_dir_entry& list_state,
-                       rgw_bucket_dir_entry& object,
-                       bufferlist& suggested_updates,
-                       optional_yield y);
-
-  /**
-   * Init pool iteration
-   * pool: pool to use for the ctx initialization
-   * ctx: context object to use for the iteration
-   * Returns: 0 on success, -ERR# otherwise.
-   */
-  int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx);
-
-  /**
-   * Init pool iteration
-   * pool: pool to use
-   * cursor: position to start iteration
-   * ctx: context object to use for the iteration
-   * Returns: 0 on success, -ERR# otherwise.
-   */
-  int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& cursor, RGWPoolIterCtx& ctx);
-
-  /**
-   * Get pool iteration position
-   * ctx: context object to use for the iteration
-   * Returns: std::string representation of position
-   */
-  std::string pool_iterate_get_cursor(RGWPoolIterCtx& ctx);
-
-  /**
-   * Iterate over pool return object names, use optional filter
-   * ctx: iteration context, initialized with pool_iterate_begin()
-   * num: max number of objects to return
-   * objs: a vector that the results will append into
-   * is_truncated: if not NULL, will hold true iff iteration is complete
-   * filter: if not NULL, will be used to filter returned objects
-   * Returns: 0 on success, -ERR# otherwise.
-   */
-  int pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num,
-                  std::vector<rgw_bucket_dir_entry>& objs,
-                   bool *is_truncated, RGWAccessListFilter *filter);
-
-  uint64_t next_bucket_id();
-
-  /**
-   * This is broken out to facilitate unit testing.
-   */
-  static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries,
-                                                    uint32_t num_shards);
-};
-
-
-struct get_obj_data {
-  RGWRados* rgwrados;
-  RGWGetDataCB* client_cb = nullptr;
-  rgw::Aio* aio;
-  uint64_t offset; // next offset to write to client
-  rgw::AioResultList completed; // completed read results, sorted by offset
-  optional_yield yield;
-
-  get_obj_data(RGWRados* rgwrados, RGWGetDataCB* cb, rgw::Aio* aio,
-               uint64_t offset, optional_yield yield)
-               : rgwrados(rgwrados), client_cb(cb), aio(aio), offset(offset), yield(yield) {}
-  ~get_obj_data() {
-    if (rgwrados->get_use_datacache()) {
-      const std::lock_guard l(d3n_get_data.d3n_lock);
-    }
-  }
-
-  D3nGetObjData d3n_get_data;
-  std::atomic_bool d3n_bypass_cache_write{false};
-
-  int flush(rgw::AioResultList&& results);
-
-  void cancel() {
-    // wait for all completions to drain and ignore the results
-    aio->drain();
-  }
-
-  int drain() {
-    auto c = aio->wait();
-    while (!c.empty()) {
-      int r = flush(std::move(c));
-      if (r < 0) {
-        cancel();
-        return r;
-      }
-      c = aio->wait();
-    }
-    return flush(std::move(c));
-  }
-};
-
-
-#endif
index 0a28cff18603e30d6396a155038142feb9bd7f30..25082a2e490d78cdaf8ac4f3a8bdaa0d854a392f 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_REALM_RELOADER_H
-#define RGW_REALM_RELOADER_H
+#pragma once
 
 #include "rgw_realm_watcher.h"
 #include "common/Cond.h"
@@ -63,5 +62,3 @@ class RGWRealmReloader : public RGWRealmWatcher::Watcher {
   ceph::condition_variable cond; //< to signal reload() after an invalid realm config
   C_Reload* reload_scheduled; //< reload() context if scheduled
 };
-
-#endif // RGW_REALM_RELOADER_H
index b2e3ac6b9d649fb369693921075ff579c55ebc6f..2a0c0d0769900f57d6d322eae2745da3b0c36d58 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_REALM_WATCHER_H
-#define RGW_REALM_WATCHER_H
+#pragma once
 
 #include "include/rados/librados.hpp"
 #include "include/ceph_assert.h"
@@ -65,5 +64,3 @@ class RGWRealmWatcher : public librados::WatchCtx2 {
 
   std::map<RGWRealmNotify, Watcher&> watchers;
 };
-
-#endif // RGW_REALM_WATCHER_H
index ed54dca595ec45850271e58087799b87e24b7d6b..cd05f51c942422d9a1dae8aab946cb0bb693bf7e 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_REQUEST_H
-#define RGW_REQUEST_H
+#pragma once
 
 #include "rgw_common.h"
 #include "rgw_acl.h"
@@ -39,5 +38,3 @@ RGWLoadGenRequest(uint64_t req_id, const std::string& _m, const std::string& _r,
        : RGWRequest(req_id), method(_m), resource(_r), content_length(_cl),
                fail_flag(ff) {}
 };
-
-#endif /* RGW_REQUEST_H */
diff --git a/src/rgw/rgw_reshard.cc b/src/rgw/rgw_reshard.cc
deleted file mode 100644 (file)
index b2dec7a..0000000
+++ /dev/null
@@ -1,1407 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include <limits>
-#include <sstream>
-
-#include "rgw_zone.h"
-#include "driver/rados/rgw_bucket.h"
-#include "rgw_reshard.h"
-#include "rgw_sal.h"
-#include "rgw_sal_rados.h"
-#include "cls/rgw/cls_rgw_client.h"
-#include "cls/lock/cls_lock_client.h"
-#include "common/errno.h"
-#include "common/ceph_json.h"
-
-#include "common/dout.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_sys_obj.h"
-#include "services/svc_tier_rados.h"
-#include "services/svc_bilog_rados.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-const string reshard_oid_prefix = "reshard.";
-const string reshard_lock_name = "reshard_process";
-const string bucket_instance_lock_name = "bucket_instance_lock";
-
-/* All primes up to 2000 used to attempt to make dynamic sharding use
- * a prime numbers of shards. Note: this list also includes 1 for when
- * 1 shard is the most appropriate, even though 1 is not prime.
- */
-const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
-  1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61,
-  67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
-  139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211,
-  223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283,
-  293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379,
-  383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461,
-  463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563,
-  569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643,
-  647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739,
-  743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829,
-  839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937,
-  941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021,
-  1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093,
-  1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181,
-  1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259,
-  1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
-  1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433,
-  1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493,
-  1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579,
-  1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
-  1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741,
-  1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811, 1823, 1831,
-  1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913,
-  1931, 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997, 1999
-};
-
-class BucketReshardShard {
-  rgw::sal::RadosStore* store;
-  const RGWBucketInfo& bucket_info;
-  int shard_id;
-  RGWRados::BucketShard bs;
-  vector<rgw_cls_bi_entry> entries;
-  map<RGWObjCategory, rgw_bucket_category_stats> stats;
-  deque<librados::AioCompletion *>& aio_completions;
-  uint64_t max_aio_completions;
-  uint64_t reshard_shard_batch_size;
-
-  int wait_next_completion() {
-    librados::AioCompletion *c = aio_completions.front();
-    aio_completions.pop_front();
-
-    c->wait_for_complete();
-
-    int ret = c->get_return_value();
-    c->release();
-
-    if (ret < 0) {
-      derr << "ERROR: reshard rados operation failed: " << cpp_strerror(-ret) << dendl;
-      return ret;
-    }
-
-    return 0;
-  }
-
-  int get_completion(librados::AioCompletion **c) {
-    if (aio_completions.size() >= max_aio_completions) {
-      int ret = wait_next_completion();
-      if (ret < 0) {
-        return ret;
-      }
-    }
-
-    *c = librados::Rados::aio_create_completion(nullptr, nullptr);
-    aio_completions.push_back(*c);
-
-    return 0;
-  }
-
-public:
-  BucketReshardShard(const DoutPrefixProvider *dpp,
-                    rgw::sal::RadosStore *_store, const RGWBucketInfo& _bucket_info,
-                     const rgw::bucket_index_layout_generation& index,
-                     int shard_id, deque<librados::AioCompletion *>& _completions) :
-    store(_store), bucket_info(_bucket_info), shard_id(shard_id),
-    bs(store->getRados()), aio_completions(_completions)
-  {
-    bs.init(dpp, bucket_info, index, shard_id);
-
-    max_aio_completions =
-      store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_max_aio");
-    reshard_shard_batch_size =
-      store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_batch_size");
-  }
-
-  int get_shard_id() const {
-    return shard_id;
-  }
-
-  int add_entry(rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
-                const rgw_bucket_category_stats& entry_stats) {
-    entries.push_back(entry);
-    if (account) {
-      rgw_bucket_category_stats& target = stats[category];
-      target.num_entries += entry_stats.num_entries;
-      target.total_size += entry_stats.total_size;
-      target.total_size_rounded += entry_stats.total_size_rounded;
-      target.actual_size += entry_stats.actual_size;
-    }
-    if (entries.size() >= reshard_shard_batch_size) {
-      int ret = flush();
-      if (ret < 0) {
-        return ret;
-      }
-    }
-
-    return 0;
-  }
-
-  int flush() {
-    if (entries.size() == 0) {
-      return 0;
-    }
-
-    librados::ObjectWriteOperation op;
-    for (auto& entry : entries) {
-      store->getRados()->bi_put(op, bs, entry);
-    }
-    cls_rgw_bucket_update_stats(op, false, stats);
-
-    librados::AioCompletion *c;
-    int ret = get_completion(&c);
-    if (ret < 0) {
-      return ret;
-    }
-    ret = bs.bucket_obj.aio_operate(c, &op);
-    if (ret < 0) {
-      derr << "ERROR: failed to store entries in target bucket shard (bs=" << bs.bucket << "/" << bs.shard_id << ") error=" << cpp_strerror(-ret) << dendl;
-      return ret;
-    }
-    entries.clear();
-    stats.clear();
-    return 0;
-  }
-
-  int wait_all_aio() {
-    int ret = 0;
-    while (!aio_completions.empty()) {
-      int r = wait_next_completion();
-      if (r < 0) {
-        ret = r;
-      }
-    }
-    return ret;
-  }
-}; // class BucketReshardShard
-
-
-class BucketReshardManager {
-  rgw::sal::RadosStore *store;
-  deque<librados::AioCompletion *> completions;
-  vector<BucketReshardShard> target_shards;
-
-public:
-  BucketReshardManager(const DoutPrefixProvider *dpp,
-                      rgw::sal::RadosStore *_store,
-                      const RGWBucketInfo& bucket_info,
-                       const rgw::bucket_index_layout_generation& target)
-    : store(_store)
-  {
-    const int num_shards = target.layout.normal.num_shards;
-    target_shards.reserve(num_shards);
-    for (int i = 0; i < num_shards; ++i) {
-      target_shards.emplace_back(dpp, store, bucket_info, target, i, completions);
-    }
-  }
-
-  ~BucketReshardManager() {
-    for (auto& shard : target_shards) {
-      int ret = shard.wait_all_aio();
-      if (ret < 0) {
-        ldout(store->ctx(), 20) << __func__ <<
-         ": shard->wait_all_aio() returned ret=" << ret << dendl;
-      }
-    }
-  }
-
-  int add_entry(int shard_index,
-                rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
-                const rgw_bucket_category_stats& entry_stats) {
-    int ret = target_shards[shard_index].add_entry(entry, account, category,
-                                                  entry_stats);
-    if (ret < 0) {
-      derr << "ERROR: target_shards.add_entry(" << entry.idx <<
-       ") returned error: " << cpp_strerror(-ret) << dendl;
-      return ret;
-    }
-
-    return 0;
-  }
-
-  int finish() {
-    int ret = 0;
-    for (auto& shard : target_shards) {
-      int r = shard.flush();
-      if (r < 0) {
-        derr << "ERROR: target_shards[" << shard.get_shard_id() << "].flush() returned error: " << cpp_strerror(-r) << dendl;
-        ret = r;
-      }
-    }
-    for (auto& shard : target_shards) {
-      int r = shard.wait_all_aio();
-      if (r < 0) {
-        derr << "ERROR: target_shards[" << shard.get_shard_id() << "].wait_all_aio() returned error: " << cpp_strerror(-r) << dendl;
-        ret = r;
-      }
-    }
-    target_shards.clear();
-    return ret;
-  }
-}; // class BucketReshardManager
-
-RGWBucketReshard::RGWBucketReshard(rgw::sal::RadosStore* _store,
-                                  const RGWBucketInfo& _bucket_info,
-                                  const std::map<std::string, bufferlist>& _bucket_attrs,
-                                  RGWBucketReshardLock* _outer_reshard_lock) :
-  store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs),
-  reshard_lock(store, bucket_info, true),
-  outer_reshard_lock(_outer_reshard_lock)
-{ }
-
-// sets reshard status of bucket index shards for the current index layout
-static int set_resharding_status(const DoutPrefixProvider *dpp,
-                                rgw::sal::RadosStore* store,
-                                const RGWBucketInfo& bucket_info,
-                                 cls_rgw_reshard_status status)
-{
-  cls_rgw_bucket_instance_entry instance_entry;
-  instance_entry.set_status(status);
-
-  int ret = store->getRados()->bucket_set_reshard(dpp, bucket_info, instance_entry);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: "
-                 << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-  return 0;
-}
-
-static int remove_old_reshard_instance(rgw::sal::RadosStore* store,
-                                       const rgw_bucket& bucket,
-                                       const DoutPrefixProvider* dpp)
-{
-  RGWBucketInfo info;
-  int r = store->getRados()->get_bucket_instance_info(bucket, info, nullptr,
-                                                      nullptr, null_yield, dpp);
-  if (r < 0) {
-    return r;
-  }
-
-  // delete its shard objects (ignore errors)
-  store->svc()->bi->clean_index(dpp, info, info.layout.current_index);
-  // delete the bucket instance metadata
-  return store->ctl()->bucket->remove_bucket_instance_info(bucket, info, null_yield, dpp);
-}
-
-// initialize the new bucket index shard objects
-static int init_target_index(rgw::sal::RadosStore* store,
-                             RGWBucketInfo& bucket_info,
-                             const rgw::bucket_index_layout_generation& index,
-                             const DoutPrefixProvider* dpp)
-{
-  int ret = store->svc()->bi->init_index(dpp, bucket_info, index);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to initialize "
-       "target index shard objects: " << cpp_strerror(ret) << dendl;
-    return ret;
-  }
-
-  if (!bucket_info.datasync_flag_enabled()) {
-    // if bucket sync is disabled, disable it on each of the new shards too
-    auto log = rgw::log_layout_from_index(0, index);
-    ret = store->svc()->bilog_rados->log_stop(dpp, bucket_info, log, -1);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to disable "
-          "bucket sync on the target index shard objects: "
-          << cpp_strerror(ret) << dendl;
-      store->svc()->bi->clean_index(dpp, bucket_info, index);
-      return ret;
-    }
-  }
-
-  return ret;
-}
-
// initialize a target index layout, create its bucket index shard objects, and
// write the target layout to the bucket instance metadata
//
// On success, bucket_info.layout.target_index is set and resharding state is
// InProgress. On failure, the in-memory layout is restored and any shard
// objects created here are cleaned up (best effort).
static int init_target_layout(rgw::sal::RadosStore* store,
                              RGWBucketInfo& bucket_info,
                             std::map<std::string, bufferlist>& bucket_attrs,
                              ReshardFaultInjector& fault,
                              uint32_t new_num_shards,
                              const DoutPrefixProvider* dpp)
{
  auto prev = bucket_info.layout; // make a copy for cleanup
  const auto current = prev.current_index;

  // initialize a new normal target index layout generation
  rgw::bucket_index_layout_generation target;
  target.layout.type = rgw::BucketIndexType::Normal;
  target.layout.normal.num_shards = new_num_shards;
  target.gen = current.gen + 1;

  if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) {
    // backward-compatible cleanup of old reshards, where the target was in a
    // different bucket instance
    if (!bucket_info.new_bucket_instance_id.empty()) {
      rgw_bucket new_bucket = bucket_info.bucket;
      new_bucket.bucket_id = bucket_info.new_bucket_instance_id;
      ldout(store->ctx(), 10) << __func__ << " removing target bucket instance "
          "from a previous reshard attempt" << dendl;
      // ignore errors
      remove_old_reshard_instance(store, new_bucket, dpp);
    }
    bucket_info.reshard_status = cls_rgw_reshard_status::NOT_RESHARDING;
  }

  if (bucket_info.layout.target_index) {
    // a previous reshard failed or stalled, and its reshard lock dropped
    ldpp_dout(dpp, 10) << __func__ << " removing existing target index "
        "objects from a previous reshard attempt" << dendl;
    // delete its existing shard objects (ignore errors)
    store->svc()->bi->clean_index(dpp, bucket_info, *bucket_info.layout.target_index);
    // don't reuse this same generation in the new target layout, in case
    // something is still trying to operate on its shard objects
    target.gen = bucket_info.layout.target_index->gen + 1;
  }

  // create the index shard objects
  int ret = init_target_index(store, bucket_info, target, dpp);
  if (ret < 0) {
    return ret;
  }

  // retry in case of racing writes to the bucket instance metadata;
  // -ECANCELED from put_bucket_instance_info signals such a race
  static constexpr auto max_retries = 10;
  int tries = 0;
  do {
    // update resharding state
    bucket_info.layout.target_index = target;
    bucket_info.layout.resharding = rgw::BucketReshardState::InProgress;

    if (ret = fault.check("set_target_layout");
        ret == 0) { // no fault injected, write the bucket instance metadata
      ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
                                                        real_time(), &bucket_attrs, dpp);
    } else if (ret == -ECANCELED) {
      fault.clear(); // clear the fault so a retry can succeed
    }

    if (ret == -ECANCELED) {
      // racing write detected, read the latest bucket info and try again
      int ret2 = store->getRados()->get_bucket_instance_info(
          bucket_info.bucket, bucket_info,
          nullptr, &bucket_attrs, null_yield, dpp);
      if (ret2 < 0) {
        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
            "bucket info: " << cpp_strerror(ret2) << dendl;
        ret = ret2;
        break;
      }

      // check that we're still in the reshard state we started in; if another
      // reshard got in first, give up (leaving ret = -ECANCELED)
      if (bucket_info.layout.resharding != rgw::BucketReshardState::None ||
          bucket_info.layout.current_index != current) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "another reshard" << dendl;
        break;
      }

      prev = bucket_info.layout; // update the copy
    }
    ++tries;
  } while (ret == -ECANCELED && tries < max_retries);

  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to write "
        "target index layout to bucket info: " << cpp_strerror(ret) << dendl;

    bucket_info.layout = std::move(prev);  // restore in-memory layout

    // delete the target shard objects (ignore errors)
    store->svc()->bi->clean_index(dpp, bucket_info, target);
    return ret;
  }
  return 0;
} // init_target_layout
-
// delete the bucket index shards associated with the target layout and remove
// it from the bucket instance metadata
//
// Returns -ECANCELED if another reshard (or a reshard cancel) raced with us,
// in which case the other party owns the cleanup.
static int revert_target_layout(rgw::sal::RadosStore* store,
                                RGWBucketInfo& bucket_info,
                               std::map<std::string, bufferlist>& bucket_attrs,
                                ReshardFaultInjector& fault,
                                const DoutPrefixProvider* dpp)
{
  auto prev = bucket_info.layout; // make a copy for cleanup

  // remove target index shard objects
  int ret = store->svc()->bi->clean_index(dpp, bucket_info, *prev.target_index);
  if (ret < 0) {
    ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to remove "
        "target index with: " << cpp_strerror(ret) << dendl;
    ret = 0; // non-fatal error
  }

  // retry in case of racing writes to the bucket instance metadata;
  // -ECANCELED from put_bucket_instance_info signals such a race
  static constexpr auto max_retries = 10;
  int tries = 0;
  do {
    // clear target_index and resharding state
    bucket_info.layout.target_index = std::nullopt;
    bucket_info.layout.resharding = rgw::BucketReshardState::None;

    if (ret = fault.check("revert_target_layout");
        ret == 0) { // no fault injected, revert the bucket instance metadata
      ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
                                                        real_time(),
                                                        &bucket_attrs, dpp);
    } else if (ret == -ECANCELED) {
      fault.clear(); // clear the fault so a retry can succeed
    }

    if (ret == -ECANCELED) {
      // racing write detected, read the latest bucket info and try again
      int ret2 = store->getRados()->get_bucket_instance_info(
          bucket_info.bucket, bucket_info,
          nullptr, &bucket_attrs, null_yield, dpp);
      if (ret2 < 0) {
        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
            "bucket info: " << cpp_strerror(ret2) << dendl;
        ret = ret2;
        break;
      }

      // check that we're still in the reshard state we started in
      if (bucket_info.layout.resharding == rgw::BucketReshardState::None) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "reshard cancel" << dendl;
        return -ECANCELED;
      }
      if (bucket_info.layout.current_index != prev.current_index ||
          bucket_info.layout.target_index != prev.target_index) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "another reshard" << dendl;
        return -ECANCELED;
      }

      prev = bucket_info.layout; // update the copy
    }
    ++tries;
  } while (ret == -ECANCELED && tries < max_retries);

  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to clear "
        "target index layout in bucket info: " << cpp_strerror(ret) << dendl;

    bucket_info.layout = std::move(prev);  // restore in-memory layout
    return ret;
  }
  return 0;
} // revert_target_layout
-
-static int init_reshard(rgw::sal::RadosStore* store,
-                        RGWBucketInfo& bucket_info,
-                       std::map<std::string, bufferlist>& bucket_attrs,
-                        ReshardFaultInjector& fault,
-                        uint32_t new_num_shards,
-                        const DoutPrefixProvider *dpp)
-{
-  int ret = init_target_layout(store, bucket_info, bucket_attrs, fault, new_num_shards, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (ret = fault.check("block_writes");
-      ret == 0) { // no fault injected, block writes to the current index shards
-    ret = set_resharding_status(dpp, store, bucket_info,
-                                cls_rgw_reshard_status::IN_PROGRESS);
-  }
-
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to pause "
-        "writes to the current index: " << cpp_strerror(ret) << dendl;
-    // clean up the target layout (ignore errors)
-    revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
-    return ret;
-  }
-  return 0;
-} // init_reshard
-
-static int cancel_reshard(rgw::sal::RadosStore* store,
-                          RGWBucketInfo& bucket_info,
-                         std::map<std::string, bufferlist>& bucket_attrs,
-                          ReshardFaultInjector& fault,
-                          const DoutPrefixProvider *dpp)
-{
-  // unblock writes to the current index shard objects
-  int ret = set_resharding_status(dpp, store, bucket_info,
-                                  cls_rgw_reshard_status::NOT_RESHARDING);
-  if (ret < 0) {
-    ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
-        "writes to current index objects: " << cpp_strerror(ret) << dendl;
-    ret = 0; // non-fatal error
-  }
-
-  if (bucket_info.layout.target_index) {
-    return revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
-  }
-  // there is nothing to revert
-  return 0;
-} // cancel_reshard
-
-static int commit_target_layout(rgw::sal::RadosStore* store,
-                                RGWBucketInfo& bucket_info,
-                                std::map<std::string, bufferlist>& bucket_attrs,
-                                ReshardFaultInjector& fault,
-                                const DoutPrefixProvider *dpp)
-{
-  auto& layout = bucket_info.layout;
-  const auto next_log_gen = layout.logs.empty() ? 1 :
-      layout.logs.back().gen + 1;
-
-  if (!store->svc()->zone->need_to_log_data()) {
-    // if we're not syncing data, we can drop any existing logs
-    layout.logs.clear();
-  }
-
-  // use the new index layout as current
-  ceph_assert(layout.target_index);
-  layout.current_index = std::move(*layout.target_index);
-  layout.target_index = std::nullopt;
-  layout.resharding = rgw::BucketReshardState::None;
-  // add the in-index log layout
-  layout.logs.push_back(log_layout_from_index(next_log_gen, layout.current_index));
-
-  int ret = fault.check("commit_target_layout");
-  if (ret == 0) { // no fault injected, write the bucket instance metadata
-    ret = store->getRados()->put_bucket_instance_info(
-        bucket_info, false, real_time(), &bucket_attrs, dpp);
-  } else if (ret == -ECANCELED) {
-    fault.clear(); // clear the fault so a retry can succeed
-  }
-  return ret;
-} // commit_target_layout
-
// finish a reshard: commit the target layout as current (with retries against
// racing metadata writes), kick data sync for the old index generation, and
// delete the old index shard objects once no bilog generation needs them
static int commit_reshard(rgw::sal::RadosStore* store,
                          RGWBucketInfo& bucket_info,
                         std::map<std::string, bufferlist>& bucket_attrs,
                          ReshardFaultInjector& fault,
                          const DoutPrefixProvider *dpp)
{
  auto prev = bucket_info.layout; // make a copy for cleanup

  // retry in case of racing writes to the bucket instance metadata
  static constexpr auto max_retries = 10;
  int tries = 0;
  int ret = 0;
  do {
    ret = commit_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
    if (ret == -ECANCELED) {
      // racing write detected, read the latest bucket info and try again
      int ret2 = store->getRados()->get_bucket_instance_info(
          bucket_info.bucket, bucket_info,
          nullptr, &bucket_attrs, null_yield, dpp);
      if (ret2 < 0) {
        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
            "bucket info: " << cpp_strerror(ret2) << dendl;
        ret = ret2;
        break;
      }

      // check that we're still in the reshard state we started in
      if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "reshard cancel" << dendl;
        return -ECANCELED; // whatever canceled us already did the cleanup
      }
      if (bucket_info.layout.current_index != prev.current_index ||
          bucket_info.layout.target_index != prev.target_index) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "another reshard" << dendl;
        return -ECANCELED; // whatever canceled us already did the cleanup
      }

      prev = bucket_info.layout; // update the copy
    }
    ++tries;
  } while (ret == -ECANCELED && tries < max_retries);

  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to commit "
        "target index layout: " << cpp_strerror(ret) << dendl;

    bucket_info.layout = std::move(prev); // restore in-memory layout

    // unblock writes to the current index shard objects
    int ret2 = set_resharding_status(dpp, store, bucket_info,
                                     cls_rgw_reshard_status::NOT_RESHARDING);
    if (ret2 < 0) {
      ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
          "writes to current index objects: " << cpp_strerror(ret2) << dendl;
      // non-fatal error
    }
    return ret;
  }

  if (store->svc()->zone->need_to_log_data() && !prev.logs.empty() &&
      prev.current_index.layout.type == rgw::BucketIndexType::Normal) {
    // write a datalog entry for each shard of the previous index. triggering
    // sync on the old shards will force them to detect the end-of-log for that
    // generation, and eventually transition to the next
    // TODO: use a log layout to support types other than BucketLogType::InIndex
    for (uint32_t shard_id = 0; shard_id < prev.current_index.layout.normal.num_shards; ++shard_id) {
      ret = store->svc()->datalog_rados->add_entry(dpp, bucket_info, prev.logs.back(), shard_id);
      if (ret < 0) {
        ldpp_dout(dpp, 1) << "WARNING: failed writing data log (bucket_info.bucket="
        << bucket_info.bucket << ", shard_id=" << shard_id << "of generation="
        << prev.logs.back().gen << ")" << dendl;
      } // datalog error is not fatal
    }
  }

  // check whether the old index objects are still needed for bilogs; an
  // InIndex log layout with the old generation keeps them alive
  const auto& logs = bucket_info.layout.logs;
  auto log = std::find_if(logs.begin(), logs.end(),
      [&prev] (const rgw::bucket_log_layout_generation& log) {
        return log.layout.type == rgw::BucketLogType::InIndex
            && log.layout.in_index.gen == prev.current_index.gen;
      });
  if (log == logs.end()) {
    // delete the index objects (ignore errors)
    store->svc()->bi->clean_index(dpp, bucket_info, prev.current_index);
  }
  return 0;
} // commit_reshard
-
-int RGWBucketReshard::clear_resharding(rgw::sal::RadosStore* store,
-                                       RGWBucketInfo& bucket_info,
-                                      std::map<std::string, bufferlist>& bucket_attrs,
-                                       const DoutPrefixProvider* dpp)
-{
-  ReshardFaultInjector no_fault;
-  return cancel_reshard(store, bucket_info, bucket_attrs, no_fault, dpp);
-}
-
-int RGWBucketReshard::cancel(const DoutPrefixProvider* dpp)
-{
-  int ret = reshard_lock.lock(dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
-    ldpp_dout(dpp, -1) << "ERROR: bucket is not resharding" << dendl;
-    ret = -EINVAL;
-  } else {
-    ret = clear_resharding(store, bucket_info, bucket_attrs, dpp);
-  }
-
-  reshard_lock.unlock();
-  return ret;
-}
-
-RGWBucketReshardLock::RGWBucketReshardLock(rgw::sal::RadosStore* _store,
-                                          const std::string& reshard_lock_oid,
-                                          bool _ephemeral) :
-  store(_store),
-  lock_oid(reshard_lock_oid),
-  ephemeral(_ephemeral),
-  internal_lock(reshard_lock_name)
-{
-  const int lock_dur_secs = store->ctx()->_conf.get_val<uint64_t>(
-    "rgw_reshard_bucket_lock_duration");
-  duration = std::chrono::seconds(lock_dur_secs);
-
-#define COOKIE_LEN 16
-  char cookie_buf[COOKIE_LEN + 1];
-  gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
-  cookie_buf[COOKIE_LEN] = '\0';
-
-  internal_lock.set_cookie(cookie_buf);
-  internal_lock.set_duration(duration);
-}
-
-int RGWBucketReshardLock::lock(const DoutPrefixProvider *dpp) {
-  internal_lock.set_must_renew(false);
-
-  int ret;
-  if (ephemeral) {
-    ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
-                                                lock_oid);
-  } else {
-    ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
-  }
-
-  if (ret == -EBUSY) {
-    ldout(store->ctx(), 0) << "INFO: RGWReshardLock::" << __func__ <<
-      " found lock on " << lock_oid <<
-      " to be held by another RGW process; skipping for now" << dendl;
-    return ret;
-  } else if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: RGWReshardLock::" << __func__ <<
-      " failed to acquire lock on " << lock_oid << ": " <<
-      cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-
-  reset_time(Clock::now());
-
-  return 0;
-}
-
-void RGWBucketReshardLock::unlock() {
-  int ret = internal_lock.unlock(&store->getRados()->reshard_pool_ctx, lock_oid);
-  if (ret < 0) {
-    ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ <<
-      " failed to drop lock on " << lock_oid << " ret=" << ret << dendl;
-  }
-}
-
-int RGWBucketReshardLock::renew(const Clock::time_point& now) {
-  internal_lock.set_must_renew(true);
-  int ret;
-  if (ephemeral) {
-    ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
-                                                lock_oid);
-  } else {
-    ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
-  }
-  if (ret < 0) { /* expired or already locked by another processor */
-    std::stringstream error_s;
-    if (-ENOENT == ret) {
-      error_s << "ENOENT (lock expired or never initially locked)";
-    } else {
-      error_s << ret << " (" << cpp_strerror(-ret) << ")";
-    }
-    ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
-      lock_oid << " with error " << error_s.str() << dendl;
-    return ret;
-  }
-  internal_lock.set_must_renew(false);
-
-  reset_time(now);
-  ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " <<
-    lock_oid << dendl;
-
-  return 0;
-}
-
-
// copy every bucket index entry from the current index shards into the target
// index shards, rehashing each object key to pick its new shard. Renews the
// reshard lock(s) while iterating. Progress is optionally reported to *out
// (and as JSON via *formatter when verbose).
int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& current,
                                 const rgw::bucket_index_layout_generation& target,
                                 int max_entries,
                                bool verbose,
                                ostream *out,
                                Formatter *formatter,
                                 const DoutPrefixProvider *dpp)
{
  if (out) {
    (*out) << "tenant: " << bucket_info.bucket.tenant << std::endl;
    (*out) << "bucket name: " << bucket_info.bucket.name << std::endl;
  }

  /* update bucket info -- in progress*/
  list<rgw_cls_bi_entry> entries;

  // max_entries is the bi_list page size; it must be positive
  if (max_entries < 0) {
    ldpp_dout(dpp, 0) << __func__ <<
      ": can't reshard, negative max_entries" << dendl;
    return -EINVAL;
  }

  BucketReshardManager target_shards_mgr(dpp, store, bucket_info, target);

  bool verbose_json_out = verbose && (formatter != nullptr) && (out != nullptr);

  if (verbose_json_out) {
    formatter->open_array_section("entries");
  }

  uint64_t total_entries = 0;

  if (!verbose_json_out && out) {
    (*out) << "total entries:";
  }

  // iterate over each source shard, paging through its entries with bi_list
  const int num_source_shards = current.layout.normal.num_shards;
  string marker;
  for (int i = 0; i < num_source_shards; ++i) {
    bool is_truncated = true;
    marker.clear();
    const std::string null_object_filter; // empty string since we're not filtering by object
    while (is_truncated) {
      entries.clear();
      int ret = store->getRados()->bi_list(dpp, bucket_info, i, null_object_filter, marker, max_entries, &entries, &is_truncated);
      if (ret < 0 && ret != -ENOENT) {
        derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl;
        return ret;
      }

      for (auto iter = entries.begin(); iter != entries.end(); ++iter) {
       rgw_cls_bi_entry& entry = *iter;
       if (verbose_json_out) {
         formatter->open_object_section("entry");

         encode_json("shard_id", i, formatter);
         encode_json("num_entry", total_entries, formatter);
         encode_json("entry", entry, formatter);
       }
       total_entries++;

       // advance the paging marker to this entry
       marker = entry.idx;

       int target_shard_id;
       cls_rgw_obj_key cls_key;
       RGWObjCategory category;
       rgw_bucket_category_stats stats;
       bool account = entry.get_info(&cls_key, &category, &stats);
       rgw_obj_key key(cls_key);
       if (entry.type == BIIndexType::OLH && key.empty()) {
         // bogus entry created by https://tracker.ceph.com/issues/46456
         // to fix, skip so it doesn't get include in the new bucket instance
         total_entries--;
         ldpp_dout(dpp, 10) << "Dropping entry with empty name, idx=" << marker << dendl;
         continue;
       }
       rgw_obj obj(bucket_info.bucket, key);
       RGWMPObj mp;
       if (key.ns == RGW_OBJ_NS_MULTIPART && mp.from_meta(key.name)) {
         // place the multipart .meta object on the same shard as its head object
         obj.index_hash_source = mp.get_key();
       }
       // rehash the object key against the target shard count
       ret = store->getRados()->get_target_shard_id(bucket_info.layout.target_index->layout.normal,
                                                    obj.get_hash_object(), &target_shard_id);
       if (ret < 0) {
         ldpp_dout(dpp, -1) << "ERROR: get_target_shard_id() returned ret=" << ret << dendl;
         return ret;
       }

       int shard_index = (target_shard_id > 0 ? target_shard_id : 0);

       ret = target_shards_mgr.add_entry(shard_index, entry, account,
                                         category, stats);
       if (ret < 0) {
         return ret;
       }

       // keep the reshard lock(s) alive during a long migration
       Clock::time_point now = Clock::now();
       if (reshard_lock.should_renew(now)) {
         // assume outer locks have timespans at least the size of ours, so
         // can call inside conditional
         if (outer_reshard_lock) {
           ret = outer_reshard_lock->renew(now);
           if (ret < 0) {
             return ret;
           }
         }
         ret = reshard_lock.renew(now);
         if (ret < 0) {
           ldpp_dout(dpp, -1) << "Error renewing bucket lock: " << ret << dendl;
           return ret;
         }
       }
       if (verbose_json_out) {
         formatter->close_section();
         formatter->flush(*out);
       } else if (out && !(total_entries % 1000)) {
         (*out) << " " << total_entries;
       }
      } // entries loop
    }
  }

  if (verbose_json_out) {
    formatter->close_section();
    formatter->flush(*out);
  } else if (out) {
    (*out) << " " << total_entries << std::endl;
  }

  // flush any buffered entries to the target shards
  int ret = target_shards_mgr.finish();
  if (ret < 0) {
    ldpp_dout(dpp, -1) << "ERROR: failed to reshard" << dendl;
    return -EIO;
  }
  return 0;
} // RGWBucketReshard::do_reshard
-
-int RGWBucketReshard::get_status(const DoutPrefixProvider *dpp, list<cls_rgw_bucket_instance_entry> *status)
-{
-  return store->svc()->bi_rados->get_reshard_status(dpp, bucket_info, status);
-}
-
// run a full reshard under the bucket's reshard lock: initialize the target
// layout, migrate the index entries, then commit (or cancel on failure).
// fault points: "do_reshard" plus those inside init/cancel/commit helpers.
int RGWBucketReshard::execute(int num_shards,
                              ReshardFaultInjector& fault,
                              int max_op_entries,
                              const DoutPrefixProvider *dpp,
                              bool verbose, ostream *out,
                              Formatter *formatter,
                              RGWReshard* reshard_log)
{
  // take a reshard lock on the bucket
  int ret = reshard_lock.lock(dpp);
  if (ret < 0) {
    return ret;
  }
  // unlock when scope exits
  auto unlock = make_scope_guard([this] { reshard_lock.unlock(); });

  if (reshard_log) {
    // refresh this bucket's entry in the reshard queue
    ret = reshard_log->update(dpp, bucket_info);
    if (ret < 0) {
      return ret;
    }
  }

  // prepare the target index and add its layout the bucket info
  ret = init_reshard(store, bucket_info, bucket_attrs, fault, num_shards, dpp);
  if (ret < 0) {
    return ret;
  }

  if (ret = fault.check("do_reshard");
      ret == 0) { // no fault injected, do the reshard
    ret = do_reshard(bucket_info.layout.current_index,
                     *bucket_info.layout.target_index,
                     max_op_entries, verbose, out, formatter, dpp);
  }

  if (ret < 0) {
    // roll back: unblock writes and revert the target layout (ignore errors)
    cancel_reshard(store, bucket_info, bucket_attrs, fault, dpp);

    ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
        << bucket_info.bucket.name << "\" canceled due to errors" << dendl;
    return ret;
  }

  ret = commit_reshard(store, bucket_info, bucket_attrs, fault, dpp);
  if (ret < 0) {
    return ret;
  }

  ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
      << bucket_info.bucket.name << "\" completed successfully" << dendl;
  return 0;
} // execute
-
-bool RGWBucketReshard::can_reshard(const RGWBucketInfo& bucket,
-                                   const RGWSI_Zone* zone_svc)
-{
-  return !zone_svc->need_to_log_data() ||
-      bucket.layout.logs.size() < max_bilog_history;
-}
-
-
-RGWReshard::RGWReshard(rgw::sal::RadosStore* _store, bool _verbose, ostream *_out,
-                       Formatter *_formatter) :
-  store(_store), instance_lock(bucket_instance_lock_name),
-  verbose(_verbose), out(_out), formatter(_formatter)
-{
-  num_logshards = store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_num_logs");
-}
-
-string RGWReshard::get_logshard_key(const string& tenant,
-                                   const string& bucket_name)
-{
-  return tenant + ":" + bucket_name;
-}
-
-#define MAX_RESHARD_LOGSHARDS_PRIME 7877
-
-void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid)
-{
-  string key = get_logshard_key(tenant, bucket_name);
-
-  uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size());
-  uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
-  sid = sid2 % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards;
-
-  get_logshard_oid(int(sid), oid);
-}
-
-int RGWReshard::add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
-{
-  if (!store->svc()->zone->can_reshard()) {
-    ldpp_dout(dpp, 20) << __func__ << " Resharding is disabled"  << dendl;
-    return 0;
-  }
-
-  string logshard_oid;
-
-  get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
-
-  librados::ObjectWriteOperation op;
-  cls_rgw_reshard_add(op, entry);
-
-  int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to add entry to reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
-    return ret;
-  }
-  return 0;
-}
-
-int RGWReshard::update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info)
-{
-  cls_rgw_reshard_entry entry;
-  entry.bucket_name = bucket_info.bucket.name;
-  entry.bucket_id = bucket_info.bucket.bucket_id;
-  entry.tenant = bucket_info.owner.tenant;
-
-  int ret = get(dpp, entry);
-  if (ret < 0) {
-    return ret;
-  }
-
-  ret = add(dpp, entry);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << __func__ << ":Error in updating entry bucket " << entry.bucket_name << ": " <<
-      cpp_strerror(-ret) << dendl;
-  }
-
-  return ret;
-}
-
-
-int RGWReshard::list(const DoutPrefixProvider *dpp, int logshard_num, string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated)
-{
-  string logshard_oid;
-
-  get_logshard_oid(logshard_num, &logshard_oid);
-
-  int ret = cls_rgw_reshard_list(store->getRados()->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated);
-
-  if (ret == -ENOENT) {
-    // these shard objects aren't created until we actually write something to
-    // them, so treat ENOENT as a successful empty listing
-    *is_truncated = false;
-    ret = 0;
-  } else if (ret == -EACCES) {
-    ldpp_dout(dpp, -1) << "ERROR: access denied to pool " << store->svc()->zone->get_zone_params().reshard_pool
-                      << ". Fix the pool access permissions of your client" << dendl;
-  } else if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to list reshard log entries, oid="
-        << logshard_oid << " marker=" << marker << " " << cpp_strerror(ret) << dendl;
-  }
-
-  return ret;
-}
-
-int RGWReshard::get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
-{
-  string logshard_oid;
-
-  get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
-
-  int ret = cls_rgw_reshard_get(store->getRados()->reshard_pool_ctx, logshard_oid, entry);
-  if (ret < 0) {
-    if (ret != -ENOENT) {
-      ldpp_dout(dpp, -1) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant <<
-       " bucket=" << entry.bucket_name << dendl;
-    }
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWReshard::remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry)
-{
-  string logshard_oid;
-
-  get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
-
-  librados::ObjectWriteOperation op;
-  cls_rgw_reshard_remove(op, entry);
-
-  int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to remove entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
-    return ret;
-  }
-
-  return ret;
-}
-
-int RGWReshard::clear_bucket_resharding(const DoutPrefixProvider *dpp, const string& bucket_instance_oid, cls_rgw_reshard_entry& entry)
-{
-  int ret = cls_rgw_clear_bucket_resharding(store->getRados()->reshard_pool_ctx, bucket_instance_oid);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to clear bucket resharding, bucket_instance_oid=" << bucket_instance_oid << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
// block the caller for `duration` (or until stop()), giving an in-progress
// reshard time to finish. In coroutine context this suspends on an asio
// timer; otherwise it blocks on the condition variable.
// returns 0 after a full wait, -ECANCELED if stop() was called
int RGWReshardWait::wait(optional_yield y)
{
  std::unique_lock lock(mutex);

  if (going_down) {
    return -ECANCELED;
  }

  if (y) {
    // coroutine context: wait on a timer so the thread isn't blocked
    auto& context = y.get_io_context();
    auto& yield = y.get_yield_context();

    // register the waiter so stop() can cancel the timer, then drop the
    // mutex for the duration of the wait
    Waiter waiter(context);
    waiters.push_back(waiter);
    lock.unlock();

    waiter.timer.expires_after(duration);

    boost::system::error_code ec;
    waiter.timer.async_wait(yield[ec]);

    // reacquire the mutex to unregister; ec is nonzero if the timer was
    // canceled by stop()
    lock.lock();
    waiters.erase(waiters.iterator_to(waiter));
    return -ec.value();
  }

  // synchronous context: sleep on the condvar (stop() calls notify_all)
  cond.wait_for(lock, duration);

  if (going_down) {
    return -ECANCELED;
  }

  return 0;
}
-
-void RGWReshardWait::stop()
-{
-  std::scoped_lock lock(mutex);
-  going_down = true;
-  cond.notify_all();
-  for (auto& waiter : waiters) {
-    // unblock any waiters with ECANCELED
-    waiter.timer.cancel();
-  }
-}
-
// process one reshard-queue entry: load the bucket, execute the reshard, and
// remove the entry from the queue. Entries for vanished or already-resharded
// buckets are dropped from the queue and treated as success.
int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
                              int max_entries, const DoutPrefixProvider *dpp)
{
  ldpp_dout(dpp, 20) << __func__ << " resharding " <<
      entry.bucket_name  << dendl;

  rgw_bucket bucket;
  RGWBucketInfo bucket_info;
  std::map<std::string, bufferlist> bucket_attrs;

  int ret = store->getRados()->get_bucket_info(store->svc(),
                                               entry.tenant,
                                              entry.bucket_name,
                                               bucket_info, nullptr,
                                               null_yield, dpp,
                                              &bucket_attrs);
  // a bucket_id mismatch means the bucket was already resharded (the queue
  // entry refers to an older instance)
  if (ret < 0 || bucket_info.bucket.bucket_id != entry.bucket_id) {
    if (ret < 0) {
      ldpp_dout(dpp, 0) <<  __func__ <<
          ": Error in get_bucket_info for bucket " << entry.bucket_name <<
          ": " << cpp_strerror(-ret) << dendl;
      if (ret != -ENOENT) {
        // any error other than ENOENT will abort
        return ret;
      }
    } else {
      ldpp_dout(dpp, 0) << __func__ <<
          ": Bucket: " << entry.bucket_name <<
          " already resharded by someone, skipping " << dendl;
    }

    // we've encountered a reshard queue entry for an apparently
    // non-existent bucket; let's try to recover by cleaning up
    ldpp_dout(dpp, 0) <<  __func__ <<
        ": removing reshard queue entry for a resharded or non-existent bucket" <<
        entry.bucket_name << dendl;

    ret = remove(dpp, entry);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << __func__ <<
          ": Error removing non-existent bucket " <<
          entry.bucket_name << " from resharding queue: " <<
          cpp_strerror(-ret) << dendl;
      return ret;
    }

    // we cleaned up, move on to the next entry
    return 0;
  }

  // defer (and dequeue) buckets whose bilog history is still being consumed
  // by peer zones
  if (!RGWBucketReshard::can_reshard(bucket_info, store->svc()->zone)) {
    ldpp_dout(dpp, 1) << "Bucket " << bucket_info.bucket << " is not "
        "eligible for resharding until peer zones finish syncing one "
        "or more of its old log generations" << dendl;
    return remove(dpp, entry);
  }

  RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr);

  ReshardFaultInjector f; // no fault injected
  ret = br.execute(entry.new_num_shards, f, max_entries, dpp,
                   false, nullptr, nullptr, this);
  if (ret < 0) {
    ldpp_dout(dpp, 0) <<  __func__ <<
        ": Error during resharding bucket " << entry.bucket_name << ":" <<
        cpp_strerror(-ret)<< dendl;
    return ret;
  }

  ldpp_dout(dpp, 20) << __func__ <<
      " removing reshard queue entry for bucket " << entry.bucket_name <<
      dendl;

  ret = remove(dpp, entry);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << __func__ << ": Error removing bucket " <<
        entry.bucket_name << " from resharding queue: " <<
        cpp_strerror(-ret) << dendl;
    return ret;
  }
  return 0;
}
-
-int RGWReshard::process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp)
-{
-  string marker;
-  bool truncated = true;
-
-  constexpr uint32_t max_entries = 1000;
-
-  string logshard_oid;
-  get_logshard_oid(logshard_num, &logshard_oid);
-
-  RGWBucketReshardLock logshard_lock(store, logshard_oid, false);
-
-  int ret = logshard_lock.lock(dpp);
-  if (ret < 0) { 
-    ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " <<
-      logshard_oid << ", ret = " << ret <<dendl;
-    return ret;
-  }
-  
-  do {
-    std::list<cls_rgw_reshard_entry> entries;
-    ret = list(dpp, logshard_num, marker, max_entries, entries, &truncated);
-    if (ret < 0) {
-      ldpp_dout(dpp, 10) << "cannot list all reshards in logshard oid=" <<
-       logshard_oid << dendl;
-      continue;
-    }
-
-    for(auto& entry: entries) { // logshard entries
-      process_entry(entry, max_entries, dpp);
-      if (ret < 0) {
-        return ret;
-      }
-
-      Clock::time_point now = Clock::now();
-      if (logshard_lock.should_renew(now)) {
-        ret = logshard_lock.renew(now);
-        if (ret < 0) {
-          return ret;
-        }
-      }
-
-      entry.get_key(&marker);
-    } // entry for loop
-  } while (truncated);
-
-  logshard_lock.unlock();
-  return 0;
-}
-
-
-void RGWReshard::get_logshard_oid(int shard_num, string *logshard)
-{
-  char buf[32];
-  snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
-
-  string objname(reshard_oid_prefix);
-  *logshard =  objname + buf;
-}
-
-int RGWReshard::process_all_logshards(const DoutPrefixProvider *dpp)
-{
-  int ret = 0;
-
-  for (int i = 0; i < num_logshards; i++) {
-    string logshard;
-    get_logshard_oid(i, &logshard);
-
-    ldpp_dout(dpp, 20) << "processing logshard = " << logshard << dendl;
-
-    ret = process_single_logshard(i, dpp);
-
-    ldpp_dout(dpp, 20) << "finish processing logshard = " << logshard << " , ret = " << ret << dendl;
-  }
-
-  return 0;
-}
-
-bool RGWReshard::going_down()
-{
-  return down_flag;
-}
-
-void RGWReshard::start_processor()
-{
-  worker = new ReshardWorker(store->ctx(), this);
-  worker->create("rgw_reshard");
-}
-
-void RGWReshard::stop_processor()
-{
-  down_flag = true;
-  if (worker) {
-    worker->stop();
-    worker->join();
-  }
-  delete worker;
-  worker = nullptr;
-}
-
-void *RGWReshard::ReshardWorker::entry() {
-  do {
-    utime_t start = ceph_clock_now();
-    reshard->process_all_logshards(this);
-
-    if (reshard->going_down())
-      break;
-
-    utime_t end = ceph_clock_now();
-    end -= start;
-    int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
-
-    if (secs <= end.sec())
-      continue; // next round
-
-    secs -= end.sec();
-
-    std::unique_lock locker{lock};
-    cond.wait_for(locker, std::chrono::seconds(secs));
-  } while (!reshard->going_down());
-
-  return NULL;
-}
-
-void RGWReshard::ReshardWorker::stop()
-{
-  std::lock_guard l{lock};
-  cond.notify_all();
-}
-
-CephContext *RGWReshard::ReshardWorker::get_cct() const
-{
-  return cct;
-}
-
-unsigned RGWReshard::ReshardWorker::get_subsys() const
-{
-  return dout_subsys;
-}
-
-std::ostream& RGWReshard::ReshardWorker::gen_prefix(std::ostream& out) const
-{
-  return out << "rgw reshard worker thread: ";
-}
diff --git a/src/rgw/rgw_reshard.h b/src/rgw/rgw_reshard.h
deleted file mode 100644 (file)
index d8a8e49..0000000
+++ /dev/null
@@ -1,277 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef RGW_RESHARD_H
-#define RGW_RESHARD_H
-
-#include <vector>
-#include <initializer_list>
-#include <functional>
-#include <iterator>
-#include <algorithm>
-
-#include <boost/intrusive/list.hpp>
-#include <boost/asio/basic_waitable_timer.hpp>
-
-#include "include/common_fwd.h"
-#include "include/rados/librados.hpp"
-#include "common/ceph_time.h"
-#include "common/async/yield_context.h"
-#include "cls/rgw/cls_rgw_types.h"
-#include "cls/lock/cls_lock_client.h"
-
-#include "rgw_common.h"
-#include "common/fault_injector.h"
-
-
-class RGWReshard;
-namespace rgw { namespace sal {
-  class RadosStore;
-} }
-
-using ReshardFaultInjector = FaultInjector<std::string_view>;
-
-class RGWBucketReshardLock {
-  using Clock = ceph::coarse_mono_clock;
-
-  rgw::sal::RadosStore* store;
-  const std::string lock_oid;
-  const bool ephemeral;
-  rados::cls::lock::Lock internal_lock;
-  std::chrono::seconds duration;
-
-  Clock::time_point start_time;
-  Clock::time_point renew_thresh;
-
-  void reset_time(const Clock::time_point& now) {
-    start_time = now;
-    renew_thresh = start_time + duration / 2;
-  }
-
-public:
-  RGWBucketReshardLock(rgw::sal::RadosStore* _store,
-                      const std::string& reshard_lock_oid,
-                      bool _ephemeral);
-  RGWBucketReshardLock(rgw::sal::RadosStore* _store,
-                      const RGWBucketInfo& bucket_info,
-                      bool _ephemeral) :
-    RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral)
-  {}
-
-  int lock(const DoutPrefixProvider *dpp);
-  void unlock();
-  int renew(const Clock::time_point&);
-
-  bool should_renew(const Clock::time_point& now) const {
-    return now >= renew_thresh;
-  }
-}; // class RGWBucketReshardLock
-
-class RGWBucketReshard {
- public:
-  using Clock = ceph::coarse_mono_clock;
-
- private:
-  rgw::sal::RadosStore *store;
-  RGWBucketInfo bucket_info;
-  std::map<std::string, bufferlist> bucket_attrs;
-
-  RGWBucketReshardLock reshard_lock;
-  RGWBucketReshardLock* outer_reshard_lock;
-
-  // using an initializer_list as an array in contiguous memory
-  // allocated in at once
-  static const std::initializer_list<uint16_t> reshard_primes;
-
-  int do_reshard(const rgw::bucket_index_layout_generation& current,
-                 const rgw::bucket_index_layout_generation& target,
-                 int max_entries,
-                 bool verbose,
-                 std::ostream *os,
-                Formatter *formatter,
-                 const DoutPrefixProvider *dpp);
-public:
-
-  // pass nullptr for the final parameter if no outer reshard lock to
-  // manage
-  RGWBucketReshard(rgw::sal::RadosStore* _store,
-                  const RGWBucketInfo& _bucket_info,
-                  const std::map<std::string, bufferlist>& _bucket_attrs,
-                  RGWBucketReshardLock* _outer_reshard_lock);
-  int execute(int num_shards, ReshardFaultInjector& f,
-              int max_op_entries, const DoutPrefixProvider *dpp,
-              bool verbose = false, std::ostream *out = nullptr,
-              ceph::Formatter *formatter = nullptr,
-             RGWReshard *reshard_log = nullptr);
-  int get_status(const DoutPrefixProvider *dpp, std::list<cls_rgw_bucket_instance_entry> *status);
-  int cancel(const DoutPrefixProvider* dpp);
-
-  static int clear_resharding(rgw::sal::RadosStore* store,
-                             RGWBucketInfo& bucket_info,
-                             std::map<std::string, bufferlist>& bucket_attrs,
-                              const DoutPrefixProvider* dpp);
-
-  static uint32_t get_max_prime_shards() {
-    return *std::crbegin(reshard_primes);
-  }
-
-  // returns the prime in our list less than or equal to the
-  // parameter; the lowest value that can be returned is 1
-  static uint32_t get_prime_shards_less_or_equal(uint32_t requested_shards) {
-    auto it = std::upper_bound(reshard_primes.begin(), reshard_primes.end(),
-                              requested_shards);
-    if (it == reshard_primes.begin()) {
-      return 1;
-    } else {
-      return *(--it);
-    }
-  }
-
-  // returns the prime in our list greater than or equal to the
-  // parameter; if we do not have such a prime, 0 is returned
-  static uint32_t get_prime_shards_greater_or_equal(
-    uint32_t requested_shards)
-  {
-    auto it = std::lower_bound(reshard_primes.begin(), reshard_primes.end(),
-                              requested_shards);
-    if (it == reshard_primes.end()) {
-      return 0;
-    } else {
-      return *it;
-    }
-  }
-
-  // returns a preferred number of shards given a calculated number of
-  // shards based on max_dynamic_shards and the list of prime values
-  static uint32_t get_preferred_shards(uint32_t suggested_shards,
-                                      uint32_t max_dynamic_shards) {
-
-    // use a prime if max is within our prime range, otherwise use
-    // specified max
-    const uint32_t absolute_max =
-      max_dynamic_shards >= get_max_prime_shards() ?
-      max_dynamic_shards :
-      get_prime_shards_less_or_equal(max_dynamic_shards);
-
-    // if we can use a prime number, use it, otherwise use suggested;
-    // note get_prime_shards_greater_or_equal will return 0 if no prime in
-    // prime range
-    const uint32_t prime_ish_num_shards =
-      std::max(get_prime_shards_greater_or_equal(suggested_shards),
-              suggested_shards);
-
-    // dynamic sharding cannot reshard more than defined maximum
-    const uint32_t final_num_shards =
-      std::min(prime_ish_num_shards, absolute_max);
-
-    return final_num_shards;
-  }
-
-  const std::map<std::string, bufferlist>& get_bucket_attrs() const {
-    return bucket_attrs;
-  }
-
-  // for multisite, the RGWBucketInfo keeps a history of old log generations
-  // until all peers are done with them. prevent this log history from growing
-  // too large by refusing to reshard the bucket until the old logs get trimmed
-  static constexpr size_t max_bilog_history = 4;
-
-  static bool can_reshard(const RGWBucketInfo& bucket,
-                          const RGWSI_Zone* zone_svc);
-}; // RGWBucketReshard
-
-
-class RGWReshard {
-public:
-    using Clock = ceph::coarse_mono_clock;
-
-private:
-    rgw::sal::RadosStore* store;
-    std::string lock_name;
-    rados::cls::lock::Lock instance_lock;
-    int num_logshards;
-
-    bool verbose;
-    std::ostream *out;
-    Formatter *formatter;
-
-    void get_logshard_oid(int shard_num, std::string *shard);
-protected:
-  class ReshardWorker : public Thread, public DoutPrefixProvider {
-    CephContext *cct;
-    RGWReshard *reshard;
-    ceph::mutex lock = ceph::make_mutex("ReshardWorker");
-    ceph::condition_variable cond;
-
-  public:
-    ReshardWorker(CephContext * const _cct,
-                 RGWReshard * const _reshard)
-      : cct(_cct),
-        reshard(_reshard) {}
-
-    void *entry() override;
-    void stop();
-
-    CephContext *get_cct() const override;
-    unsigned get_subsys() const override;
-    std::ostream& gen_prefix(std::ostream& out) const override;
-  };
-
-  ReshardWorker *worker = nullptr;
-  std::atomic<bool> down_flag = { false };
-
-  std::string get_logshard_key(const std::string& tenant, const std::string& bucket_name);
-  void get_bucket_logshard_oid(const std::string& tenant, const std::string& bucket_name, std::string *oid);
-
-public:
-  RGWReshard(rgw::sal::RadosStore* _store, bool _verbose = false, std::ostream *_out = nullptr, Formatter *_formatter = nullptr);
-  int add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
-  int update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info);
-  int get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
-  int remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry);
-  int list(const DoutPrefixProvider *dpp, int logshard_num, std::string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated);
-  int clear_bucket_resharding(const DoutPrefixProvider *dpp, const std::string& bucket_instance_oid, cls_rgw_reshard_entry& entry);
-
-  /* reshard thread */
-  int process_entry(const cls_rgw_reshard_entry& entry, int max_entries,
-                    const DoutPrefixProvider *dpp);
-  int process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp);
-  int process_all_logshards(const DoutPrefixProvider *dpp);
-  bool going_down();
-  void start_processor();
-  void stop_processor();
-};
-
-class RGWReshardWait {
- public:
-  // the blocking wait uses std::condition_variable::wait_for(), which uses the
-  // std::chrono::steady_clock. use that for the async waits as well
-  using Clock = std::chrono::steady_clock;
- private:
-  const ceph::timespan duration;
-  ceph::mutex mutex = ceph::make_mutex("RGWReshardWait::lock");
-  ceph::condition_variable cond;
-
-  struct Waiter : boost::intrusive::list_base_hook<> {
-    using Executor = boost::asio::io_context::executor_type;
-    using Timer = boost::asio::basic_waitable_timer<Clock,
-          boost::asio::wait_traits<Clock>, Executor>;
-    Timer timer;
-    explicit Waiter(boost::asio::io_context& ioc) : timer(ioc) {}
-  };
-  boost::intrusive::list<Waiter> waiters;
-
-  bool going_down{false};
-
-public:
-  RGWReshardWait(ceph::timespan duration = std::chrono::seconds(5))
-    : duration(duration) {}
-  ~RGWReshardWait() {
-    ceph_assert(going_down);
-  }
-  int wait(optional_yield y);
-  // unblock any threads waiting on reshard
-  void stop();
-};
-
-#endif
index 92e09220cc6a0f22c37ec2fd3ee7598fab431eb7..0428e0a02fca577e06c44e9989b766651b5486dc 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_RESOLVE_H
-#define CEPH_RGW_RESOLVE_H
+#pragma once
 
 #include "rgw_common.h"
 
@@ -23,5 +22,3 @@ public:
 extern void rgw_init_resolver(void);
 extern void rgw_shutdown_resolver(void);
 extern RGWResolver *rgw_resolver;
-
-#endif
diff --git a/src/rgw/rgw_rest_bucket.cc b/src/rgw/rgw_rest_bucket.cc
deleted file mode 100644 (file)
index ebe4e42..0000000
+++ /dev/null
@@ -1,413 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_op.h"
-#include "driver/rados/rgw_bucket.h"
-#include "rgw_rest_bucket.h"
-#include "rgw_sal.h"
-
-#include "include/str_list.h"
-
-#include "services/svc_sys_obj.h"
-#include "services/svc_zone.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-class RGWOp_Bucket_Info : public RGWRESTOp {
-
-public:
-  RGWOp_Bucket_Info() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("buckets", RGW_CAP_READ);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "get_bucket_info"; }
-};
-
-void RGWOp_Bucket_Info::execute(optional_yield y)
-{
-  RGWBucketAdminOpState op_state;
-
-  bool fetch_stats;
-
-  std::string bucket;
-
-  string uid_str;
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_string(s, "bucket", bucket, &bucket);
-  RESTArgs::get_bool(s, "stats", false, &fetch_stats);
-
-  op_state.set_user_id(uid);
-  op_state.set_bucket_name(bucket);
-  op_state.set_fetch_stats(fetch_stats);
-
-  op_ret = RGWBucketAdminOp::info(driver, op_state, flusher, y, this);
-}
-
-class RGWOp_Get_Policy : public RGWRESTOp {
-
-public:
-  RGWOp_Get_Policy() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("buckets", RGW_CAP_READ);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "get_policy"; }
-};
-
-void RGWOp_Get_Policy::execute(optional_yield y)
-{
-  RGWBucketAdminOpState op_state;
-
-  std::string bucket;
-  std::string object;
-
-  RESTArgs::get_string(s, "bucket", bucket, &bucket);
-  RESTArgs::get_string(s, "object", object, &object);
-
-  op_state.set_bucket_name(bucket);
-  op_state.set_object(object);
-
-  op_ret = RGWBucketAdminOp::get_policy(driver, op_state, flusher, this);
-}
-
-class RGWOp_Check_Bucket_Index : public RGWRESTOp {
-
-public:
-  RGWOp_Check_Bucket_Index() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("buckets", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "check_bucket_index"; }
-};
-
-void RGWOp_Check_Bucket_Index::execute(optional_yield y)
-{
-  std::string bucket;
-
-  bool fix_index;
-  bool check_objects;
-
-  RGWBucketAdminOpState op_state;
-
-  RESTArgs::get_string(s, "bucket", bucket, &bucket);
-  RESTArgs::get_bool(s, "fix", false, &fix_index);
-  RESTArgs::get_bool(s, "check-objects", false, &check_objects);
-
-  op_state.set_bucket_name(bucket);
-  op_state.set_fix_index(fix_index);
-  op_state.set_check_objects(check_objects);
-
-  op_ret = RGWBucketAdminOp::check_index(driver, op_state, flusher, s->yield, s);
-}
-
-class RGWOp_Bucket_Link : public RGWRESTOp {
-
-public:
-  RGWOp_Bucket_Link() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("buckets", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "link_bucket"; }
-};
-
-void RGWOp_Bucket_Link::execute(optional_yield y)
-{
-  std::string uid_str;
-  std::string bucket;
-  std::string bucket_id;
-  std::string new_bucket_name;
-
-  RGWBucketAdminOpState op_state;
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  RESTArgs::get_string(s, "bucket", bucket, &bucket);
-  RESTArgs::get_string(s, "bucket-id", bucket_id, &bucket_id);
-  RESTArgs::get_string(s, "new-bucket-name", new_bucket_name, &new_bucket_name);
-
-  rgw_user uid(uid_str);
-  op_state.set_user_id(uid);
-  op_state.set_bucket_name(bucket);
-  op_state.set_bucket_id(bucket_id);
-  op_state.set_new_bucket_name(new_bucket_name);
-
-  bufferlist data;
-  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
-    return;
-  }
-  op_ret = RGWBucketAdminOp::link(driver, op_state, s);
-}
-
-class RGWOp_Bucket_Unlink : public RGWRESTOp {
-
-public:
-  RGWOp_Bucket_Unlink() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("buckets", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "unlink_bucket"; }
-};
-
-void RGWOp_Bucket_Unlink::execute(optional_yield y)
-{
-  std::string uid_str;
-  std::string bucket;
-
-  RGWBucketAdminOpState op_state;
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_string(s, "bucket", bucket, &bucket);
-
-  op_state.set_user_id(uid);
-  op_state.set_bucket_name(bucket);
-
-  bufferlist data;
-  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
-    return;
-  }
-  op_ret = RGWBucketAdminOp::unlink(driver, op_state, s);
-}
-
-class RGWOp_Bucket_Remove : public RGWRESTOp {
-
-public:
-  RGWOp_Bucket_Remove() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("buckets", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "remove_bucket"; }
-};
-
-void RGWOp_Bucket_Remove::execute(optional_yield y)
-{
-  std::string bucket_name;
-  bool delete_children;
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-
-  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
-  RESTArgs::get_bool(s, "purge-objects", false, &delete_children);
-
-  /* FIXME We're abusing the owner of the bucket to pass the user, so that it can be forwarded to
-   * the master.  This user is actually the OP caller, not the bucket owner. */
-  op_ret = driver->get_bucket(s, s->user.get(), string(), bucket_name, &bucket, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "get_bucket returned ret=" << op_ret << dendl;
-    if (op_ret == -ENOENT) {
-      op_ret = -ERR_NO_SUCH_BUCKET;
-    }
-    return;
-  }
-
-  op_ret = bucket->remove_bucket(s, delete_children, true, &s->info, s->yield);
-}
-
-class RGWOp_Set_Bucket_Quota : public RGWRESTOp {
-
-public:
-  RGWOp_Set_Bucket_Quota() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("buckets", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "set_bucket_quota"; }
-};
-
-#define QUOTA_INPUT_MAX_LEN 1024
-
-void RGWOp_Set_Bucket_Quota::execute(optional_yield y)
-{
-  bool uid_arg_existed = false;
-  std::string uid_str;
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str, &uid_arg_existed);
-  if (! uid_arg_existed) {
-    op_ret = -EINVAL;
-    return;
-  }
-  rgw_user uid(uid_str);
-  bool bucket_arg_existed = false;
-  std::string bucket_name;
-  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name, &bucket_arg_existed);
-  if (! bucket_arg_existed) {
-    op_ret = -EINVAL;
-    return;
-  }
-
-  bool use_http_params;
-
-  if (s->content_length > 0) {
-    use_http_params = false;
-  } else {
-    const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
-    use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
-  }
-  RGWQuotaInfo quota;
-  if (!use_http_params) {
-    bool empty;
-    op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty);
-    if (op_ret < 0) {
-      if (!empty)
-        return;
-      /* was probably chunked input, but no content provided, configure via http params */
-      use_http_params = true;
-    }
-  }
-  if (use_http_params) {
-    std::unique_ptr<rgw::sal::Bucket> bucket;
-    op_ret = driver->get_bucket(s, nullptr, uid.tenant, bucket_name, &bucket, s->yield);
-    if (op_ret < 0) {
-      return;
-    }
-    RGWQuotaInfo *old_quota = &bucket->get_info().quota;
-    int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size);
-    int64_t max_size_kb;
-    bool has_max_size_kb = false;
-    RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects);
-    RESTArgs::get_int64(s, "max-size", old_quota->max_size, &quota.max_size);
-    RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb, &has_max_size_kb);
-    if (has_max_size_kb)
-      quota.max_size = max_size_kb * 1024;
-    RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled);
-  }
-
-  RGWBucketAdminOpState op_state;
-  op_state.set_user_id(uid);
-  op_state.set_bucket_name(bucket_name);
-  op_state.set_quota(quota);
-
-  op_ret = RGWBucketAdminOp::set_quota(driver, op_state, s);
-}
-
-class RGWOp_Sync_Bucket : public RGWRESTOp {
-
-public:
-  RGWOp_Sync_Bucket() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("buckets", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "sync_bucket"; }
-};
-
-void RGWOp_Sync_Bucket::execute(optional_yield y)
-{
-  std::string bucket;
-  std::string tenant;
-  bool sync_bucket;
-
-  RGWBucketAdminOpState op_state;
-  RESTArgs::get_string(s, "bucket", bucket, &bucket);
-  RESTArgs::get_string(s, "tenant", tenant, &tenant);
-  RESTArgs::get_bool(s, "sync", true, &sync_bucket);
-
-  op_state.set_bucket_name(bucket);
-  op_state.set_tenant(tenant);
-  op_state.set_sync_bucket(sync_bucket);
-
-  op_ret = RGWBucketAdminOp::sync_bucket(driver, op_state, s);
-}
-
-class RGWOp_Object_Remove: public RGWRESTOp {
-
-public:
-  RGWOp_Object_Remove() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("buckets", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "remove_object"; }
-};
-
-void RGWOp_Object_Remove::execute(optional_yield y)
-{
-  std::string bucket;
-  std::string object;
-
-  RGWBucketAdminOpState op_state;
-
-  RESTArgs::get_string(s, "bucket", bucket, &bucket);
-  RESTArgs::get_string(s, "object", object, &object);
-
-  op_state.set_bucket_name(bucket);
-  op_state.set_object(object);
-
-  op_ret = RGWBucketAdminOp::remove_object(driver, op_state, s);
-}
-
-
-RGWOp *RGWHandler_Bucket::op_get()
-{
-
-  if (s->info.args.sub_resource_exists("policy"))
-    return new RGWOp_Get_Policy;
-
-  if (s->info.args.sub_resource_exists("index"))
-    return new RGWOp_Check_Bucket_Index;
-
-  return new RGWOp_Bucket_Info;
-}
-
-RGWOp *RGWHandler_Bucket::op_put()
-{
-  if (s->info.args.sub_resource_exists("quota"))
-    return new RGWOp_Set_Bucket_Quota;
-
-  if (s->info.args.sub_resource_exists("sync"))
-    return new RGWOp_Sync_Bucket;
-  
-  return new RGWOp_Bucket_Link;
-}
-
-RGWOp *RGWHandler_Bucket::op_post()
-{
-  return new RGWOp_Bucket_Unlink;
-}
-
-RGWOp *RGWHandler_Bucket::op_delete()
-{
-  if (s->info.args.sub_resource_exists("object"))
-    return new RGWOp_Object_Remove;
-
-  return new RGWOp_Bucket_Remove;
-}
diff --git a/src/rgw/rgw_rest_bucket.h b/src/rgw/rgw_rest_bucket.h
deleted file mode 100644 (file)
index 00f0b64..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#pragma once
-
-#include "rgw_rest.h"
-#include "rgw_rest_s3.h"
-
-
-class RGWHandler_Bucket : public RGWHandler_Auth_S3 {
-protected:
-  RGWOp *op_get() override;
-  RGWOp *op_put() override;
-  RGWOp *op_post() override;
-  RGWOp *op_delete() override;
-public:
-  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
-  ~RGWHandler_Bucket() override = default;
-
-  int read_permissions(RGWOp*, optional_yield y) override {
-    return 0;
-  }
-};
-
-class RGWRESTMgr_Bucket : public RGWRESTMgr {
-public:
-  RGWRESTMgr_Bucket() = default;
-  ~RGWRESTMgr_Bucket() override = default;
-
-  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
-                              req_state*,
-                               const rgw::auth::StrategyRegistry& auth_registry,
-                               const std::string&) override {
-    return new RGWHandler_Bucket(auth_registry);
-  }
-};
diff --git a/src/rgw/rgw_rest_log.cc b/src/rgw/rgw_rest_log.cc
deleted file mode 100644 (file)
index 3563cf0..0000000
+++ /dev/null
@@ -1,1267 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "common/ceph_json.h"
-#include "common/strtol.h"
-#include "rgw_rest.h"
-#include "rgw_op.h"
-#include "rgw_rest_s3.h"
-#include "rgw_rest_log.h"
-#include "rgw_client_io.h"
-#include "rgw_sync.h"
-#include "rgw_data_sync.h"
-#include "rgw_common.h"
-#include "rgw_zone.h"
-#include "rgw_mdlog.h"
-#include "rgw_datalog_notify.h"
-#include "rgw_trim_bilog.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_mdlog.h"
-#include "services/svc_bilog_rados.h"
-
-#include "common/errno.h"
-#include "include/ceph_assert.h"
-
-#define dout_context g_ceph_context
-#define LOG_CLASS_LIST_MAX_ENTRIES (1000)
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-void RGWOp_MDLog_List::execute(optional_yield y) {
-  string   period = s->info.args.get("period");
-  string   shard = s->info.args.get("id");
-  string   max_entries_str = s->info.args.get("max-entries");
-  string   marker = s->info.args.get("marker"),
-           err;
-  void    *handle;
-  unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
-
-  if (s->info.args.exists("start-time") ||
-      s->info.args.exists("end-time")) {
-    ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
-  if (!err.empty()) {
-    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  if (!max_entries_str.empty()) {
-    max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
-    if (!err.empty()) {
-      ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-    if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
-      max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
-    }
-  }
-
-  if (period.empty()) {
-    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
-    period = driver->get_zone()->get_current_period_id();
-    if (period.empty()) {
-      ldpp_dout(this, 5) << "Missing period id" << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-  }
-
-  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
-
-  meta_log.init_list_entries(shard_id, {}, {}, marker, &handle);
-
-  op_ret = meta_log.list_entries(this, handle, max_entries, entries,
-                                   &last_marker, &truncated);
-
-  meta_log.complete_list_entries(handle);
-}
-
-void RGWOp_MDLog_List::send_response() {
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-  end_header(s);
-
-  if (op_ret < 0)
-    return;
-
-  s->formatter->open_object_section("log_entries");
-  s->formatter->dump_string("marker", last_marker);
-  s->formatter->dump_bool("truncated", truncated);
-  {
-    s->formatter->open_array_section("entries");
-    for (list<cls_log_entry>::iterator iter = entries.begin();
-        iter != entries.end(); ++iter) {
-      cls_log_entry& entry = *iter;
-      static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->dump_log_entry(entry, s->formatter);
-      flusher.flush();
-    }
-    s->formatter->close_section();
-  }
-  s->formatter->close_section();
-  flusher.flush();
-}
-
-void RGWOp_MDLog_Info::execute(optional_yield y) {
-  num_objects = s->cct->_conf->rgw_md_log_max_shards;
-  period = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->read_oldest_log_period(y, s);
-  op_ret = period.get_error();
-}
-
-void RGWOp_MDLog_Info::send_response() {
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-  end_header(s);
-
-  s->formatter->open_object_section("mdlog");
-  s->formatter->dump_unsigned("num_objects", num_objects);
-  if (period) {
-    s->formatter->dump_string("period", period.get_period().get_id());
-    s->formatter->dump_unsigned("realm_epoch", period.get_epoch());
-  }
-  s->formatter->close_section();
-  flusher.flush();
-}
-
-void RGWOp_MDLog_ShardInfo::execute(optional_yield y) {
-  string period = s->info.args.get("period");
-  string shard = s->info.args.get("id");
-  string err;
-
-  unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
-  if (!err.empty()) {
-    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  if (period.empty()) {
-    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
-    period = driver->get_zone()->get_current_period_id();
-
-    if (period.empty()) {
-      ldpp_dout(this, 5) << "Missing period id" << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-  }
-  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
-
-  op_ret = meta_log.get_info(this, shard_id, &info);
-}
-
-void RGWOp_MDLog_ShardInfo::send_response() {
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-  end_header(s);
-
-  encode_json("info", info, s->formatter);
-  flusher.flush();
-}
-
-void RGWOp_MDLog_Delete::execute(optional_yield y) {
-  string   marker = s->info.args.get("marker"),
-           period = s->info.args.get("period"),
-           shard = s->info.args.get("id"),
-           err;
-  unsigned shard_id;
-
-
-  if (s->info.args.exists("start-time") ||
-      s->info.args.exists("end-time")) {
-    ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
-    op_ret = -EINVAL;
-  }
-
-  if (s->info.args.exists("start-marker")) {
-    ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
-    op_ret = -EINVAL;
-  }
-
-  if (s->info.args.exists("end-marker")) {
-    if (!s->info.args.exists("marker")) {
-      marker = s->info.args.get("end-marker");
-    } else {
-      ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
-      op_ret = -EINVAL;
-    }
-  }
-
-  op_ret = 0;
-
-  shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
-  if (!err.empty()) {
-    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  if (marker.empty()) { /* bounding end */
-    op_ret = -EINVAL;
-    return;
-  }
-
-  if (period.empty()) {
-    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
-    period = driver->get_zone()->get_current_period_id();
-
-    if (period.empty()) {
-      ldpp_dout(this, 5) << "Missing period id" << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-  }
-  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
-
-  op_ret = meta_log.trim(this, shard_id, {}, {}, {}, marker);
-}
-
-void RGWOp_MDLog_Lock::execute(optional_yield y) {
-  string period, shard_id_str, duration_str, locker_id, zone_id;
-  unsigned shard_id;
-
-  op_ret = 0;
-
-  period       = s->info.args.get("period");
-  shard_id_str = s->info.args.get("id");
-  duration_str = s->info.args.get("length");
-  locker_id    = s->info.args.get("locker-id");
-  zone_id      = s->info.args.get("zone-id");
-
-  if (period.empty()) {
-    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
-    period = driver->get_zone()->get_current_period_id();
-  }
-
-  if (period.empty() ||
-      shard_id_str.empty() ||
-      (duration_str.empty()) ||
-      locker_id.empty() ||
-      zone_id.empty()) {
-    ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  string err;
-  shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
-  if (!err.empty()) {
-    ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
-  unsigned dur;
-  dur = (unsigned)strict_strtol(duration_str.c_str(), 10, &err);
-  if (!err.empty() || dur <= 0) {
-    ldpp_dout(this, 5) << "invalid length param " << duration_str << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-  op_ret = meta_log.lock_exclusive(s, shard_id, make_timespan(dur), zone_id,
-                                    locker_id);
-  if (op_ret == -EBUSY)
-    op_ret = -ERR_LOCKED;
-}
-
-void RGWOp_MDLog_Unlock::execute(optional_yield y) {
-  string period, shard_id_str, locker_id, zone_id;
-  unsigned shard_id;
-
-  op_ret = 0;
-
-  period       = s->info.args.get("period");
-  shard_id_str = s->info.args.get("id");
-  locker_id    = s->info.args.get("locker-id");
-  zone_id      = s->info.args.get("zone-id");
-
-  if (period.empty()) {
-    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
-    period = driver->get_zone()->get_current_period_id();
-  }
-
-  if (period.empty() ||
-      shard_id_str.empty() ||
-      locker_id.empty() ||
-      zone_id.empty()) {
-    ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  string err;
-  shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
-  if (!err.empty()) {
-    ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
-  op_ret = meta_log.unlock(s, shard_id, zone_id, locker_id);
-}
-
-void RGWOp_MDLog_Notify::execute(optional_yield y) {
-#define LARGE_ENOUGH_BUF (128 * 1024)
-
-  int r = 0;
-  bufferlist data;
-  std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF);
-  if (r < 0) {
-    op_ret = r;
-    return;
-  }
-
-  char* buf = data.c_str();
-  ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
-
-  JSONParser p;
-  r = p.parse(buf, data.length());
-  if (r < 0) {
-    ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
-    op_ret = r;
-    return;
-  }
-
-  set<int> updated_shards;
-  try {
-    decode_json_obj(updated_shards, &p);
-  } catch (JSONDecoder::err& err) {
-    ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
-    for (set<int>::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
-      ldpp_dout(this, 20) << __func__ << "(): updated shard=" << *iter << dendl;
-    }
-  }
-
-  driver->wakeup_meta_sync_shards(updated_shards);
-
-  op_ret = 0;
-}
-
-void RGWOp_BILog_List::execute(optional_yield y) {
-  bool gen_specified = false;
-  string tenant_name = s->info.args.get("tenant"),
-         bucket_name = s->info.args.get("bucket"),
-         marker = s->info.args.get("marker"),
-         max_entries_str = s->info.args.get("max-entries"),
-         bucket_instance = s->info.args.get("bucket-instance"),
-         gen_str = s->info.args.get("generation", &gen_specified),
-         format_version_str = s->info.args.get("format-ver");
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
-
-  unsigned max_entries;
-
-  if (bucket_name.empty() && bucket_instance.empty()) {
-    ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  string err;
-  std::optional<uint64_t> gen;
-  if (gen_specified) {
-    gen = strict_strtoll(gen_str.c_str(), 10, &err);
-    if (!err.empty()) {
-      ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-  }
-
-  if (!format_version_str.empty()) {
-    format_ver = strict_strtoll(format_version_str.c_str(), 10, &err);
-    if (!err.empty()) {
-      ldpp_dout(s, 5) << "Failed to parse format-ver param: " << format_ver << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-  }
-
-  int shard_id;
-  string bn;
-  op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
-  if (op_ret < 0) {
-    return;
-  }
-
-  if (!bucket_instance.empty()) {
-    b.name = bn;
-    b.bucket_id = bucket_instance;
-  }
-  op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
-    return;
-  }
-
-  const auto& logs = bucket->get_info().layout.logs;
-  if (logs.empty()) {
-    ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
-    op_ret = -ENOENT;
-    return;
-  }
-
-  auto log = std::prev(logs.end());
-  if (gen) {
-    log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen));
-    if (log == logs.end()) {
-      ldpp_dout(s, 5) << "ERROR: no log layout with gen=" << *gen << dendl;
-      op_ret = -ENOENT;
-      return;
-    }
-  }
-  if (auto next = std::next(log); next != logs.end()) {
-    next_log_layout = *next;   // get the next log after the current latest
-  }
-  auto& log_layout = *log; // current log layout for log listing
-
-  unsigned count = 0;
-
-
-  max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
-  if (!err.empty())
-    max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
-
-  send_response();
-  do {
-    list<rgw_bi_log_entry> entries;
-    int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->bilog_rados->log_list(s, bucket->get_info(), log_layout, shard_id,
-                                               marker, max_entries - count,
-                                               entries, &truncated);
-    if (ret < 0) {
-      ldpp_dout(this, 5) << "ERROR: list_bi_log_entries()" << dendl;
-      return;
-    }
-
-    count += entries.size();
-
-    send_response(entries, marker);
-  } while (truncated && count < max_entries);
-
-  send_response_end();
-}
-
-void RGWOp_BILog_List::send_response() {
-  if (sent_header)
-    return;
-
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-  end_header(s);
-
-  sent_header = true;
-
-  if (op_ret < 0)
-    return;
-
-  if (format_ver >= 2) {
-    s->formatter->open_object_section("result");
-  }
-
-  s->formatter->open_array_section("entries");
-}
-
-void RGWOp_BILog_List::send_response(list<rgw_bi_log_entry>& entries, string& marker)
-{
-  for (list<rgw_bi_log_entry>::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
-    rgw_bi_log_entry& entry = *iter;
-    encode_json("entry", entry, s->formatter);
-
-    marker = entry.id;
-    flusher.flush();
-  }
-}
-
-void RGWOp_BILog_List::send_response_end() {
-  s->formatter->close_section();
-
-  if (format_ver >= 2) {
-    encode_json("truncated", truncated, s->formatter);
-
-    if (next_log_layout) {
-      s->formatter->open_object_section("next_log");
-      encode_json("generation", next_log_layout->gen, s->formatter);
-      encode_json("num_shards", next_log_layout->layout.in_index.layout.num_shards, s->formatter);
-      s->formatter->close_section(); // next_log
-    }
-
-    s->formatter->close_section(); // result
-  }
-
-  flusher.flush();
-}
-
-void RGWOp_BILog_Info::execute(optional_yield y) {
-  string tenant_name = s->info.args.get("tenant"),
-         bucket_name = s->info.args.get("bucket"),
-         bucket_instance = s->info.args.get("bucket-instance");
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
-
-  if (bucket_name.empty() && bucket_instance.empty()) {
-    ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  int shard_id;
-  string bn;
-  op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
-  if (op_ret < 0) {
-    return;
-  }
-
-  if (!bucket_instance.empty()) {
-    b.name = bn;
-    b.bucket_id = bucket_instance;
-  }
-  op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
-    return;
-  }
-
-  const auto& logs = bucket->get_info().layout.logs;
-  if (logs.empty()) {
-    ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
-    op_ret = -ENOENT;
-    return;
-  }
-
-  map<RGWObjCategory, RGWStorageStats> stats;
-  const auto& index = log_to_index_layout(logs.back());
-
-  int ret =  bucket->read_stats(s, index, shard_id, &bucket_ver, &master_ver, stats, &max_marker, &syncstopped);
-  if (ret < 0 && ret != -ENOENT) {
-    op_ret = ret;
-    return;
-  }
-
-  oldest_gen = logs.front().gen;
-  latest_gen = logs.back().gen;
-
-  for (auto& log : logs) {
-      uint32_t num_shards = log.layout.in_index.layout.num_shards;
-      generations.push_back({log.gen, num_shards});
-  }
-}
-
-void RGWOp_BILog_Info::send_response() {
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-  end_header(s);
-
-  if (op_ret < 0)
-    return;
-
-  s->formatter->open_object_section("info");
-  encode_json("bucket_ver", bucket_ver, s->formatter);
-  encode_json("master_ver", master_ver, s->formatter);
-  encode_json("max_marker", max_marker, s->formatter);
-  encode_json("syncstopped", syncstopped, s->formatter);
-  encode_json("oldest_gen", oldest_gen, s->formatter);
-  encode_json("latest_gen", latest_gen, s->formatter);
-  encode_json("generations", generations, s->formatter);
-  s->formatter->close_section();
-
-  flusher.flush();
-}
-
-void RGWOp_BILog_Delete::execute(optional_yield y) {
-  bool gen_specified = false;
-  string tenant_name = s->info.args.get("tenant"),
-         bucket_name = s->info.args.get("bucket"),
-         start_marker = s->info.args.get("start-marker"),
-         end_marker = s->info.args.get("end-marker"),
-         bucket_instance = s->info.args.get("bucket-instance"),
-        gen_str = s->info.args.get("generation", &gen_specified);
-
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
-
-  op_ret = 0;
-  if ((bucket_name.empty() && bucket_instance.empty()) ||
-      end_marker.empty()) {
-    ldpp_dout(this, 5) << "ERROR: one of bucket or bucket instance, and also end-marker is mandatory" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  string err;
-  uint64_t gen = 0;
-  if (gen_specified) {
-    gen = strict_strtoll(gen_str.c_str(), 10, &err);
-    if (!err.empty()) {
-      ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-  }
-
-  int shard_id;
-  string bn;
-  op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
-  if (op_ret < 0) {
-    return;
-  }
-
-  if (!bucket_instance.empty()) {
-    b.name = bn;
-    b.bucket_id = bucket_instance;
-  }
-  op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
-    return;
-  }
-
-  op_ret = bilog_trim(this, static_cast<rgw::sal::RadosStore*>(driver),
-                     bucket->get_info(), gen, shard_id,
-                     start_marker, end_marker);
-  if (op_ret < 0) {
-    ldpp_dout(s, 5) << "bilog_trim failed with op_ret=" << op_ret << dendl;
-  }
-
-  return;
-}
-
-void RGWOp_DATALog_List::execute(optional_yield y) {
-  string   shard = s->info.args.get("id");
-
-  string   max_entries_str = s->info.args.get("max-entries"),
-           marker = s->info.args.get("marker"),
-           err;
-  unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
-
-  if (s->info.args.exists("start-time") ||
-      s->info.args.exists("end-time")) {
-    ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
-    op_ret = -EINVAL;
-  }
-
-  s->info.args.get_bool("extra-info", &extra_info, false);
-
-  shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
-  if (!err.empty()) {
-    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  if (!max_entries_str.empty()) {
-    max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
-    if (!err.empty()) {
-      ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-    if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
-      max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
-    }
-  }
-
-  // Note that last_marker is updated to be the marker of the last
-  // entry listed
-  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->list_entries(this, shard_id,
-                                                    max_entries, entries,
-                                                    marker, &last_marker,
-                                                    &truncated);
-}
-
-void RGWOp_DATALog_List::send_response() {
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-  end_header(s);
-
-  if (op_ret < 0)
-    return;
-
-  s->formatter->open_object_section("log_entries");
-  s->formatter->dump_string("marker", last_marker);
-  s->formatter->dump_bool("truncated", truncated);
-  {
-    s->formatter->open_array_section("entries");
-    for (const auto& entry : entries) {
-      if (!extra_info) {
-        encode_json("entry", entry.entry, s->formatter);
-      } else {
-        encode_json("entry", entry, s->formatter);
-      }
-      flusher.flush();
-    }
-    s->formatter->close_section();
-  }
-  s->formatter->close_section();
-  flusher.flush();
-}
-
-
-void RGWOp_DATALog_Info::execute(optional_yield y) {
-  num_objects = s->cct->_conf->rgw_data_log_num_shards;
-  op_ret = 0;
-}
-
-void RGWOp_DATALog_Info::send_response() {
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-  end_header(s);
-
-  s->formatter->open_object_section("num_objects");
-  s->formatter->dump_unsigned("num_objects", num_objects);
-  s->formatter->close_section();
-  flusher.flush();
-}
-
-void RGWOp_DATALog_ShardInfo::execute(optional_yield y) {
-  string shard = s->info.args.get("id");
-  string err;
-
-  unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
-  if (!err.empty()) {
-    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->get_info(this, shard_id, &info);
-}
-
-void RGWOp_DATALog_ShardInfo::send_response() {
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-  end_header(s);
-
-  encode_json("info", info, s->formatter);
-  flusher.flush();
-}
-
-void RGWOp_DATALog_Notify::execute(optional_yield y) {
-  string  source_zone = s->info.args.get("source-zone");
-#define LARGE_ENOUGH_BUF (128 * 1024)
-
-  int r = 0;
-  bufferlist data;
-  std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF);
-  if (r < 0) {
-    op_ret = r;
-    return;
-  }
-
-  char* buf = data.c_str();
-  ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
-
-  JSONParser p;
-  r = p.parse(buf, data.length());
-  if (r < 0) {
-    ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
-    op_ret = r;
-    return;
-  }
-
-  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>> updated_shards;
-  try {
-    auto decoder = rgw_data_notify_v1_decoder{updated_shards};
-    decode_json_obj(decoder, &p);
-  } catch (JSONDecoder::err& err) {
-    ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
-    for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
-      ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
-      bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
-      for (const auto& [key, gen] : entries) {
-        ldpp_dout(this, 20) << __func__ << "(): modified key=" << key
-        << " of gen=" << gen << dendl;
-      }
-    }
-  }
-
-  driver->wakeup_data_sync_shards(this, source_zone, updated_shards);
-
-  op_ret = 0;
-}
-
-void RGWOp_DATALog_Notify2::execute(optional_yield y) {
-  string  source_zone = s->info.args.get("source-zone");
-#define LARGE_ENOUGH_BUF (128 * 1024)
-
-  int r = 0;
-  bufferlist data;
-  std::tie(r, data) = rgw_rest_read_all_input(s, LARGE_ENOUGH_BUF);
-  if (r < 0) {
-    op_ret = r;
-    return;
-  }
-
-  char* buf = data.c_str();
-  ldout(s->cct, 20) << __func__ << "(): read data: " << buf << dendl;
-
-  JSONParser p;
-  r = p.parse(buf, data.length());
-  if (r < 0) {
-    ldout(s->cct, 0) << "ERROR: failed to parse JSON" << dendl;
-    op_ret = r;
-    return;
-  }
-
-  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> > updated_shards;
-  try {
-    decode_json_obj(updated_shards, &p);
-  } catch (JSONDecoder::err& err) {
-    ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
-    for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter =
-        updated_shards.begin(); iter != updated_shards.end(); ++iter) {
-      ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
-      bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
-      for (const auto& [key, gen] : entries) {
-        ldpp_dout(this, 20) << __func__ << "(): modified key=" << key <<
-        " of generation=" << gen << dendl;
-      }
-    }
-  }
-
-  driver->wakeup_data_sync_shards(this, source_zone, updated_shards);
-
-  op_ret = 0;
-}
-
-void RGWOp_DATALog_Delete::execute(optional_yield y) {
-  string   marker = s->info.args.get("marker"),
-           shard = s->info.args.get("id"),
-           err;
-  unsigned shard_id;
-
-  op_ret = 0;
-
-  if (s->info.args.exists("start-time") ||
-      s->info.args.exists("end-time")) {
-    ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
-    op_ret = -EINVAL;
-  }
-
-  if (s->info.args.exists("start-marker")) {
-    ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
-    op_ret = -EINVAL;
-  }
-
-  if (s->info.args.exists("end-marker")) {
-    if (!s->info.args.exists("marker")) {
-      marker = s->info.args.get("end-marker");
-    } else {
-      ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
-      op_ret = -EINVAL;
-    }
-  }
-
-  shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
-  if (!err.empty()) {
-    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-  if (marker.empty()) { /* bounding end */
-    op_ret = -EINVAL;
-    return;
-  }
-
-  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->trim_entries(this, shard_id, marker);
-}
-
-// not in header to avoid pulling in rgw_sync.h
-class RGWOp_MDLog_Status : public RGWRESTOp {
-  rgw_meta_sync_status status;
-public:
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("mdlog", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield) override {
-    return check_caps(s->user->get_caps());
-  }
-  void execute(optional_yield y) override;
-  void send_response() override;
-  const char* name() const override { return "get_metadata_log_status"; }
-};
-
-void RGWOp_MDLog_Status::execute(optional_yield y)
-{
-  auto sync = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_meta_sync_manager();
-  if (sync == nullptr) {
-    ldpp_dout(this, 1) << "no sync manager" << dendl;
-    op_ret = -ENOENT;
-    return;
-  }
-  op_ret = sync->read_sync_status(this, &status);
-}
-
-void RGWOp_MDLog_Status::send_response()
-{
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-  end_header(s);
-
-  if (op_ret >= 0) {
-    encode_json("status", status, s->formatter);
-  }
-  flusher.flush();
-}
-
-// not in header to avoid pulling in rgw_data_sync.h
-class RGWOp_BILog_Status : public RGWRESTOp {
-  bilog_status_v2 status;
-  int version = 1;
-public:
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("bilog", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield y) override {
-    return check_caps(s->user->get_caps());
-  }
-  void execute(optional_yield y) override;
-  void send_response() override;
-  const char* name() const override { return "get_bucket_index_log_status"; }
-};
-
-void RGWOp_BILog_Status::execute(optional_yield y)
-{
-  const auto options = s->info.args.get("options");
-  bool merge = (options == "merge");
-  const auto source_zone = s->info.args.get("source-zone");
-  const auto source_key = s->info.args.get("source-bucket");
-  auto key = s->info.args.get("bucket");
-  op_ret = s->info.args.get_int("version", &version, 1);
-
-  if (key.empty()) {
-    key = source_key;
-  }
-  if (key.empty()) {
-    ldpp_dout(this, 4) << "no 'bucket' provided" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  rgw_bucket b;
-  int shard_id{-1}; // unused
-  op_ret = rgw_bucket_parse_bucket_key(s->cct, key, &b, &shard_id);
-  if (op_ret < 0) {
-    ldpp_dout(this, 4) << "invalid 'bucket' provided" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  // read the bucket instance info for num_shards
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 4) << "failed to read bucket info: " << cpp_strerror(op_ret) << dendl;
-    return;
-  }
-
-  rgw_bucket source_bucket;
-
-  if (source_key.empty() ||
-      source_key == key) {
-    source_bucket = bucket->get_key();
-  } else {
-    op_ret = rgw_bucket_parse_bucket_key(s->cct, source_key, &source_bucket, nullptr);
-    if (op_ret < 0) {
-      ldpp_dout(this, 4) << "invalid 'source-bucket' provided (key=" << source_key << ")" << dendl;
-      return;
-    }
-  }
-
-  const auto& local_zone_id = driver->get_zone()->get_id();
-
-  if (!merge) {
-    rgw_sync_bucket_pipe pipe;
-    pipe.source.zone = source_zone;
-    pipe.source.bucket = source_bucket;
-    pipe.dest.zone = local_zone_id;
-    pipe.dest.bucket = bucket->get_key();
-
-    ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;
-
-    op_ret = rgw_read_bucket_full_sync_status(
-      this,
-      static_cast<rgw::sal::RadosStore*>(driver),
-      pipe,
-      &status.sync_status,
-      s->yield);
-    if (op_ret < 0) {
-      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
-      return;
-    }
-    status.inc_status.resize(status.sync_status.shards_done_with_gen.size());
-
-    op_ret = rgw_read_bucket_inc_sync_status(
-      this,
-      static_cast<rgw::sal::RadosStore*>(driver),
-      pipe,
-      status.sync_status.incremental_gen,
-      &status.inc_status);
-    if (op_ret < 0) {
-      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
-    }
-    return;
-  }
-
-  rgw_zone_id source_zone_id(source_zone);
-
-  RGWBucketSyncPolicyHandlerRef source_handler;
-  op_ret = driver->get_sync_policy_handler(s, source_zone_id, source_bucket, &source_handler, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, -1) << "could not get bucket sync policy handler (r=" << op_ret << ")" << dendl;
-    return;
-  }
-
-  auto local_dests = source_handler->get_all_dests_in_zone(local_zone_id);
-
-  std::vector<rgw_bucket_shard_sync_info> current_status;
-  for (auto& entry : local_dests) {
-    auto pipe = entry.second;
-
-    ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;
-
-    RGWBucketInfo *pinfo = &bucket->get_info();
-    std::optional<RGWBucketInfo> opt_dest_info;
-
-    if (!pipe.dest.bucket) {
-      /* Uh oh, something went wrong */
-      ldpp_dout(this, 20) << "ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl;
-      op_ret = -EIO;
-      return;
-    }
-
-    if (*pipe.dest.bucket != pinfo->bucket) {
-      opt_dest_info.emplace();
-      std::unique_ptr<rgw::sal::Bucket> dest_bucket;
-      op_ret = driver->get_bucket(s, nullptr, *pipe.dest.bucket, &dest_bucket, y);
-      if (op_ret < 0) {
-        ldpp_dout(this, 4) << "failed to read target bucket info (bucket=: " << cpp_strerror(op_ret) << dendl;
-        return;
-      }
-
-      *opt_dest_info = dest_bucket->get_info();
-      pinfo = &(*opt_dest_info);
-      pipe.dest.bucket = pinfo->bucket;
-    }
-
-    op_ret = rgw_read_bucket_full_sync_status(
-      this,
-      static_cast<rgw::sal::RadosStore*>(driver),
-      pipe,
-      &status.sync_status,
-      s->yield);
-    if (op_ret < 0) {
-      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
-      return;
-    }
-
-    current_status.resize(status.sync_status.shards_done_with_gen.size());
-    int r = rgw_read_bucket_inc_sync_status(this, static_cast<rgw::sal::RadosStore*>(driver),
-                                           pipe, status.sync_status.incremental_gen, &current_status);
-    if (r < 0) {
-      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << r << dendl;
-      op_ret = r;
-      return;
-    }
-
-    if (status.inc_status.empty()) {
-      status.inc_status = std::move(current_status);
-    } else {
-      if (current_status.size() != status.inc_status.size()) {
-        op_ret = -EINVAL;
-        ldpp_dout(this, -1) << "ERROR: different number of shards for sync status of buckets "
-         "syncing from the same source: status.size()= "
-                           << status.inc_status.size()
-                           << " current_status.size()="
-                           << current_status.size() << dendl;
-       return;
-      }
-      auto m = status.inc_status.begin();
-      for (auto& cur_shard_status : current_status) {
-        auto& result_shard_status = *m++;
-        // always take the first marker, or any later marker that's smaller
-        if (cur_shard_status.inc_marker.position < result_shard_status.inc_marker.position) {
-          result_shard_status = std::move(cur_shard_status);
-        }
-      }
-    }
-  }
-}
-
-void RGWOp_BILog_Status::send_response()
-{
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-  end_header(s);
-
-  if (op_ret >= 0) {
-    if (version < 2) {
-      encode_json("status", status.inc_status, s->formatter);
-    } else {
-      encode_json("status", status, s->formatter);
-    }
-  }
-  flusher.flush();
-}
-
-// not in header to avoid pulling in rgw_data_sync.h
-class RGWOp_DATALog_Status : public RGWRESTOp {
-  rgw_data_sync_status status;
-public:
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("datalog", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield y) override {
-    return check_caps(s->user->get_caps());
-  }
-  void execute(optional_yield y) override ;
-  void send_response() override;
-  const char* name() const override { return "get_data_changes_log_status"; }
-};
-
-void RGWOp_DATALog_Status::execute(optional_yield y)
-{
-  const auto source_zone = s->info.args.get("source-zone");
-  auto sync = driver->get_data_sync_manager(source_zone);
-  if (sync == nullptr) {
-    ldpp_dout(this, 1) << "no sync manager for source-zone " << source_zone << dendl;
-    op_ret = -ENOENT;
-    return;
-  }
-  op_ret = sync->read_sync_status(this, &status);
-}
-
-void RGWOp_DATALog_Status::send_response()
-{
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-  end_header(s);
-
-  if (op_ret >= 0) {
-    encode_json("status", status, s->formatter);
-  }
-  flusher.flush();
-}
-
-
-RGWOp *RGWHandler_Log::op_get() {
-  bool exists;
-  string type = s->info.args.get("type", &exists);
-
-  if (!exists) {
-    return NULL;
-  }
-
-  if (type.compare("metadata") == 0) {
-    if (s->info.args.exists("id")) {
-      if (s->info.args.exists("info")) {
-        return new RGWOp_MDLog_ShardInfo;
-      } else {
-        return new RGWOp_MDLog_List;
-      }
-    } else if (s->info.args.exists("status")) {
-      return new RGWOp_MDLog_Status;
-    } else {
-      return new RGWOp_MDLog_Info;
-    }
-  } else if (type.compare("bucket-index") == 0) {
-    if (s->info.args.exists("info")) {
-      return new RGWOp_BILog_Info;
-    } else if (s->info.args.exists("status")) {
-      return new RGWOp_BILog_Status;
-    } else {
-      return new RGWOp_BILog_List;
-    }
-  } else if (type.compare("data") == 0) {
-    if (s->info.args.exists("id")) {
-      if (s->info.args.exists("info")) {
-        return new RGWOp_DATALog_ShardInfo;
-      } else {
-        return new RGWOp_DATALog_List;
-      }
-    } else if (s->info.args.exists("status")) {
-      return new RGWOp_DATALog_Status;
-    } else {
-      return new RGWOp_DATALog_Info;
-    }
-  }
-  return NULL;
-}
-
-RGWOp *RGWHandler_Log::op_delete() {
-  bool exists;
-  string type = s->info.args.get("type", &exists);
-
-  if (!exists) {
-    return NULL;
-  }
-
-  if (type.compare("metadata") == 0)
-    return new RGWOp_MDLog_Delete;
-  else if (type.compare("bucket-index") == 0) 
-    return new RGWOp_BILog_Delete;
-  else if (type.compare("data") == 0)
-    return new RGWOp_DATALog_Delete;
-  return NULL;
-}
-
-RGWOp *RGWHandler_Log::op_post() {
-  bool exists;
-  string type = s->info.args.get("type", &exists);
-
-  if (!exists) {
-    return NULL;
-  }
-
-  if (type.compare("metadata") == 0) {
-    if (s->info.args.exists("lock"))
-      return new RGWOp_MDLog_Lock;
-    else if (s->info.args.exists("unlock"))
-      return new RGWOp_MDLog_Unlock;
-    else if (s->info.args.exists("notify"))
-      return new RGWOp_MDLog_Notify;
-  } else if (type.compare("data") == 0) {
-    if (s->info.args.exists("notify")) {
-      return new RGWOp_DATALog_Notify;
-    } else if (s->info.args.exists("notify2")) {
-      return new RGWOp_DATALog_Notify2;
-    }
-  }
-  return NULL;
-}
-
diff --git a/src/rgw/rgw_rest_log.h b/src/rgw/rgw_rest_log.h
deleted file mode 100644 (file)
index c8a0c4d..0000000
+++ /dev/null
@@ -1,337 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include "rgw_datalog.h"
-#include "rgw_rest.h"
-#include "rgw_rest_s3.h"
-#include "rgw_metadata.h"
-#include "rgw_mdlog.h"
-#include "rgw_data_sync.h"
-
-class RGWOp_BILog_List : public RGWRESTOp {
-  bool sent_header;
-  uint32_t format_ver{0};
-  bool truncated{false};
-  std::optional<rgw::bucket_log_layout_generation> next_log_layout;
-
-public:
-  RGWOp_BILog_List() : sent_header(false) {}
-  ~RGWOp_BILog_List() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("bilog", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield y) override {
-    return check_caps(s->user->get_caps());
-  }
-  void send_response() override;
-  virtual void send_response(std::list<rgw_bi_log_entry>& entries, std::string& marker);
-  virtual void send_response_end();
-  void execute(optional_yield y) override;
-  const char* name() const override {
-    return "list_bucket_index_log";
-  }
-};
-
-class RGWOp_BILog_Info : public RGWRESTOp {
-  std::string bucket_ver;
-  std::string master_ver;
-  std::string max_marker;
-  bool syncstopped;
-  uint64_t oldest_gen = 0;
-  uint64_t latest_gen = 0;
-  std::vector<store_gen_shards> generations;
-
-public:
-  RGWOp_BILog_Info() : bucket_ver(), master_ver(), syncstopped(false) {}
-  ~RGWOp_BILog_Info() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("bilog", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield y) override {
-    return check_caps(s->user->get_caps());
-  }
-  void send_response() override;
-  void execute(optional_yield y) override;
-  const char* name() const override {
-    return "bucket_index_log_info";
-  }
-};
-
-class RGWOp_BILog_Delete : public RGWRESTOp {
-public:
-  RGWOp_BILog_Delete() {}
-  ~RGWOp_BILog_Delete() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("bilog", RGW_CAP_WRITE);
-  }
-  void execute(optional_yield y) override;
-  const char* name() const override {
-    return "trim_bucket_index_log";
-  }
-};
-
-class RGWOp_MDLog_List : public RGWRESTOp {
-  std::list<cls_log_entry> entries;
-  std::string last_marker;
-  bool truncated;
-public:
-  RGWOp_MDLog_List() : truncated(false) {}
-  ~RGWOp_MDLog_List() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("mdlog", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield y) override {
-    return check_caps(s->user->get_caps());
-  }
-  void execute(optional_yield y) override;
-  void send_response() override;
-  const char* name() const override {
-    return "list_metadata_log";
-  }
-};
-
-class RGWOp_MDLog_Info : public RGWRESTOp {
-  unsigned num_objects;
-  RGWPeriodHistory::Cursor period;
-public:
-  RGWOp_MDLog_Info() : num_objects(0) {}
-  ~RGWOp_MDLog_Info() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("mdlog", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield y) override {
-    return check_caps(s->user->get_caps());
-  }
-  void execute(optional_yield y) override;
-  void send_response() override;
-  const char* name() const override {
-    return "get_metadata_log_info";
-  }
-};
-
-class RGWOp_MDLog_ShardInfo : public RGWRESTOp {
-  RGWMetadataLogInfo info;
-public:
-  RGWOp_MDLog_ShardInfo() {}
-  ~RGWOp_MDLog_ShardInfo() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("mdlog", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield y) override {
-    return check_caps(s->user->get_caps());
-  }
-  void execute(optional_yield y) override;
-  void send_response() override;
-  const char* name() const override {
-    return "get_metadata_log_shard_info";
-  }
-};
-
-class RGWOp_MDLog_Lock : public RGWRESTOp {
-public:
-  RGWOp_MDLog_Lock() {}
-  ~RGWOp_MDLog_Lock() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("mdlog", RGW_CAP_WRITE);
-  }
-  void execute(optional_yield y) override;
-  const char* name() const override {
-    return "lock_mdlog_object";
-  }
-};
-
-class RGWOp_MDLog_Unlock : public RGWRESTOp {
-public:
-  RGWOp_MDLog_Unlock() {}
-  ~RGWOp_MDLog_Unlock() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("mdlog", RGW_CAP_WRITE);
-  }
-  void execute(optional_yield y) override;
-  const char* name() const override {
-    return "unlock_mdlog_object";
-  }
-};
-
-class RGWOp_MDLog_Notify : public RGWRESTOp {
-public:
-  RGWOp_MDLog_Notify() {}
-  ~RGWOp_MDLog_Notify() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("mdlog", RGW_CAP_WRITE);
-  }
-  void execute(optional_yield y) override;
-  const char* name() const override {
-    return "mdlog_notify";
-  }
-  RGWOpType get_type() override { return RGW_OP_SYNC_MDLOG_NOTIFY; }
-};
-
-class RGWOp_MDLog_Delete : public RGWRESTOp {
-public:
-  RGWOp_MDLog_Delete() {}
-  ~RGWOp_MDLog_Delete() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("mdlog", RGW_CAP_WRITE);
-  }
-  void execute(optional_yield y) override;
-  const char* name() const override {
-    return "trim_metadata_log";
-  }
-};
-
-class RGWOp_DATALog_List : public RGWRESTOp {
-  std::vector<rgw_data_change_log_entry> entries;
-  std::string last_marker;
-  bool truncated;
-  bool extra_info;
-public:
-  RGWOp_DATALog_List() : truncated(false), extra_info(false) {}
-  ~RGWOp_DATALog_List() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("datalog", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield y) override {
-    return check_caps(s->user->get_caps());
-  }
-  void execute(optional_yield y) override;
-  void send_response() override;
-  const char* name() const override {
-    return "list_data_changes_log";
-  }
-};
-
-class RGWOp_DATALog_Info : public RGWRESTOp {
-  unsigned num_objects;
-public:
-  RGWOp_DATALog_Info() : num_objects(0) {}
-  ~RGWOp_DATALog_Info() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("datalog", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield y) override {
-    return check_caps(s->user->get_caps());
-  }
-  void execute(optional_yield y) override;
-  void send_response() override;
-  const char* name() const override {
-    return "get_data_changes_log_info";
-  }
-};
-
-class RGWOp_DATALog_ShardInfo : public RGWRESTOp {
-  RGWDataChangesLogInfo info;
-public:
-  RGWOp_DATALog_ShardInfo() {}
-  ~RGWOp_DATALog_ShardInfo() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("datalog", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield y) override {
-    return check_caps(s->user->get_caps());
-  }
-  void execute(optional_yield y) override;
-  void send_response() override;
-  const char* name() const override {
-    return "get_data_changes_log_shard_info";
-  }
-};
-
-class RGWOp_DATALog_Notify : public RGWRESTOp {
-public:
-  RGWOp_DATALog_Notify() {}
-  ~RGWOp_DATALog_Notify() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("datalog", RGW_CAP_WRITE);
-  }
-  void execute(optional_yield y) override;
-  const char* name() const override {
-    return "datalog_notify";
-  }
-  RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY; }
-};
-
-class RGWOp_DATALog_Notify2 : public RGWRESTOp {
-  rgw_data_notify_entry data_notify;
-public:
-  RGWOp_DATALog_Notify2() {}
-  ~RGWOp_DATALog_Notify2() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("datalog", RGW_CAP_WRITE);
-  }
-  void execute(optional_yield y) override;
-  const char* name() const override {
-    return "datalog_notify2";
-  }
-  RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY2; }
-};
-
-class RGWOp_DATALog_Delete : public RGWRESTOp {
-public:
-  RGWOp_DATALog_Delete() {}
-  ~RGWOp_DATALog_Delete() override {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("datalog", RGW_CAP_WRITE);
-  }
-  void execute(optional_yield y) override;
-  const char* name() const override {
-    return "trim_data_changes_log";
-  }
-};
-
-class RGWHandler_Log : public RGWHandler_Auth_S3 {
-protected:
-  RGWOp *op_get() override;
-  RGWOp *op_delete() override;
-  RGWOp *op_post() override;
-
-  int read_permissions(RGWOp*, optional_yield) override {
-    return 0;
-  }
-public:
-  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
-  ~RGWHandler_Log() override = default;
-};
-
-class RGWRESTMgr_Log : public RGWRESTMgr {
-public:
-  RGWRESTMgr_Log() = default;
-  ~RGWRESTMgr_Log() override = default;
-
-  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
-                              req_state* const,
-                               const rgw::auth::StrategyRegistry& auth_registry,
-                               const std::string& frontend_prefixs) override {
-    return new RGWHandler_Log(auth_registry);
-  }
-};
index 868578924f51781057c02e5f8f2b5df1761e2f4a..9183829d976b0417fc2ca8f971557b7495a3f83f 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_ROLE_H
-#define CEPH_RGW_ROLE_H
+#pragma once
 
 #include <string>
 
@@ -10,7 +9,7 @@
 
 #include "common/ceph_json.h"
 #include "common/ceph_context.h"
-#include "rgw/rgw_rados.h"
+#include "rgw_rados.h"
 #include "rgw_metadata.h"
 
 class RGWRados;
@@ -208,5 +207,3 @@ private:
   Driver* driver;
 };
 } } // namespace rgw::sal
-
-#endif /* CEPH_RGW_ROLE_H */
index 00d55cf93ea32ef4cb633cd0025f63c418147676..4a506ba4c0a4bf7382caf8fded125f2f65491308 100644 (file)
@@ -2,6 +2,8 @@
 // vim: ts=8 sw=2 smarttab ft=cpp
 //
 
+#pragma once
+
 namespace rgw::s3select {
 RGWOp* create_s3select_op();
 }
index 90e64f98a25874f27c7bd40169cfac3a1f5ebcf9..e58a356f4715ce3e34c0959e4c06d4a23832b879 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_STRING_H
-#define CEPH_RGW_STRING_H
+#pragma once
 
 #include <errno.h>
 #include <stdlib.h>
@@ -234,5 +233,3 @@ static constexpr uint32_t MATCH_CASE_INSENSITIVE = 0x01;
 extern bool match_wildcards(std::string_view pattern,
                             std::string_view input,
                             uint32_t flags = 0);
-
-#endif
index f73be37658972c1adb53e5d13f08ccdbe112b30e..65dbb17477f151fb4c9a615d03b1954f94d5d190 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_STS_H
-#define CEPH_RGW_STS_H
+#pragma once
 
 #include "rgw_role.h"
 #include "rgw_auth.h"
@@ -251,4 +250,3 @@ public:
   AssumeRoleWithWebIdentityResponse assumeRoleWithWebIdentity(const DoutPrefixProvider *dpp, AssumeRoleWithWebIdentityRequest& req);
 };
 }
-#endif /* CEPH_RGW_STS_H */
index 1d5a54dde6003e29d117d11e3f9aad978706e451..4b71dbc705af79e5cf2317f653cc6acd84d431bb 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_SWIFT_AUTH_H
-#define CEPH_RGW_SWIFT_AUTH_H
+#pragma once
 
 #include "rgw_common.h"
 #include "rgw_user.h"
@@ -353,6 +352,3 @@ public:
     return new RGWHandler_SWIFT_Auth;
   }
 };
-
-
-#endif
index 88a4e6652288a0a45c383df2ce7c9118c3a1d8f3..15bb25ee832cf83b01061c183705e76c3b0f94a4 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_TAG_H
-#define RGW_TAG_H
+#pragma once
 
 #include <string>
 #include <include/types.h>
@@ -48,5 +47,3 @@ protected:
   tag_map_t& get_tags() {return tag_map;}
 };
 WRITE_CLASS_ENCODER(RGWObjTags)
-
-#endif /* RGW_TAG_H */
index e07e081fb48afb2ef735fef4a1ce5006fcdebe11..7cc892f1f669e3c5a2ac8630d6e4a39e06094c40 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef RGW_TAG_S3_H
-#define RGW_TAG_S3_H
+#pragma once
 
 #include <map>
 #include <string>
@@ -48,6 +47,3 @@ public:
     return tagset.rebuild(dest);
   }
 };
-
-
-#endif /* RGW_TAG_S3_H */
index f51a7fc66407b8a0d70e3633099aa24590ac18cb..b06943a3c1852c8cee1d8da2ac37607818047410 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_TAR_H
-#define CEPH_RGW_TAR_H
+#pragma once
 
 #include <algorithm>
 #include <array>
@@ -152,5 +151,3 @@ interpret_block(const StatusIndicator& status, ceph::bufferlist& bl) {
 
 } /* namespace tar */
 } /* namespace rgw */
-
-#endif /* CEPH_RGW_TAR_H */
index 9505f970c6037aa2f9eb4c7d9a83af591311c6fa..b2476596bec349efce1286bf22a33143d9b997fc 100644 (file)
@@ -13,8 +13,7 @@
  *
  */
 
-#ifndef RGW_TOKEN_H
-#define RGW_TOKEN_H
+#pragma once
 
 #include <stdint.h>
 #include <boost/algorithm/string.hpp>
@@ -169,5 +168,3 @@ namespace rgw {
   }
 
 } /* namespace rgw */
-
-#endif /* RGW_TOKEN_H */
index 1f62ced35179944840d54141fa3145d685fbb54d..704dba28cf8659faaa787159143ff5e530df3444 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_TORRENT_H
-#define CEPH_RGW_TORRENT_H
+#pragma once
 
 #include <string>
 #include <list>
@@ -138,4 +137,3 @@ private:
   void sha1(SHA1 *h, bufferlist &bl, off_t bl_len);
   int save_torrent_file(optional_yield y);
 };
-#endif /* CEPH_RGW_TORRENT_H */
index ec596ed75469f7ecb991cf3189667c47cd52de15..b12b57df0d84be93faca254a75aac706ad7d0647 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_USAGE_H
-#define CEPH_RGW_USAGE_H
+#pragma once
 
 #include <string>
 #include <map>
@@ -29,6 +28,3 @@ public:
 
   static int clear(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver);
 };
-
-
-#endif
index 089c4da665dc29cd89ea3219ccd388f86ddc3e97..a9aa5b82916e200aacfdfabb94aeb0a22adab5be 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_WEB_IDP_H
-#define CEPH_RGW_WEB_IDP_H
+#pragma once
 
 namespace rgw {
 namespace web_idp {
@@ -25,5 +24,3 @@ struct WebTokenClaims {
 
 }; /* namespace web_idp */
 }; /* namespace rgw */
-
-#endif /* CEPH_RGW_WEB_IDP_H */
index fa3c19578fad0359c0c27ac6eb13da2ccfdc298c..2ba22c87a8c70b2ec524ef8903044df8bd7fca9a 100644 (file)
@@ -14,8 +14,7 @@
  * 
  */
 
-#ifndef RGW_WEBSITE_H
-#define RGW_WEBSITE_H
+#pragma once
 
 #include <list>
 #include <string>
@@ -242,5 +241,3 @@ struct RGWBucketWebsiteConf
   }
 };
 WRITE_CLASS_ENCODER(RGWBucketWebsiteConf)
-
-#endif
index 6a34344fbeb990a3ed1e6de6a1d627e9e72b2254..ab3f267ade1bd21c70745b2596ee6b117682fd13 100644 (file)
@@ -1,8 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab ft=cpp
 
-#ifndef CEPH_RGW_XML_H
-#define CEPH_RGW_XML_H
+#pragma once
 
 #include <map>
 #include <stdexcept>
@@ -370,6 +369,3 @@ static void encode_xml(const char *name, const std::optional<T>& o, ceph::Format
 
   encode_xml(name, *o, f);
 }
-
-
-#endif
index 93e2063a7096937b1db3efe8ce42ad2441c24d34..8e9fe063c1b132fe6f519e4d7e6f8b1ed2ea8f77 100644 (file)
 #include "svc_meta_be.h"
 #include "svc_bucket_types.h"
 #include "svc_bucket.h"
+#include "svc_bucket_sync.h"
 
 class RGWSI_Zone;
 class RGWSI_SysObj;
 class RGWSI_SysObj_Cache;
 class RGWSI_Meta;
 class RGWSI_SyncModules;
-class RGWSI_Bucket_Sync;
 
 struct rgw_cache_entry_info;
 
index ff1fe41989fb23f684dcc896a726ac0e521dd301..177f720d6b18bc0643576b0f39bdacb3496d8653 100644 (file)
@@ -20,6 +20,7 @@
 
 #include "svc_meta_be.h"
 #include "svc_user.h"
+#include "rgw_bucket.h"
 
 class RGWSI_RADOS;
 class RGWSI_Zone;
@@ -31,8 +32,6 @@ class RGWSI_MetaBackend_Handler;
 
 struct rgw_cache_entry_info;
 
-class RGWUserBuckets;
-
 class RGWGetUserHeader_CB;
 class RGWGetUserStats_CB;